我想提取整个页面,但是最后一行提出不上,使用PageImage得到如下图:
以下是源代码:
def pdf_to_df(file_path,start:int,end:int, del_cols:[]):
'''
params:
file_path:需要提取表格的pdf文件的绝对路径
start:出现表格的起始页码
end:表格结束页码
del_cols:需要移除的列
'''
with pdfplumber.open(file_path) as pdf1:
df_result = pd.DataFrame()
for i in range(start-1, end):
page = pdf1.pages[i]
# 参数设计
table_settings = {
"vertical_strategy": "lines",
"horizontal_strategy": "lines",
"explicit_vertical_lines": [],
"explicit_horizontal_lines": [],
"snap_tolerance": 0,
"snap_x_tolerance": 0,
"snap_y_tolerance": 0,
"join_tolerance": 0,
"join_x_tolerance": 0,
"join_y_tolerance": 0,
"edge_min_length": 0,
"min_words_vertical": 0,
"min_words_horizontal": 0,
"text_tolerance": 1,
"text_x_tolerance": 1,
"text_y_tolerance": 1,
"intersection_tolerance": 0,
"intersection_x_tolerance": 0,
"intersection_y_tolerance": 50,
}
# 自动读取表格信息,返回列表
tables = page.extract_table(table_settings)
df_result = pd.concat([df_result, pd.DataFrame(tables)])
im = page.to_image(resolution=150)
im.reset().debug_tablefinder(table_settings) # 显示表格
im.show()
df_result.drop_duplicates(inplace=True) # 去除重复行
df_result = df_result.drop(del_cols,axis=1) # 删除第1,3列空值
return df_result
相似问题