我是编程新手。我正在写一个从pdf中提取数据的Python脚本。我在使用tuple时遇到了问题。我不能提供它的论据。我认为这是我的逻辑不正确,包括缩进,序列,或其他东西。
我希望得到一些关于为什么我得到这个错误的解释。
这是我的PDF示例(我必须屏蔽一些敏感信息)
这就是我想要达到的目标
我得到了这个错误:
Traceback (most recent call last):
File "/Users/jeff/PycharmProjects/extractFreightInvoice/main.py", line 79, in <module>
lines.append(Line('invDate, invNumber, poNumber, contactName, jobNumber, '
TypeError: <lambda>() missing 11 required positional arguments: 'invNumber', 'poNumber', 'contactName', 'jobNumber', 'jobName', 'invDescription', 'siteAddress', 'invItemsDesc', 'invItemsQty', 'invItemsUnitPrice', and 'invItemsAmount'
我的代码如下:
# This is a pdf extractor
import re
import pdfplumber
import pandas as pd
from collections import namedtuple
Line = namedtuple('Line', 'invDate, invNumber, poNumber, contactName, jobNumber, '
'jobName, invDescription, siteAddress, invItemsDesc, invItemsQty, invItemsUnitPrice, '
'invItemsAmount ')
invDate_re = re.compile(r'(Clever Core NZ Limited\s)(\d{1,2}/\d{1,2}/\d{4})(.+)')
invNumber_re = re.compile(r'(IN\d{6})')
poNumber_re = re.compile(r'\d{4}')
contactNameBen_re = re.compile(r'(Jordan\s.+)')
contactNameCraig_re = re.compile(r'(Lorna\s.+)')
jobNumber_re = re.compile(r'(J[\d]{6})')
jobName_re = re.compile(r'(Job Name)')
invDescription_re = re.compile(r'(Invoice Description)')
siteAddress_re = re.compile(r'(Site address.*)')
colHeading_re = re.compile(r'((Description)(.* Quantity.* Unit Price.*))')
invItems_re = re.compile(
r'(.+) (([0-9]*[.])?[0-9]+) (([0-9]*[.])?[0-9]+) (\d*\?\d+|\d{1,3}(,\d{3})*(\.\d+)?)')
# quoteLines_re = re.compile(r'(.+)(:\s*)(.+)')
# clevercorePriceLine_re = re.compile(r'(.* First .*\s?)(-\s?.*\$)(\s*)(.+)')
file = 'CombinedInvoicePdf.pdf'
lines = []
with pdfplumber.open(file) as myPdf:
for page in myPdf.pages:
text = page.extract_text()
lines = text.split('\n')
index = 0
for i in range(len(lines)):
line = lines[i]
invDateLine = invDate_re.search(line)
invNumberLine = invNumber_re.search(line)
poNumberLine = poNumber_re.search(line)
contactNameJordanLine = contactNameJordan_re.search(line)
contactNameLornaLine = contactNameLorna_re.search(line)
jobNumberLine = jobNumber_re.search(line)
jobNameLine = jobName_re.search(line)
invDescriptionLine = invDescription_re.search(line)
colHeadingLine = colHeading_re.search(line)
siteAddressLine = siteAddress_re.search(line)
invItemsLine = invItems_re.search(line)
if invDateLine:
invDate = invDateLine.group(2)
if invNumberLine:
invNumber = invNumberLine.group(1)
if poNumberLine and len(line) == 4:
poNumber = poNumberLine.group(0)
if contactNameBenLine:
contactName = 'Jordan Michael'
if contactNameCraigLine:
contactName = 'Lorna Tolentin'
if jobNumberLine:
jobNumber = lines[i]
if jobNameLine:
jobName = (lines[i + 1])
if invDescriptionLine:
invDescription = lines[i + 1]
if siteAddressLine:
if len(lines[i + 1]) > 0 and len(lines[i + 1]) == 0:
siteAddress = lines[i + 1]
elif len(lines[i + 1]) > 0 and len(lines[i + 1]) > 0:
siteAddress = lines[i + 1] + ' ' + lines[i + 2]
else:
siteAddress = 'check invoice'
if invItemsLine and invItemsLine[2] != '06':
invItemsDesc = invItemsLine.group(1)
invItemsQty = invItemsLine.group(2)
invItemsUnitPrice = invItemsLine.group(4)
invItemsAmount = invItemsLine.group(6)
lines.append(Line('invDate, invNumber, poNumber, contactName, jobNumber, '
'jobName, invDescription, siteAddress, invItemsDesc, invItemsQty, invItemsUnitPrice, '
'inItemsAmount'))
df = pd.DataFrame(lines)
print(df)
print(df.head())
df.to_csv('freightCharges.csv')
发布于 2021-07-18 06:08:23
Line
是一个带有参数和字段的tuple
子类
您需要使用单独的参数来填充它们,而不是使用单个字符串
lines.append(Line('invDate', 'invNumber', 'poNumber', 'contactName', 'jobNumber', 'jobName', 'invDescription',
'siteAddress', 'invItemsDesc', 'invItemsQty', 'invItemsUnitPrice', 'inItemsAmount'))
https://stackoverflow.com/questions/68426420
复制相似问题