我想从这个文本文件(text_file)中抓取一张表格,目标表格是 SUMMARY CONSOLIDATED FINANCIAL AND OTHER DATA。BeautifulSoup 解析得到的内容(content)看起来就像原始代码(The Origin Code)那样。我的代码附在下面,有人能告诉我哪里出了问题吗?
# Scrape the table that follows each "SUMMARY CONSOLIDATED FINANCIAL AND
# OTHER DATA" heading in the filing and print it as a DataFrame.
#
# NOTE(review): this snippet uses `requests`, `BeautifulSoup`, `re` and `pd`
# without importing them; they are assumed to be imported by the surrounding
# script.
url = r'https://www.sec.gov/Archives/edgar/data/1181232/000104746903038553/a2123752z424b4.htm'
filing_url = requests.get(url)
content = filing_url.text
soup = BeautifulSoup(content, 'lxml')

# find_all(text=...) yields the matching *text nodes* (the headings), not
# <table> tags, so each hit still has to be resolved to the table after it.
tables = soup.find_all(text=re.compile('SUMMARY CONSOLIDATED FINANCIAL AND OTHER DATA'))

n_columns = 0
n_rows = 0
column_names = []

for heading in tables:
    # Resolve the heading to the first <table> that follows it, and use that
    # same element for both measuring and filling the frame.
    table = heading.find_next('table')
    if table is None:
        # The heading text appears with no table after it; nothing to parse.
        continue

    # Reset per-table state so one match cannot leak into the next.
    column_names = []
    cell_rows = []

    for row in table.find_all('tr'):
        td_tags = row.find_all('td')
        if td_tags:
            cell_rows.append([td.get_text() for td in td_tags])

        # Take the column names from the first row that carries <th> cells.
        th_tags = row.find_all('th')
        if th_tags and not column_names:
            column_names = [th.get_text() for th in th_tags]

    n_rows = len(cell_rows)
    # Size the frame for the widest row: rows may hold different numbers of
    # <td> cells, and sizing from the first row alone lets a wider row index
    # past the frame.
    n_columns = max((len(cells) for cells in cell_rows), default=0)

    # Safeguard on column titles.
    if column_names and len(column_names) != n_columns:
        raise Exception("Column titles do not match the number of columns")

    columns = column_names if column_names else range(0, n_columns)
    df = pd.DataFrame(columns=columns,
                      index=range(0, n_rows))

    # Shorter rows leave their trailing cells as NaN.
    for row_marker, cells in enumerate(cell_rows):
        for column_marker, value in enumerate(cells):
            df.iat[row_marker, column_marker] = value

    print(df)
发布于 2020-05-17 15:39:24
在这种情况下,您可以使用pandas将其大大简化:
import pandas as pd
url = 'https://www.sec.gov/Archives/edgar/data/1181232/000104746903038553/a2123752z424b4.htm'

# Let pandas parse every table on the page in one call.
tables = pd.read_html(url)

# There are more than 100 tables on that page, so narrow them down to the
# ones whose column labels mention "Unaudited".
targets = [frame for frame in tables if 'Unaudited' in str(frame.columns)]

targets[0]  # only two meet that requirement, and the first is your target
输出是您的目标表。
https://stackoverflow.com/questions/61852584
复制相似问题