这是我的代码,我需要将for循环中的所有数据帧连接到一个数据帧中。我需要将数据帧写入csv文件。
import urllib
from bs4 import BeautifulSoup
import pandas as pd
# Scrape listing pages 2..635 and append every three-cell table row of each
# page to foo.csv, one page at a time.
try:
    from urllib.request import urlopen  # Python 3 location of urlopen
except ImportError:
    from urllib import urlopen  # Python 2 fallback (the API originally used)

COLUMNS = ["Company and Job Title", "Location", "Posted On"]

for i in range(2, 636):
    # The URL has to be a single unbroken string literal, and "lxml" is the
    # parser argument of BeautifulSoup. Passing it as the second argument of
    # urlopen would send it as request data instead.
    html = urlopen("https://www.wisdomjobs.com/core-java-jobs-" + str(i)).read()
    soup = BeautifulSoup(html, "lxml")

    rows = []
    for right_table in soup.find_all('table'):
        for row in right_table.find_all("tr"):
            cells = row.find_all('td')
            if len(cells) == 3:  # only table body rows, not the heading
                # First text node of each of the three cells.
                rows.append([cell.find(text=True) for cell in cells])

    df = pd.DataFrame(rows, columns=COLUMNS)

    # Append this page's rows; header=False keeps the column names from
    # being written again for every page.
    with open('foo.csv', 'a') as f:
        df.to_csv(f, header=False)
答案 0 :(得分:0)
我相信你可以在循环中使用 `read_html`,将每次的输出追加到一个 `list` 中,再用 `concat` 创建最终的 `DataFrame`,最后用 `to_csv` 写入文件:
# Read the table at index 1 from each listing page (pages 2..635), stack the
# per-page frames into a single DataFrame and write it to foo.csv.
base_url = 'https://www.wisdomjobs.com/core-java-jobs-'
dfs = [pd.read_html(base_url + str(page))[1] for page in range(2, 636)]

# ignore_index=True renumbers the rows 0..n-1 across all pages.
df = pd.concat(dfs, ignore_index=True)
print(df.head())
df.to_csv('foo.csv')
如果想在您原有方案的基础上修改,请在循环中把每个 `df` 追加到一个 `list` 中,最后再用 `concat` 合并:
# Scrape listing pages 2..635, build one DataFrame per page from the
# three-cell table rows, then concatenate all pages and write a single CSV.
try:
    from urllib.request import urlopen  # Python 3 location of urlopen
except ImportError:
    from urllib import urlopen  # Python 2 fallback (the API originally used)

COLUMNS = ["Company and Job Title", "Location", "Posted On"]

dfs = []
for i in range(2, 636):
    # The URL has to be a single unbroken string literal, and "lxml" is the
    # parser argument of BeautifulSoup. Passing it as the second argument of
    # urlopen would send it as request data instead.
    html = urlopen("https://www.wisdomjobs.com/core-java-jobs-" + str(i)).read()
    soup = BeautifulSoup(html, "lxml")

    rows = []
    for right_table in soup.find_all('table'):
        for row in right_table.find_all("tr"):
            cells = row.find_all('td')
            if len(cells) == 3:  # only table body rows, not the heading
                # First text node of each of the three cells.
                rows.append([cell.find(text=True) for cell in cells])

    # Collect the per-page frame; concatenation happens once, after the loop.
    dfs.append(pd.DataFrame(rows, columns=COLUMNS))

# ignore_index=True renumbers the rows 0..n-1 across all pages.
df = pd.concat(dfs, ignore_index=True)
print(df.head())
df.to_csv('foo.csv')