Question

这是我的代码，我需要将for循环中的所有数据帧连接到一个数据帧中。我需要将数据帧写入csv文件。

import urllib
from bs4 import BeautifulSoup
import pandas as pd

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2
from contextlib import closing

# Scrape listing pages 2..635. For every page, collect the three-cell table
# rows into a DataFrame and append that page's rows to foo.csv.
for i in range(2, 636):
    # The URL has to be one unbroken string literal, and "lxml" is the parser
    # argument of BeautifulSoup (not an argument of urlopen).
    url = "https://www.wisdomjobs.com/core-java-jobs-" + str(i)
    with closing(urlopen(url)) as response:  # close the connection every page
        soup = BeautifulSoup(response.read(), "lxml")

    all_tables = soup.find_all('table')
    A = []  # first cell of each row  -> "Company and Job Title"
    B = []  # second cell of each row -> "Location"
    C = []  # third cell of each row  -> "Posted On"
    for right_table in all_tables:
        for row in right_table.find_all("tr"):
            cells = row.find_all('td')
            # Keep only rows with exactly three <td> cells
            # (table body, not the heading).
            if len(cells) == 3:
                A.append(cells[0].find(text=True))
                B.append(cells[1].find(text=True))
                C.append(cells[2].find(text=True))

    df = pd.DataFrame(A, columns=["Company and Job Title"])
    df["Location"] = B
    df['Posted On'] = C

    # Append mode: every page adds its rows to the same file, without a
    # header line.
    # NOTE: pd.concat over a one-element list is a pass-through, so this
    # does not merge pages into a single DataFrame; it only writes the
    # current page.
    with open('foo.csv', 'a') as f:
        pd.concat([df], axis=1).to_csv(f, header=False)

Answer 1

我认为你可以在循环中使用 read_html，把每一页的结果追加到一个列表中，再用 concat 合并成最终的 DataFrame，最后用 to_csv 写入文件：

# For each listing page (2..635) take the table at index 1 returned by
# read_html, then stack all pages into one frame with a fresh index.
dfs = [
    pd.read_html('https://www.wisdomjobs.com/core-java-jobs-' + str(i))[1]
    for i in range(2, 636)
]

df = pd.concat(dfs, ignore_index=True)
print(df.head())

# Write the combined result once, after every page has been collected.
df.to_csv('foo.csv')

如果想在你原有方案的基础上修改，请在循环中把每一页的 df 追加到列表里，循环结束后再统一用 concat 合并：

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2
from contextlib import closing

# Scrape listing pages 2..635, build one DataFrame per page, combine them
# once after the loop, and write a single CSV file.
dfs = []
for i in range(2, 636):
    # The URL has to be one unbroken string literal, and "lxml" is the parser
    # argument of BeautifulSoup (not an argument of urlopen).
    url = "https://www.wisdomjobs.com/core-java-jobs-" + str(i)
    with closing(urlopen(url)) as response:  # close the connection every page
        soup = BeautifulSoup(response.read(), "lxml")

    rows = []
    for right_table in soup.find_all('table'):
        for row in right_table.find_all("tr"):
            cells = row.find_all('td')
            # Keep only rows with exactly three <td> cells
            # (table body, not the heading).
            if len(cells) == 3:
                rows.append([cell.find(text=True) for cell in cells])

    # One frame per page; the column order matches the cell order in each row.
    df = pd.DataFrame(
        rows, columns=["Company and Job Title", "Location", "Posted On"]
    )
    dfs.append(df)

# Concatenate once at the end; ignore_index gives a continuous 0..n-1 index
# instead of repeating each page's own index.
df = pd.concat(dfs, ignore_index=True)
print(df.head())

df.to_csv('foo.csv')

如何在 for 循环中把所有数据帧依次连接成一个数据帧？

1 个答案: