Question

（更新）在得到一些帮助后，我现在写出了下面的代码。它可以输出到 CSV 文件，但我始终无法让 CSV 拥有正确的列数：

# Scrape the first <table> in html_doc and dump the extracted values to a
# tab-delimited file.  This is the snippet the question is about; the comments
# below point out where the per-row structure is lost.
soup = BeautifulSoup(html_doc)
import csv
# The file object is created inline and is never explicitly closed.
outfile=csv.writer(open('outputrows.csv','wb'),delimiter='\t')
#def get_movie_info(imdb):
tbl = soup.find('table')
rows = tbl.findAll('tr')
# NOTE: this name shadows the builtin `list`.  It is a single flat list shared
# by every table row -- nothing below starts a new sub-list per <tr>.
list=[]
for row in rows:
    cols = row.find_all('td')
    for col in cols:
        # Each branch keys off the first CSS class of the <td>.
        if col.has_attr('class') and col['class'][0] == 'title':
            spans = col.find_all('span')
            for span in spans:
                if span.has_attr('class') and span['class'][0] == 'wlb_wrapper':
                    # The id is read from the span's data-tconst attribute.
                    ID = span.get('data-tconst')
                    list.append(ID)
        elif col.has_attr('class') and col['class'][0] == 'number':
            rank = col.text
            list.append(rank)            
        elif col.has_attr('class') and col['class'][0] == 'image':
            hrefs = col.find_all('a')
            for href in hrefs:
                # The movie name is read from the link's title attribute.
                moviename = href.get('title')
                list.append(moviename)

# writerows() treats each element of its argument as one row.  Every element
# here is a single string value rather than a (rank, name, id) sequence, so
# the three values of one movie never end up together in one output row.
outfile.writerows(list)

print list

问题在于它以这种格式输出，这只是一列数据：

1.
The Shawshank Redemption (1994)
tt0111161
2.
The Dark Knight (2008)
tt0468569
3.
Inception (2010)
tt1375666

而我需要的是如下所示的3列数据：

1.   The Shawshank Redemption (1994)   tt0111161
2.   The Dark Knight (2008)   tt0468569
3.   Inception (2010)   tt1375666

示例HTML代码：

 <tr class="odd detailed">
     <td class="number">
      48.
     </td>
     <td class="image">
      <a href="/title/tt0082971/" title="Raiders of the Lost Ark (1981)">
       <img alt="Raiders of the Lost Ark (1981)" height="74" src="http://ia.media-imdb.com/images/M/MV5BMjA0ODEzMTc1Nl5BMl5BanBnXkFtZTcwODM2MjAxNA@@._V1._SX54_CR0,0,54,74_.jpg" title="Raiders of the Lost Ark (1981)" width="54"/>
      </a>
     </td>
     <td class="title">
      <span class="wlb_wrapper" data-caller-name="search" data-size="small" data-tconst="tt0082971">
      </span>
      <a href="/title/tt0082971/">
       Raiders of the Lost Ark
      </a>
      <span class="year_type">
       (1981)
      </span>
      <br/>

Answer 1

您是否尝试过让 get_movie_info 函数返回由这些被打印的行组成的列表？

def get_movie_info():
    """Collect the scraped values of every table row, grouped per row.

    Reads the module-level ``soup`` object, walks every ``<tr>`` of its first
    ``<table>`` and, for each ``<td>``, dispatches on the cell's first CSS
    class:

    * ``image``  -- the ``title`` attribute of every ``<a>`` in the cell
    * ``title``  -- the ``data-tconst`` attribute of every ``<span>`` whose
      first class is ``wlb_wrapper``
    * ``number`` -- the cell's text

    Each value is also printed as it is found.

    Returns:
        A list with one entry per ``<tr>`` that produced at least one value.
        Each entry is itself a list holding that row's values in the order
        their cells appear in the document, so the result can be handed
        straight to ``csv.writer.writerows``.
    """
    returnedRows = []
    tbl = soup.find('table')
    for row in tbl.findAll('tr'):
        # One fresh record per <tr>: this grouping is what gives the CSV
        # one line (with several columns) per movie instead of one value
        # per line.
        record = []
        for col in row.find_all('td'):
            if not col.has_attr('class'):
                continue
            kind = col['class'][0]
            if kind == 'image':
                for href in col.find_all('a'):
                    movie_title = href.get('title')
                    print(movie_title)
                    record.append(movie_title)
            elif kind == 'title':
                for span in col.find_all('span'):
                    if span.has_attr('class') and span['class'][0] == 'wlb_wrapper':
                        tconst = span.get('data-tconst')
                        print(tconst)
                        record.append(tconst)
            elif kind == 'number':
                print(col.text)
                record.append(col.text)
        # Rows that matched nothing would only add blank lines to the
        # output file.
        if record:
            returnedRows.append(record)
    return returnedRows

然后像这样调用：

import csv

# Write whatever get_movie_info() returns to a tab-separated file.
# The `with` block guarantees the file is flushed and closed even if the
# scraping or the write raises; the original left the handle open.
# NOTE(review): 'wb' is the mode the Python 2 csv module expects.  Under
# Python 3 this would have to be mode 'w' with newline='' -- confirm the
# target interpreter.
with open('outputrows.tsv', 'wb') as tsv_file:
    outfile = csv.writer(tsv_file, delimiter='\t')
    rows = get_movie_info()
    outfile.writerows(rows)

Answer 2

请试试这个（不是最优的解决方案，但应该可行）：

# Parsed document that get_movie_info() below reads from.
soup = BeautifulSoup(html_doc)

def get_movie_info():
    """Yield one ``(imageTitle, dataTConst, number)`` tuple per data row.

    Walks every ``<tr>`` of the first ``<table>`` in the module-level
    ``soup`` and inspects each ``<td>`` by its first CSS class:

    * ``image``  -- ``imageTitle`` is the ``title`` attribute of the cell's
      first ``<a>``
    * ``title``  -- ``dataTConst`` is the ``data-tconst`` attribute of the
      cell's first ``<span>``, provided that span's first class is
      ``wlb_wrapper``
    * ``number`` -- ``number`` is the cell's text with surrounding
      whitespace removed

    A field whose cell or inner tag is missing stays ``''``.

    Yields:
        A 3-tuple of strings for every ``<tr>`` that contains at least one
        ``<td>``.
    """
    tbl = soup.find('table')
    for row in tbl.findAll('tr'):
        cols = row.find_all('td')
        if not cols:
            # Rows without any <td> (e.g. a header row made of <th> cells)
            # carry no movie data and would only produce an empty record.
            continue
        imageTitle, dataTConst, number = '', '', ''
        for col in cols:
            if not col.has_attr('class'):
                continue
            kind = col['class'][0]
            if kind == 'image':
                href = col.find('a')
                # find() returns None when the cell has no link.
                if href is not None:
                    imageTitle = href.get('title')
            elif kind == 'title':
                span = col.find('span')
                # find() returns None when the cell has no span.
                if (span is not None and span.has_attr('class')
                        and span['class'][0] == 'wlb_wrapper'):
                    dataTConst = span.get('data-tconst')
            elif kind == 'number':
                # Drop the indentation/newlines around the rank so they do
                # not leak into the CSV field.
                number = col.text.strip()

        yield (imageTitle, dataTConst, number)

#################################################
import csv

# Stream the tuples produced by get_movie_info() into a tab-delimited file.
# The `with` block closes the file deterministically; the original never
# closed the handle it opened inline.
# NOTE(review): 'wb' is the mode the Python 2 csv module expects.  Under
# Python 3 this would have to be mode 'w' with newline='' -- confirm the
# target interpreter.
with open('outputrows.csv', 'wb') as csv_file:
    outfile = csv.writer(csv_file, delimiter='\t')
    # writerows() consumes the generator one row at a time.
    outfile.writerows(get_movie_info())

Answer 3

这是一种简单的方法：

#!/usr/bin/env python
# Download a listing page, pull id / title / year out of every
# <td class="title"> cell and save the result as a CSV file via pandas.

import os

import pandas as pd
# Import the class, not the module: `import BeautifulSoup as BeautifulSoup`
# binds the module object, and calling a module raises TypeError.
from BeautifulSoup import BeautifulSoup
import requests

url = 'some_url.html'
r = requests.get(url)

# Parallel columns; index i of each list describes the same movie.
movie_id = []
title = []
year = []

bs = BeautifulSoup(r.text)
# A string as the second argument restricts the search to tags with that
# CSS class.
for movie in bs.findAll('td', 'title'):
    link = movie.find('a')
    # The href looks like "/title/<id>/"; element 2 of the split is the id.
    movie_id.append(link.get('href').split('/')[2])
    title.append(link.contents[0])
    year.append(movie.find('span', 'year_type').contents[0])

movie_dic = {'movie_id': movie_id, 'title': title, 'year': year}
movie_data = pd.DataFrame(movie_dic, index=None)

# Expand '~' explicitly so the output path does not depend on the writer
# doing it for us.
file_name = os.path.expanduser("~/movies.txt")
movie_data.to_csv(file_name, sep=',', header=True, encoding='utf-8', mode='w')

如何将解析后的HTML输出到文件中？

3 个答案: