(更新)经过一些帮助后,我现在有了以下代码。我可以输出到csv文件,但我似乎无法让csv拥有适当数量的列:
# Question snippet (Python 2): walk every <tr> of the first <table> and collect
# the rank, the movie title and the IMDb id, then dump them with the csv module.
# NOTE(review): BeautifulSoup and html_doc are not defined in this snippet;
# presumably they are set up earlier - confirm.
soup = BeautifulSoup(html_doc)
import csv
# Tab-delimited writer. The file object created by open() is never closed
# explicitly anywhere in this snippet.
outfile=csv.writer(open('outputrows.csv','wb'),delimiter='\t')
#def get_movie_info(imdb):
tbl = soup.find('table')
rows = tbl.findAll('tr')
# One flat accumulator shared by all table rows (the name also shadows the
# built-in list type).
list=[]
for row in rows:
cols = row.find_all('td')
for col in cols:
# Dispatch on the first CSS class of the cell: 'title', 'number' or 'image'.
if col.has_attr('class') and col['class'][0] == 'title':
spans = col.find_all('span')
for span in spans:
# Only the 'wlb_wrapper' span is read; its data-tconst attribute is taken as ID.
if span.has_attr('class') and span['class'][0] == 'wlb_wrapper':
ID = span.get('data-tconst')
list.append(ID)
elif col.has_attr('class') and col['class'][0] == 'number':
rank = col.text
list.append(rank)
elif col.has_attr('class') and col['class'][0] == 'image':
hrefs = col.find_all('a')
for href in hrefs:
moviename = href.get('title')
list.append(moviename)
# writerows() treats each element of its argument as one CSV row. Every element
# appended above is a single string rather than a per-movie sequence, so the
# three values belonging to one movie are never written on the same line.
outfile.writerows(list)
print list
问题在于它以这种格式输出,这只是一列数据:
1.
The Shawshank Redemption (1994)
tt0111161
2.
The Dark Knight (2008)
tt0468569
3.
Inception (2010)
tt1375666
而我需要的是如下所示的3列数据:
1. The Shawshank Redemption (1994) tt0111161
2. The Dark Knight (2008) tt0468569
3. Inception (2010) tt1375666
示例HTML代码:
<tr class="odd detailed">
<td class="number">
48.
</td>
<td class="image">
<a href="/title/tt0082971/" title="Raiders of the Lost Ark (1981)">
<img alt="Raiders of the Lost Ark (1981)" height="74" src="http://ia.media-imdb.com/images/M/MV5BMjA0ODEzMTc1Nl5BMl5BanBnXkFtZTcwODM2MjAxNA@@._V1._SX54_CR0,0,54,74_.jpg" title="Raiders of the Lost Ark (1981)" width="54"/>
</a>
</td>
<td class="title">
<span class="wlb_wrapper" data-caller-name="search" data-size="small" data-tconst="tt0082971">
</span>
<a href="/title/tt0082971/">
Raiders of the Lost Ark
</a>
<span class="year_type">
(1981)
</span>
<br/>
答案 0 :(得分:1)
您是否尝试过让get_movie_info函数把打印出的各行收集到一个列表中并返回?
def get_movie_info():
    """Collect the scraped values of every table row of the module-level ``soup``.

    Each ``<tr>`` of the first ``<table>`` is turned into one record so that the
    result can be handed straight to ``csv.writer.writerows``. Appending every
    value to a single flat list (as before) loses the row boundaries and
    cannot produce one line per movie.

    Returns:
        A list of lists, one inner list per ``<tr>`` that produced at least
        one value. Within a record the values appear in the order their
        ``<td>`` cells occur in the row: the ``number`` cell text, the
        ``title`` attribute of each ``<a>`` in the ``image`` cell, and the
        ``data-tconst`` attribute of the ``wlb_wrapper`` span in the
        ``title`` cell.
    """
    returnedRows = []
    tbl = soup.find('table')
    for row in tbl.find_all('tr'):
        record = []
        for col in row.find_all('td'):
            if not col.has_attr('class'):
                continue
            kind = col['class'][0]
            if kind == 'image':
                for href in col.find_all('a'):
                    print(href.get('title'))
                    record.append(href.get('title'))
            elif kind == 'title':
                for span in col.find_all('span'):
                    if span.has_attr('class') and span['class'][0] == 'wlb_wrapper':
                        print(span.get('data-tconst'))
                        record.append(span.get('data-tconst'))
            elif kind == 'number':
                print(col.text)
                record.append(col.text)
        # Rows without any recognised cell (e.g. a header row) would only
        # produce an empty line in the output file, so they are skipped.
        if record:
            returnedRows.append(record)
    return returnedRows
然后按如下方式调用:
import csv
# Write the scraped rows to a tab-separated file. The with-block makes sure
# the file is flushed and closed even if get_movie_info() or the writer
# raises; previously the handle was left open. Binary mode is kept because
# this snippet targets the Python 2 csv module.
with open('outputrows.tsv', 'wb') as tsv_file:
    outfile = csv.writer(tsv_file, delimiter='\t')
    rows = get_movie_info()
    outfile.writerows(rows)
答案 1 :(得分:1)
请试试这个(不是最优的解决方案,但应该可行):
# Parse the HTML once at module level; get_movie_info() reads this object.
# NOTE(review): no parser is named, so the choice is left to the library -
# confirm that this is intended.
soup = BeautifulSoup(html_doc)
def get_movie_info():
    """Yield one ``(imageTitle, dataTConst, number)`` tuple per table row.

    Reads the first ``<table>`` of the module-level ``soup`` and inspects the
    ``<td>`` cells of every ``<tr>``, dispatching on the first CSS class of
    each cell:

    * ``image``  -> ``title`` attribute of the first ``<a>`` in the cell
    * ``title``  -> ``data-tconst`` attribute of the first ``<span>``, but
      only when that span has the class ``wlb_wrapper``
    * ``number`` -> text of the cell

    A field that cannot be found stays ``''``. One tuple is yielded for every
    ``<tr>``, including rows that contain none of these cells.
    """
    tbl = soup.find('table')
    for row in tbl.find_all('tr'):
        imageTitle = dataTConst = number = ''
        for col in row.find_all('td'):
            if not col.has_attr('class'):
                continue
            kind = col['class'][0]
            if kind == 'image':
                link = col.find('a')
                # find() returns None when the cell has no <a>; keep the
                # default instead of failing with AttributeError.
                if link is not None:
                    imageTitle = link.get('title')
            elif kind == 'title':
                span = col.find('span')
                # Same guard for a title cell without any <span>.
                if (span is not None and span.has_attr('class')
                        and span['class'][0] == 'wlb_wrapper'):
                    dataTConst = span.get('data-tconst')
            elif kind == 'number':
                number = col.text
        yield (imageTitle, dataTConst, number)
#################################################
import csv
# Stream the generated tuples into a tab-delimited file. The with-block
# closes (and therefore flushes) the file when writing finishes or fails;
# previously the handle was never closed. Binary mode is kept because this
# snippet targets the Python 2 csv module.
with open('outputrows.csv', 'wb') as csv_file:
    outfile = csv.writer(csv_file, delimiter='\t')
    for row in get_movie_info():
        outfile.writerow(row)
答案 2 :(得分:1)
这是一种简单的方法:
#!/usr/bin/env python
import pandas as pd
# Import the class, not the module: "import BeautifulSoup as BeautifulSoup"
# binds the module object, and calling a module raises TypeError.
from BeautifulSoup import BeautifulSoup
import requests

# Placeholder address; replace it with the real page URL.
url = 'some_url.html'
r = requests.get(url)

# Parallel column lists, later assembled into a DataFrame.
movie_id = []
title = []
year = []

bs = BeautifulSoup(r.text)
for movie in bs.findAll('td', 'title'):
    # Look the anchor up once; both the id and the title come from it.
    link = movie.find('a')
    # href has the form "/title/<id>/", so the id is the third "/"-separated part.
    movie_id.append(link.get('href').split('/')[2])
    title.append(link.contents[0])
    year.append(movie.find('span', 'year_type').contents[0])

movie_dic = {'movie_id': movie_id, 'title': title, 'year': year}
movie_data = pd.DataFrame(movie_dic, index=None)
file_name = "~/movies.txt"
movie_data.to_csv(file_name, sep=',', header=True, encoding='utf-8', mode='w')