我一直在为这个网站开发一个 Python 网络爬虫。我写了两个函数,各自都运行良好:
一个用于收集车辆列表(listing),
另一个用于收集每个列表项的详细内容数据。
我希望我的代码交替输出 "list#1/content#1"、"list#2/content#2"、"list#3/content#3"……
为了实现这一目标,需要在我的代码中修改哪些内容?
感谢。
from bs4 import BeautifulSoup
import urllib.request
# Listing-page URL template; the page number is appended after "page=".
CAR_PAGE_TEMPLATE = "http://www.bobaedream.co.kr/cyber/CyberCar.php?gubun=I&page="
# Site root, prepended to the relative hrefs scraped from listing rows.
BASE_PAGE = 'http://www.bobaedream.co.kr'
def fetch_post_list():
    """Scrape the car listing page(s) and collect the detail-page links.

    Returns:
        list[str]: relative hrefs, one per priced listing, in page order.
        Each element can be passed to fetch_post_content() — this fixes the
        original defect where only the LAST link was returned, making it
        impossible to pair every list entry with its content.
    """
    links = []
    # Currently only page 20 is crawled; widen the range to cover more pages.
    for page in range(20, 21):
        url = CAR_PAGE_TEMPLATE + str(page)
        with urllib.request.urlopen(url) as res:  # close the socket promptly
            html = res.read()
        soup = BeautifulSoup(html, 'html.parser')
        table = soup.find('table', class_='cyber')
        # 50 listings per page, each row marked up as a schema.org Article.
        rows = table.find_all('tr', itemtype="http://schema.org/Article")
        for row in rows:
            tds = row.find_all('td')  # hoisted: original re-ran find_all('td') per field
            price_em = tds[3].find('em')
            if not (price_em and price_em.text):
                continue  # rows without a price are not real listings
            # Title/photo are available here if needed later:
            #   title = tds[1].find('a').text
            #   photo = tds[0].find('img')['src'] if tds[0].find('img') else ''
            links.append(tds[1].find('a')['href'])
    return links
def fetch_post_content(lst_link):
    """Fetch one listing's detail page and extract its basic-info fields.

    Args:
        lst_link: relative href as returned by fetch_post_list().

    Returns:
        The list of basic-information <dd> tags (number, year, mileage,
        gas type, color, accident), as in the original.
    """
    url = BASE_PAGE + lst_link
    with urllib.request.urlopen(url) as res:  # close the socket promptly
        html = res.read()
    soup = BeautifulSoup(html, 'html.parser')
    # Basic information lives in the right-hand panel of the detail page.
    table = soup.find('div', class_='rightarea')
    # Number, Year, Mileage, Gas Type, Color, Accident
    dds = table.find_all('div')[0].find_all('dd')
    for dd in dds:
        car_span_t = dd.find('span', {'class': 't'})
        car_span_s = dd.find('span', {'class': 's'})
        #print(car_span_t.text, ':', car_span_s.text)
    # Seller information
    dds2 = table.find_all('div')[1].find_all('dd')
    for dd2 in dds2:
        # BUG FIX: the original read from `dd` (leftover from the previous
        # loop) instead of `dd2`, so every iteration extracted the same
        # basic-info spans rather than the seller's.
        seller_span_t = dd2.find('span', {'class': 't'})
        seller_span_s = dd2.find('span', {'class': 's'})
        #print(seller_span_t.text, ':', seller_span_s.text)
    return dds