如何让我的 Python 代码以 "list/content" 成对的形式输出?

时间:2016-12-28 06:47:34

标签: python web web-crawler

我一直在为这个网站开发一个 Python 网络爬虫。我写了两个函数,它们各自单独运行都正常。

一个用于收集(车辆库存)列表,

另一个用于收集列表中每一项的内容数据。

我想让我的代码按如下形式成对输出:

"list#1/content#1"

"list#2/content#2"

"list#3/content#3"

为了实现这一目标,需要在我的代码中修改哪些内容?

感谢。

from bs4 import BeautifulSoup
import urllib.request

# Listing-page URL prefix; fetch_post_list appends the page number to it.
CAR_PAGE_TEMPLATE = "http://www.bobaedream.co.kr/cyber/CyberCar.php?gubun=I&page="
# Site root; fetch_post_content prepends it to the link it is given.
BASE_PAGE = 'http://www.bobaedream.co.kr'

def fetch_post_list():
    """Scrape the listing page(s) and return the link of the last priced row.

    Requests ``CAR_PAGE_TEMPLATE + str(page)`` for every page in
    ``range(20, 21)`` (currently just page 20), parses the HTML with
    BeautifulSoup and walks each ``<tr itemtype="http://schema.org/Article">``
    inside the ``<table class="cyber">`` element.  Rows whose price ``<em>``
    is missing or has empty text are skipped.

    Returns:
        The ``href`` of the last row that carried a price, or ``None`` when
        no row qualified.  Title, price and photo URL are parsed for each
        row but are not part of the return value.
    """
    # Pre-bind the result so that a run without any priced row returns None
    # instead of failing with UnboundLocalError at the return statement.
    lst_link = None

    for i in range(20, 21):
        URL = CAR_PAGE_TEMPLATE + str(i)
        # The context manager closes the HTTP response once the body is read.
        with urllib.request.urlopen(URL) as res:
            html = res.read()
        soup = BeautifulSoup(html, 'html.parser')
        table = soup.find('table', class_='cyber')
        #print ("Page#", i)

        # Original author's note: 50 lists per each page
        lists = table.find_all('tr', itemtype="http://schema.org/Article")

        count = 0
        for lst in lists:
            # Look the cells up once per row instead of once per field.
            cells = lst.find_all('td')

            price_tag = cells[3].find('em')
            if price_tag is None or not price_tag.text:
                continue  # rows without a price are skipped

            # Apart from lst_link, these fields are only consumed by the
            # debug print below.
            lst_price = price_tag.text
            anchor = cells[1].find('a')
            lst_title = anchor.text
            lst_link = anchor['href']
            photo = cells[0].find('img')
            lst_photo_url = photo['src'] if photo is not None else ''
            count += 1

            #print('#',count, lst_title, lst_photo_url, lst_link, lst_price)

    return lst_link

def fetch_post_content(lst_link):
    """Download one listing's detail page and return its basic-info ``<dd>`` tags.

    Args:
        lst_link: Link string that is appended to ``BASE_PAGE`` to build the
            request URL.

    Returns:
        The ``find_all('dd')`` result for the first ``<div>`` found inside
        ``<div class="rightarea">``.  The second ``<div>`` (seller block) is
        walked as well, but nothing from it is returned.
    """
    URL = BASE_PAGE + lst_link
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(URL) as res:
        html = res.read()
    soup = BeautifulSoup(html, 'html.parser')

    #Basic Information
    table = soup.find('div', class_='rightarea')
    # Collect the nested <div> elements once; both sections index into it.
    sections = table.find_all('div')

    # Number, Year, Mileage, Gas Type, Color, Accident (original author's labels)
    content_table1 = sections[0]
    dds = content_table1.find_all('dd')
    for dd in dds:
        # find() yields the first match, or None when the span is absent.
        car_span_t = dd.find('span', {'class': 't'})
        car_span_s = dd.find('span', {'class': 's'})
        #print(car_span_t.text, ':', car_span_s.text)

    # Seller Information
    content_table2 = sections[1]
    dds2 = content_table2.find_all('dd')
    for dd2 in dds2:
        # Read from the seller entry itself (dd2); the previous code read
        # the leftover `dd` from the loop above by mistake.
        seller_span_t = dd2.find('span', {'class': 't'})
        seller_span_s = dd2.find('span', {'class': 's'})
        #print(seller_span_t.text, ':', seller_span_s.text)

    return dds

0 个答案:

没有答案