bs4 parser keeps an incomplete list

Time: 2017-02-21 19:13:57

Tags: python-3.x parsing web-scraping bs4

I wrote some code. The first part, which collects the titles, works completely (it gathers 90 values), but the second part, which collects the prices, ends up with an incomplete list (only 30 entries). It looks as though the loop is not being applied to that part. How can I change this code so that it keeps the complete list?
Thanks in advance!

import re
import requests
from bs4 import BeautifulSoup

url = "http://www.watcheszon.com/casio-g-shock-c-19_20/?pg=store"


DATA_CONTAINER = list()
DATA = list()

def collectData():

    global DATA_CONTAINER
    global DATA


    for i in range(1, 5):
        newUrl = url + "&sort=20a&page=" + str(i) 
        r = requests.get(newUrl)
        soup = BeautifulSoup(r.content, "lxml")
        #print(soup)
        g_data_odd = soup.find_all("td", {"class": "productListing-data"})
        for item in g_data_odd:         
            t = item.find_all("div", {"class": "product_name"})
            i = list()
            for name in t:
                piece = name.find('a').text
                i.append(piece)
                #print(piece)
                # for pc in piece:
                #   i.append(pc.replace("\r", "").replace("\n", "").replace("\t", ""))
                #   print(pc)
                DATA_CONTAINER.append(piece)

        spans = soup.find_all('span', {"class": "productSalePrice"})
        # create a list of lines corresponding to element texts
        lines = [span.get_text() for span in spans]
        # collect the prices from the list of lines using a regex match
        found_dates = []
        for line in lines:
            m = re.search(r'[USD]+\d{2,3}.\d{2}', line)
            if m:
                found_dates.append(str(m.group(0)))
        # print the prices we collected
        # for price in found_dates:
        #     print(price)

        # DATA_J = DATA_CONTAINER[:]
        DATA = list(zip(DATA_CONTAINER, found_dates))
        print(DATA)

def serializeToCSV(fileName):
    with open(fileName, "w") as fd:
        for item in DATA:
            fd.write(u' '.join(item).encode('utf-8') + "\n")

collectData()
print(len(DATA))
serializeToCSV('csv.csv')

1 Answer:

Answer 0 (score: 1)

Try this code:

import re
import requests
from bs4 import BeautifulSoup

url = "http://www.watcheszon.com/casio-g-shock-c-19_20/?pg=store"


DATA_CONTAINER = list()
DATA = list()

def collectData():

    global DATA_CONTAINER
    global DATA


    for i in range(1, 5):
        newUrl = url + "&sort=20a&page=" + str(i) 
        r = requests.get(newUrl)
        soup = BeautifulSoup(r.content, "lxml")
        #print(soup)
        g_data_odd = soup.find_all("td", {"class": "productListing-data"})
        for item in g_data_odd:         
            t = item.find_all("div", {"class": "product_name"})
            i = list()
            for name in t:
                piece = name.find('a').text
                i.append(piece)
                #print(piece)
                # for pc in piece:
                #   i.append(pc.replace("\r", "").replace("\n", "").replace("\t", ""))
                #   print(pc)
                DATA_CONTAINER.append(piece)

        spans = soup.find_all('span', {"class": "productSalePrice"})
        # create a list of lines corresponding to element texts
        lines = [span.get_text() for span in spans]
        # collect the prices from the list of lines using a regex match
        found_dates = []
        for line in lines:
            m = re.search(r'[USD]+\d{2,3}.\d{2}', line)
            if m:
                found_dates.append(str(m.group(0)))
        # print the prices we collected
        # for price in found_dates:
        #     print(price)

        # DATA_J = DATA_CONTAINER[:]
        DATA = list(zip(DATA_CONTAINER, found_dates))
        print(DATA)

        def serializeToCSV(fileName):
            with open(fileName, "a") as fd:
                for item in DATA:
                    fd.write(' '.join(item) + "\n")  # item is a (name, price) tuple

            print(len(DATA))
        serializeToCSV('csv.csv')

collectData()
  1. The file is opened with the "a" (append) option in serializeToCSV, so each call adds to csv.csv instead of overwriting it.
  2. serializeToCSV is defined and called inside the loop, so the collected data is written out on every page iteration.
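
For reference, here is a minimal, self-contained sketch of those two points. It uses dummy name/price pairs in place of the scraped data; the rows parameter and the dummy DATA list are illustrative assumptions, not part of the original answer.

def serializeToCSV(fileName, rows):
    # "a" (append) mode: each call adds rows instead of overwriting the file
    with open(fileName, "a", encoding="utf-8") as fd:
        for name, price in rows:
            fd.write(name + " " + price + "\n")

for page in range(1, 5):
    # in the real script this is where the page is fetched and parsed
    DATA = [("G-SHOCK model {}-{}".format(page, n), "USD99.99") for n in range(3)]
    # writing inside the loop means no page's results are lost
    serializeToCSV('csv.csv', DATA)

Run as-is this appends 12 dummy rows to csv.csv; in the scraper, the per-page DATA would come from zip(DATA_CONTAINER, found_dates) as in the code above.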