Question

我写了一些代码，第一部分我完全收获了一些标题工作（包含90个值），但第二部分价格a保留了一个不完整的列表（包含30个）。它似乎循环不适用于该细分市场如何更改此代码以保持完整列表？
提前谢谢！

import re
import requests
from bs4 import BeautifulSoup

url = "http://www.watcheszon.com/casio-g-shock-c-19_20/?pg=store"


DATA_CONTAINER = list()
DATA = list()

def collectData():

    global DATA_CONTAINER
    global DATA


    for i in range(1, 5):
        newUrl = url + "&sort=20a&page=" + str(i) 
        r = requests.get(newUrl)
        soup = BeautifulSoup(r.content, "lxml")
        #print(soup)
        g_data_odd = soup.find_all("td", {"class": "productListing-data"})
        for item in g_data_odd:         
            t = item.find_all("div", {"class": "product_name"})
            i = list()
            for name in t:
                piece = name.find('a').text
                i.append(piece)
                #print(piece)
                # for pc in piece:
                #   i.append(pc.replace("\r", "").replace("\n", "").replace("\t", ""))
                #   print(pc)
                DATA_CONTAINER.append(piece)

        spans = soup.find_all('span', {"class": "productSalePrice"})
        # create a list of lines corresponding to element texts
        lines = [span.get_text() for span in spans]
        # collect the dates from the list of lines using regex matching groups
        found_dates = []
        for line in lines:
            m = re.search(r'[USD]+\d{2,3}.\d{2}', line)
            if m:
                found_dates.append(str(m.group(0)))
                # print the dates we collected
        # for date in found_dates:
        #     print(date)

        # DATA_J = DATA_CONTAINER[:]
        DATA = list(zip(DATA_CONTAINER, found_dates))
        print(DATA)

def serializeToCSV(fileName):
    with open(fileName, "w") as fd:
        for item in DATA:
            fd.write(u' '.join(item).encode('utf-8') + "\n")

collectData()
print(len(DATA))
serializeToCSV('csv.csv')

Answer 1

尝试this code：

import re
import requests
from bs4 import BeautifulSoup

url = "http://www.watcheszon.com/casio-g-shock-c-19_20/?pg=store"


DATA_CONTAINER = list()
DATA = list()

def collectData():

    global DATA_CONTAINER
    global DATA


    for i in range(1, 5):
        newUrl = url + "&sort=20a&page=" + str(i) 
        r = requests.get(newUrl)
        soup = BeautifulSoup(r.content, "lxml")
        #print(soup)
        g_data_odd = soup.find_all("td", {"class": "productListing-data"})
        for item in g_data_odd:         
            t = item.find_all("div", {"class": "product_name"})
            i = list()
            for name in t:
                piece = name.find('a').text
                i.append(piece)
                #print(piece)
                # for pc in piece:
                #   i.append(pc.replace("\r", "").replace("\n", "").replace("\t", ""))
                #   print(pc)
                DATA_CONTAINER.append(piece)

        spans = soup.find_all('span', {"class": "productSalePrice"})
        # create a list of lines corresponding to element texts
        lines = [span.get_text() for span in spans]
        # collect the dates from the list of lines using regex matching groups
        found_dates = []
        for line in lines:
            m = re.search(r'[USD]+\d{2,3}.\d{2}', line)
            if m:
                found_dates.append(str(m.group(0)))
                # print the dates we collected
        # for date in found_dates:
        #     print(date)

        # DATA_J = DATA_CONTAINER[:]
        DATA = list(zip(DATA_CONTAINER, found_dates))
        print(DATA)

        def serializeToCSV(fileName):
            with open(fileName, "a") as fd:
                for item in DATA:
                   fd.write(u' '.join(str(item)) + "\n")

            print(len(DATA))
        serializeToCSV('csv.csv')

collectData()

第53行使用"a"选项编写文件
在循环中调用方法serializeToCSV（第17行）

bs4解析器保留不完整的列表

1 个答案: