我写了一些代码,第一部分我完全收获了一些标题工作(包含90个值),但第二部分价格a保留了一个不完整的列表(包含30个)。它似乎循环不适用于该细分市场
如何更改此代码以保持完整列表?
提前谢谢!
import re
import requests
from bs4 import BeautifulSoup
url = "http://www.watcheszon.com/casio-g-shock-c-19_20/?pg=store"
DATA_CONTAINER = list()
DATA = list()
def collectData():
global DATA_CONTAINER
global DATA
for i in range(1, 5):
newUrl = url + "&sort=20a&page=" + str(i)
r = requests.get(newUrl)
soup = BeautifulSoup(r.content, "lxml")
#print(soup)
g_data_odd = soup.find_all("td", {"class": "productListing-data"})
for item in g_data_odd:
t = item.find_all("div", {"class": "product_name"})
i = list()
for name in t:
piece = name.find('a').text
i.append(piece)
#print(piece)
# for pc in piece:
# i.append(pc.replace("\r", "").replace("\n", "").replace("\t", ""))
# print(pc)
DATA_CONTAINER.append(piece)
spans = soup.find_all('span', {"class": "productSalePrice"})
# create a list of lines corresponding to element texts
lines = [span.get_text() for span in spans]
# collect the dates from the list of lines using regex matching groups
found_dates = []
for line in lines:
m = re.search(r'[USD]+\d{2,3}.\d{2}', line)
if m:
found_dates.append(str(m.group(0)))
# print the dates we collected
# for date in found_dates:
# print(date)
# DATA_J = DATA_CONTAINER[:]
DATA = list(zip(DATA_CONTAINER, found_dates))
print(DATA)
def serializeToCSV(fileName):
with open(fileName, "w") as fd:
for item in DATA:
fd.write(u' '.join(item).encode('utf-8') + "\n")
collectData()
print(len(DATA))
serializeToCSV('csv.csv')
答案 0 :(得分:1)
尝试this code:
import re
import requests
from bs4 import BeautifulSoup
url = "http://www.watcheszon.com/casio-g-shock-c-19_20/?pg=store"
DATA_CONTAINER = list()
DATA = list()
def collectData():
global DATA_CONTAINER
global DATA
for i in range(1, 5):
newUrl = url + "&sort=20a&page=" + str(i)
r = requests.get(newUrl)
soup = BeautifulSoup(r.content, "lxml")
#print(soup)
g_data_odd = soup.find_all("td", {"class": "productListing-data"})
for item in g_data_odd:
t = item.find_all("div", {"class": "product_name"})
i = list()
for name in t:
piece = name.find('a').text
i.append(piece)
#print(piece)
# for pc in piece:
# i.append(pc.replace("\r", "").replace("\n", "").replace("\t", ""))
# print(pc)
DATA_CONTAINER.append(piece)
spans = soup.find_all('span', {"class": "productSalePrice"})
# create a list of lines corresponding to element texts
lines = [span.get_text() for span in spans]
# collect the dates from the list of lines using regex matching groups
found_dates = []
for line in lines:
m = re.search(r'[USD]+\d{2,3}.\d{2}', line)
if m:
found_dates.append(str(m.group(0)))
# print the dates we collected
# for date in found_dates:
# print(date)
# DATA_J = DATA_CONTAINER[:]
DATA = list(zip(DATA_CONTAINER, found_dates))
print(DATA)
def serializeToCSV(fileName):
with open(fileName, "a") as fd:
for item in DATA:
fd.write(u' '.join(str(item)) + "\n")
print(len(DATA))
serializeToCSV('csv.csv')
collectData()
"a"
选项编写文件serializeToCSV
(第17行)