我用 Python 写了一个网络抓取程序。对于在亚马逊页面上没有显示尺寸的产品，我想在“产品尺寸”列中显示“无产品尺寸”。需要修改的代码位于获取尺寸的 try/except 块中。程序附在下面，如能帮忙，不胜感激。
这是该程序的输出！对于没有产品尺寸的产品，我希望把“产品尺寸”列中对应的值替换为“无产品尺寸”。
from selenium import webdriver
import csv
import io
# NOTE(review): the indentation of this listing was lost when it was pasted;
# the bodies of the `with`, `for`, `try` and `except` statements below appear
# flush-left and must be re-indented before the script can run.

# Proxy endpoints keyed by URL scheme.
proxies = {
'http': 'http://5.189.133.231:80',
'https': 'https://27.111.43.178:8080'
}
chrome_options = webdriver.ChromeOptions()
# Build a single --proxy-server argument of the form "scheme=url;scheme=url".
chrome_options.add_argument('--proxy-server="%s"' % ';'.join(['%s=%s' % (k, v) for k, v in proxies.items()]))
# NOTE(review): only the first backslash pair in the path is escaped; the
# remaining `\A`, `\D`, `\w`, `\c` are not recognised escape sequences, so
# Python keeps them literally, but a raw string would be less fragile.
driver = webdriver.Chrome(executable_path="C:\\Users\Andrei-PC\Downloads\webdriver\chromedriver.exe",
chrome_options=chrome_options)
# Column names for the output CSV.
header = ['Product title', 'ASIN', 'Product Weight', 'Product dimensions', 'URL']
# Create (or truncate) the CSV file and write the header row once.
with open('csv/products.csv', "w") as output:
writer = csv.writer(output)
writer.writerow(header)
# Product pages to scrape.
links = [
'https://www.amazon.com/Fermtech-regular-Auto-Siphon-Tubing/dp/B06XMR433X/ref=sr_1_2?s=kitchen&ie=UTF8&qid=1520274561&sr=1-2&keywords=-hhhg',
'https://www.amazon.com/Hydro-Flask-Wide-Mouth-Flip/dp/B01ACATW7E/ref=sr_1_3?s=kitchen&ie=UTF8&qid=1520348607&sr=1-3&keywords=-gfds',
'https://www.amazon.com/Cosmetics-Organizer-Compartments-Christmas-birthday/dp/B01BXDDU04/ref=sr_1_4569/138-3260504-2979110?s=bedbath&ie=UTF8&qid=1520585778&sr=1-4569&keywords=-sdfg'
]
# Visit every link and append one CSV row per product.
for i in range(len(links)):
driver.get(links[i])
# Collect the text of every element matched by the productTitle XPath.
product_title = driver.find_elements_by_xpath('//*[@id="productTitle"][1]')
prod_title = [x.text for x in product_title]
# Template one: values are read from fixed cell positions of the "pdTab" table.
# NOTE(review): every `except` below only prints a message, so on a miss the
# variable keeps whatever it held before (the previous product's value, or
# nothing at all on the first product). Nothing assigns a fallback string to
# `dimension`, which is why no "no product dimension" text reaches the CSV.
try:
asin = driver.find_element_by_xpath('(//div[@class ="pdTab"]/table/tbody/tr/td)[24]').text
except:
print('no ASIN template one')
try:
weight = driver.find_element_by_xpath('(//div[@class ="pdTab"]/table/tbody/tr/td)[2]').text
except:
print('no weight template one')
try:
dimension = driver.find_element_by_xpath('(//div[@class ="pdTab"]/table/tbody/tr/td)[4]').text
except:
print('no dimension template one')
# Template two: values are read from the "productDetailsTable" list items.
# A successful lookup here overwrites the value found by template one.
# NOTE(review): all three lookups use the same XPath (list item 3), so ASIN,
# weight and dimension receive identical text when this template matches.
try:
asin = driver.find_element_by_xpath('(//table[@id="productDetailsTable"]/tbody/tr/td/div/ul/li)[3]').text
except:
print('no ASIN template two')
try:
weight = driver.find_element_by_xpath('(//table[@id="productDetailsTable"]/tbody/tr/td/div/ul/li)[3]').text
except:
print('no weight template two')
try:
dimension = driver.find_element_by_xpath('(//table[@id="productDetailsTable"]/tbody/tr/td/div/ul/li)[3]').text
except:
print('no dimension template two')
# Template three: values are read from <li> items located by their bold label.
# A successful lookup here overwrites the value found by earlier templates.
try:
asin = driver.find_element_by_xpath('//li[contains(b, "ASIN:")]').text
except:
print('no ASIN template three')
try:
weight = driver.find_element_by_xpath('//li[contains(b, "Shipping Weight:")]').text
except:
print('no weight template three')
try:
dimension = driver.find_element_by_xpath('//li[contains(b, "Product Dimensions: ")]').text
except:
print('no dimension template three')
# Assemble the CSV row for this product.
# NOTE(review): if this fails, only 'no data' is printed and the write below
# still runs with whatever `data` held before.
try:
data = [prod_title[0], asin, weight, dimension, links[i]]
except:
print('no data')
# Re-open the CSV in append mode and add this product's row.
with io.open('csv/products.csv', "a", newline="", encoding="utf-8") as output:
writer = csv.writer(output)
writer.writerow(data)
答案 0 :(得分:1)
您需要重新组织代码：一旦在某个模板中找到了 ASIN，就不再检查其余模板，因为每个网页只会符合这三个模板中的某一个。
顺便说一下，您的第 4、第 5 和第 6 个 try 语句使用了完全相同的代码来提取 3 个不同的变量，这个问题需要修正。
回到您的问题：只需在 except 块中（即找不到尺寸时）把 dimension 设置为所需的字符串即可。
try:
    dimension = driver.find_element_by_xpath('//li[contains(b, "Product Dimensions: ")]').text
except Exception:
    # The lookup failed, so fall back to the placeholder instead of leaving
    # `dimension` unset. `Exception` is caught rather than using a bare
    # `except` so that Ctrl-C and SystemExit still stop the scraper.
    dimension = 'NO PRODUCT DIMENSION'
下面是重新组织后的代码（未经测试），不过我相信它可以正常工作。
# Placeholder written to the CSV when no template yields a dimension.
NO_DIMENSION = 'No Product Dimension'

# Known product-page layouts, tried in this order. Each entry is
# (label used in log messages, ASIN XPath, weight XPath, dimension XPath).
# NOTE(review): template two uses the same XPath for all three fields, so a
# match yields identical text for ASIN, weight and dimension; the correct
# per-field XPaths still have to be worked out against a live page.
TEMPLATES = [
    (
        'one',
        '(//div[@class ="pdTab"]/table/tbody/tr/td)[24]',
        '(//div[@class ="pdTab"]/table/tbody/tr/td)[2]',
        '(//div[@class ="pdTab"]/table/tbody/tr/td)[4]',
    ),
    (
        'two',
        '(//table[@id="productDetailsTable"]/tbody/tr/td/div/ul/li)[3]',
        '(//table[@id="productDetailsTable"]/tbody/tr/td/div/ul/li)[3]',
        '(//table[@id="productDetailsTable"]/tbody/tr/td/div/ul/li)[3]',
    ),
    (
        'three',
        '//li[contains(b, "ASIN:")]',
        '//li[contains(b, "Shipping Weight:")]',
        '//li[contains(b, "Product Dimensions: ")]',
    ),
]


def _find_text(driver, xpath):
    """Return the text of the first element matching ``xpath``, or None.

    Any lookup error is treated as "not available on this page" so that one
    missing field never aborts the whole scrape. ``Exception`` is caught
    rather than using a bare ``except`` so Ctrl-C and SystemExit still
    propagate.
    """
    try:
        return driver.find_element_by_xpath(xpath).text
    except Exception:
        return None


for link in links:
    driver.get(link)
    title_elements = driver.find_elements_by_xpath('//*[@id="productTitle"][1]')
    # find_elements returns an empty list when nothing matches.
    title = title_elements[0].text if title_elements else ''

    # Reset the fields for every product so that a miss can never reuse the
    # previous product's values or leave a name undefined.
    asin = ''
    weight = ''
    dimension = NO_DIMENSION

    for label, asin_xpath, weight_xpath, dimension_xpath in TEMPLATES:
        found_asin = _find_text(driver, asin_xpath)
        if found_asin is None:
            print('no ASIN template %s' % label)
        else:
            asin = found_asin

        found_weight = _find_text(driver, weight_xpath)
        if found_weight is None:
            print('no weight template %s' % label)
        else:
            weight = found_weight

        found_dimension = _find_text(driver, dimension_xpath)
        if found_dimension is None:
            # Keep any dimension an earlier template found; otherwise the
            # placeholder assigned above stays in effect.
            print('no dimension template %s' % label)
        else:
            dimension = found_dimension

        if found_asin is not None:
            # A page follows exactly one layout, so once the ASIN is found
            # the remaining templates do not need to be tried.
            break

    # The file is re-opened in append mode for each product, so every row is
    # written out as soon as its page has been scraped.
    with io.open('csv/products.csv', "a", newline="", encoding="utf-8") as output:
        csv.writer(output).writerow([title, asin, weight, dimension, link])