我在使用lxml xpath时遇到问题,下面的示例代码用于通过xpath获取Ul内部的标签Li的所有数据-
"//*[@id ="s-results-list-atf"]/li/@data-asin".
奇怪的是,我只得到了6厘升,而我只有46厘升
请有人帮我看看我的错误所在
p / s:使用python 2.7
from lxml import html
import csv, os, json
import random
import requests
from exceptions import ValueError
from time import sleep
def getAsin():
headers_list = [
{
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2211.90 Safari/537.36'},
{
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2111.90 Safari/537.36'},
{
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.3211.90 Safari/537.36'},
{
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2221.90 Safari/537.36'},
{
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2212.90 Safari/537.36'},
{
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2213.90 Safari/537.36'},
{
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2214.90 Safari/537.36'},
{
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2215.90 Safari/537.36'},
{
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2216.90 Safari/537.36'},
{
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2217.90 Safari/537.36'},
{
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2218.90 Safari/537.36'},
{
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2219.90 Safari/537.36'},
{
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2231.90 Safari/537.36'},
{
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2241.90 Safari/537.36'},
]
headers = random.choice(headers_list)
url = 'https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=t-shirts&rh=i%3Aaps%2Ck%3At-shirts'
page = requests.get(url, headers=headers)
while True:
sleep(3)
try:
doc = html.fromstring(page.content)
XPATH_NAME = '//*[@id="s-results-list-atf"]/li/@data-asin'
RAW_NAME = doc.xpath(XPATH_NAME)
print 'aaaaaaaaa',RAW_NAME
if page.status_code != 200:
raise ValueError('captha')
return RAW_NAME
except Exception as e:
print e
if __name__ == "__main__":
getAsin()
`
答案 0 :(得分:0)
似乎并非所有列表项都出现在列表"#s-results-list-atf"
尝试使用
doc.xpath('//li[starts-with(@id, "result_")]/@data-asin')
获取完整列表(60个项目)