在第29行获取语法错误,其中包括以下内容:links = parsed_body.xpath(' // div [contains(@class," b-thumb-128px")] / a / @ href')]。我已经为目标网站编写了有效的xpath,因此我不确定为什么或如何解决此特定错误。
import csv
import math
import os
import re
from pprint import pprint
from urlparse import urljoin

import requests
from lxml import html
from thready import threaded
# Directory for cached downloads, alongside this script.
# NOTE(review): not referenced anywhere in the visible code — confirm it is
# used elsewhere before removing.
CACHE_DIR = os.path.join(os.path.dirname(__file__), 'wanpy')
def get_links():
    """Collect every product link from the paginated search results and
    dispatch them to scrape_inventory via a thread pool.

    Side effects: HTTP GETs against the search pages; spawns worker
    threads. Returns None.
    """
    STARTING_URL = 'http://example.com/en/search/?h=3&k=&p=1&sid=wan'
    results_per_page = 60
    response = requests.get(STARTING_URL)
    dive = html.fromstring(response.text)
    # Utility bar text ends in "... <total> results"; the second-to-last
    # token is the total result count.
    div = dive.xpath("//div[contains(@class, 'b-tabs-utility')]")[0].text
    # float() forces true division: under Python 2, int / int floors first,
    # which made ceil() a no-op and dropped the final partial page.
    last_pg = int(math.ceil(int(div.split()[-2]) / float(results_per_page)))
    BASE_URL = 'http://example.com/en/search/?h=3&k=&p=%d&sid=wanboo'
    urls = []
    # NOTE(review): the starting URL uses p=1, so pages are probably
    # 1-indexed; iterating 0..last_pg-1 may fetch a duplicate/invalid
    # first page and miss the last one — confirm against the site.
    for i in xrange(last_pg):
        response = requests.get(BASE_URL % i)
        parsed_body = html.fromstring(response.text)
        # The original line ended with a stray ']' that had no matching
        # '[' — that was the reported SyntaxError; it is removed here.
        links = parsed_body.xpath('//div[contains(@class, "b-thumb-128px")]//a/@href')
        urls.extend(links)
    threaded(urls, scrape_inventory, num_threads=10)
def scrape_inventory(url):
    """Worker run by `threaded` for one product URL: fetch the page,
    extract the name and a marked-up price, and append one tab-separated
    row to data/wan.csv.

    url -- product detail-page URL (the original signature took no
           argument yet read `url`, so every call raised NameError).
    """
    # Append mode: "w" truncated the file on every call, so concurrent
    # workers destroyed each other's rows.
    with open("data/wan.csv", "a") as f:
        output = csv.writer(f, delimiter="\t")
        # Write the header only once, when the file is still empty,
        # instead of once per scraped row as the original did.
        if f.tell() == 0:
            fieldnames = ("model", "title", "description", "price", "image",
                          "additional_image", "scrape_url")
            output.writerow(fieldnames)
        print("scraping %s ..." % url)
        response = requests.get(url)
        parsed_body = html.fromstring(response.text)
        # xpath() returns a list of text nodes; join to a single string
        # before re.sub (the original passed the list itself).
        name = re.sub(r'\D\W\S', "", "".join(
            parsed_body.xpath("//h1[contains(@class, 'b-ttl-main')]/text()")))
        # Price formula kept from the original: double it, add 15, round
        # to 2 decimals. The original called round(x) with the ndigits
        # argument landing on re.sub, and ran re.sub on a float.
        raw_price = parsed_body.xpath(
            "//span[contains(@class, 'b-text-xxlarge b-text-prime')]/text()")
        price = round(float(raw_price[0]) * 2 + 15, 2)
        output.writerow([name, price])
# Script entry point: crawl the search pages and scrape every listing found.
if __name__ == '__main__':
    get_links()
答案 0（得分：0）

相关行末尾的 `]` 没有与任何 `[` 配对。删掉第 23 行结尾多余的 `]` 即可解决该语法错误。