多线程lxml scraper执行时没有任何错误或输出

时间:2014-04-29 22:04:04

标签: python web-scraping lxml python-requests


import requests
from lxml import html
from pprint import pprint
from urlparse import urljoin
from thready import threaded
import dataset
import os
from hashlib import sha1
import re
import math

# Inventory link
STARTING_URL = 'http://example.com/en/search/?h=3&k=&p=%d&sid=w'
BASE_URL = 'http://example.com'

# connect to our database
db = dataset.connect('mysql://root:@localhost/opencartdb')

# a directory for images
IMAGE_DIR = os.path.join(os.path.dirname(__file__), 'wimagepy')

def url_to_filename(image_url):
    """ Make a URL into a file name, using SHA1 hashes. """
    # use a sha1 hash to convert the url into a unique filename
    hash_file = sha1(image_url).hexdigest() + '.jpg'
    return os.path.join(IMAGE_DIR, hash_file)

def store_local(image_url, content):
    """ Save a local copy of the image file. """
    # If the image directory does not exist, make one.
    if not os.path.isdir(IMAGE_DIR):
        # Save to disk.
    local_path = url_to_filename(image_url)
    with open(local_path, 'wb') as f:

def scrape_raku_inventory():
    """ Scrape all the inventory pages from a list """
    response = requests.get('http://example.com/en/search/?h=3&k=&p=1&sid=w')
    results_per_page = 60
    div = page.xpath("//div[contains(@class, 'b-tabs-utility')]")[0].text
    last_pg = math.ceil(int(div.split()[-2]) / results_per_page)
    for i in xrange(last_pg):
        response = requests.get(STARTING_URL % i)
        parsed_body = html.fromstring(response.content)
        urls = []
        links = [urlparse.urljoin(response.url, url) for url in parsed_body.xpath("//div[contains(@class, 'b-thumb-128px')]//a")]
        for link in links:
            url = urljoin(BASE_URL, link)
            # iteratively populate this list 

    # download and parse inventory via multiple threads
    threaded(urls, scrape_inventory_page, num_threads=10)

def scrape_inventory_page(url):
    """Extract information from individual item page"""
    # log the url we're scraping
    print "scraping %s ..." % url
    # retrieve the inventory page with requests
    response = requests.get(url)
    # Parse the html of the inventory page
    parsed_body = html.fromstring(response.content)
    # Download images
    image_urls = re.sub(r'_ex=50x50\?', "", parsed_body.xpath("//img[contains(@src, '_ex50x50')]/@src"))
    for image_url in image_urls:
    data = {
    'scrape_url': url,
    'name': re.sub(r'\D\W\S', "", parsed_body.xpath("//h1[contains(@class, 'b-ttl-main')]").text)
    # Upsert data into database
    db['raku'].upsert(data, ['scrape_url'])

if  __name__ == '__name__':


1 个答案:

答案 0 :(得分:3)


if __name__ == '__main__'
