import requests
from lxml import html
from pprint import pprint
from urlparse import urljoin
from thready import threaded
import dataset
import os
from hashlib import sha1
import re
import math
# Inventory search URL template; the %d placeholder is filled with the
# result-page number when the listing pages are requested.
STARTING_URL = 'http://example.com/en/search/?h=3&k=&p=%d&sid=w'
# Site root, used as the base when resolving item links.
BASE_URL = 'http://example.com'
# Connect to our database. This runs at import time.
# NOTE(review): the connection string uses the MySQL root user with an empty
# password -- confirm this is only meant for local development.
db = dataset.connect('mysql://root:@localhost/opencartdb')
# Directory for downloaded images: a 'wimagepy' folder next to this file.
IMAGE_DIR = os.path.join(os.path.dirname(__file__), 'wimagepy')
def url_to_filename(image_url):
    """Map an image URL to a local file path named after its SHA1 hash.

    Args:
        image_url: The image URL, as text or as a byte string.

    Returns:
        The path ``<IMAGE_DIR>/<sha1 hex digest of the URL>.jpg``. The same
        URL always yields the same path, and the extension is always
        ``.jpg`` regardless of the URL's own extension.
    """
    # hashlib only digests bytes: non-ASCII unicode text fails under the
    # implicit ASCII coercion of Python 2, and any text string is rejected
    # on Python 3. Encode text explicitly; ASCII URLs hash exactly as before.
    if not isinstance(image_url, bytes):
        image_url = image_url.encode('utf-8')
    hash_file = sha1(image_url).hexdigest() + '.jpg'
    return os.path.join(IMAGE_DIR, hash_file)
def store_local(image_url, content):
    """Save a local copy of the image file.

    Args:
        image_url: URL the image came from; it determines the file name
            through url_to_filename().
        content: Raw image bytes to write.

    An existing file for the same URL is overwritten.
    """
    # If the image directory does not exist, make one.
    if not os.path.isdir(IMAGE_DIR):
        try:
            os.makedirs(IMAGE_DIR)
        except OSError:
            # Item pages are scraped from several threads, so another worker
            # may have created the directory after the check above. Only
            # re-raise when the directory is still missing.
            if not os.path.isdir(IMAGE_DIR):
                raise
    # Save to disk. Binary mode keeps the image bytes untouched.
    local_path = url_to_filename(image_url)
    with open(local_path, 'wb') as f:
        f.write(content)
def scrape_raku_inventory():
    """Scrape all the inventory pages from the search listing.

    Requests the first listing page to read the total result count, derives
    the number of listing pages from it, collects the item links found on
    every page, and hands the resulting URL list to scrape_inventory_page()
    on 10 worker threads.
    """
    results_per_page = 60

    # The first listing page carries the total number of results.
    response = requests.get(STARTING_URL % 1)
    page = html.fromstring(response.content)
    div = page.xpath("//div[contains(@class, 'b-tabs-utility')]")[0].text
    # The count is taken from the second-to-last whitespace-separated token.
    total_results = int(div.split()[-2])
    # Ceiling division in integer arithmetic: int / int truncates on
    # Python 2 before math.ceil could round up, and a float cannot be passed
    # to xrange().
    last_pg = (total_results + results_per_page - 1) // results_per_page

    # One list for the whole crawl, filled page by page.
    urls = []
    # Listing pages are numbered from 1 (the count request above uses p=1),
    # so iterate 1..last_pg inclusive rather than 0..last_pg-1.
    for i in xrange(1, last_pg + 1):
        response = requests.get(STARTING_URL % i)
        parsed_body = html.fromstring(response.content)
        # Select the href attribute values; selecting the <a> elements
        # themselves would hand lxml elements to urljoin().
        hrefs = parsed_body.xpath(
            "//div[contains(@class, 'b-thumb-128px')]//a/@href")
        # Resolve relative links against the page they were found on.
        urls.extend(urljoin(response.url, href) for href in hrefs)

    # download and parse inventory via multiple threads
    threaded(urls, scrape_inventory_page, num_threads=10)
def scrape_inventory_page(url):
    """Extract information from an individual item page.

    Downloads the page, stores a local copy of every thumbnail image it
    references through store_local(), and upserts a row into the 'raku'
    table keyed on 'scrape_url'.

    Args:
        url: URL of the item page to scrape.

    The stored row contains 'scrape_url' and 'name'; 'name' is an empty
    string when the page has no matching heading text.
    """
    # log the url we're scraping (the parenthesised single-argument form
    # prints the same text on Python 2 and Python 3)
    print("scraping %s ..." % url)
    # retrieve the inventory page with requests
    response = requests.get(url)
    # Parse the html of the inventory page
    parsed_body = html.fromstring(response.content)

    # Download images. xpath() returns a list of src strings, so the
    # substitution is applied to each one rather than to the list itself.
    # NOTE(review): the XPath filter looks for '_ex50x50' while the regex
    # strips '_ex=50x50?'; the two patterns do not describe the same text --
    # confirm the real thumbnail URL format and align them.
    thumb_srcs = parsed_body.xpath("//img[contains(@src, '_ex50x50')]/@src")
    for src in thumb_srcs:
        # Resolve relative or protocol-relative src values before fetching.
        image_url = urljoin(response.url, re.sub(r'_ex=50x50\?', "", src))
        store_local(image_url, requests.get(image_url).content)

    # xpath() returns a list of elements; use the first heading when present.
    headings = parsed_body.xpath("//h1[contains(@class, 'b-ttl-main')]")
    raw_name = headings[0].text if headings else None
    data = {
        'scrape_url': url,
        'name': re.sub(r'\D\W\S', "", raw_name or ""),
    }
    # Upsert data into database
    db['raku'].upsert(data, ['scrape_url'])
if __name__ == '__main__':
    # __name__ equals '__main__' only when this file is run as a script;
    # a comparison against the literal '__name__' can never be true.
    scrape_raku_inventory()
# Answer 0 (score: 3): the entry-point guard must compare __name__ with
# '__main__', not '__name__':
#     if __name__ == '__main__':