Python web scraper for Target

Time: 2018-10-10 14:20:57

Tags: python web-scraping

I'm a novice programmer trying to get a Python web scraper for Target.com working. I've pasted the code below.

The problem I'm trying to solve: when I run the script, no CSV file gets created at the end. The web browser opens and the data displays fine as text, so I'm confused as to why output.csv never appears.

Any help is much appreciated. Thanks!

import requests
import csv
import re
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import html


cats = [
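    # (category label, listing-page URL) pairs to scrape.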
    ('Natural Laundry Detergent','https://www.target.com/c/natural-cleaning-supplies-household-essentials/-/N-4yjz7Z55t1q?Nao=0'),
    ('Natural All-Purpose Cleaner','https://www.target.com/c/natural-cleaning-supplies-household-essentials/-/N-4yjz7Zzag5n?Nao=0'),
    ('Natural Dish Soaps','https://www.target.com/c/natural-cleaning-supplies-household-essentials/-/N-4yjz7Zx6dg5?Nao=0'),
    ('Natural Hair Shampoo','https://www.target.com/c/natural-hair-care/-/N-4smdrZ56ecv?Nao=0'),
    ('Natural Hair Conditioner','https://www.target.com/c/natural-hair-care/-/N-4smdrZv1cqo?Nao=0'),
    ('Natural Body Wash','https://www.target.com/c/natural-personal-care/-/N-4smdpZ5td3p?Nao=0'),
    ('Baby Shampoo and Body Wash','https://www.target.com/c/baby-toiletries-bath-potty/-/N-5xtjdZ54wt4?Nao=0'),
    ('Baby Bath Wash' ,'https://www.target.com/c/baby-toiletries-bath-potty/baby-bath-wash/-/N-5xtjdZ5ri3m'),
    ('Baby Bubble Bath' ,'https://www.target.com/c/baby-toiletries-bath-potty/-/N-5xtjdZ5t3hx?Nao=0'),
    ('Stain Removers', 'https://www.target.com/s?searchTerm=stain+remover&facetedValue=56cpg&Nao=0'),
    ('Baby Lotions', 'https://www.target.com/c/baby-toiletries-bath-potty/baby-lotions/-/N-5xtjdZ5vg2t'),
    ('Tampons','https://www.target.com/c/tampons-feminine-products-personal-care/-/N-4y634'),
    ('Maxi Pads','https://www.target.com/c/maxi-pads-feminine-products-personal-care/-/N-4y633'),
    ('Feminine Hygiene','https://www.target.com/c/feminine-hygiene-products-personal-care/-/N-4y631'),
]


class TargetClient(object):
    def __init__(self):
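        # Hard-coded chromedriver path; one Chrome window is reused for every category.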
        self.wd = webdriver.Chrome(executable_path=r'C:\Users\wquar\AppData\Local\Programs\Python\Python37\chromedriver.exe')
        self.base_url = 'https://www.target.com'

    def gather_product_links(self):
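        # Pull product-detail links out of the currently rendered listing page.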
        soup = BeautifulSoup(self.wd.page_source, 'html.parser')
        divs = soup.select('div[class*="ProductCardImageWrapper"]')
        links = [self.base_url + d.a['href'] for d in divs]
        return links

    def goto_next_page(self):
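        # Click the "next page" control; raises NoSuchElementException on the last
        # page, which gather_all_product_links uses as its stop condition.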
        ele = self.wd.find_element_by_xpath("//a[@aria-label='next page']")
        ele.click()
        time.sleep(1.5)


    def _format_product_name(self,input):
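        # Strip registered-trademark and trademark characters from a product title.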
        out = input.replace('®','').replace('\x99','')
        return out

    def _format_brand_name(self,input):
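        # Normalize brand names so they match how the brand appears in product titles.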
        out = input.replace('®','').replace('\x99','')
        out = html.unescape(out)

        if out == "Murphy's":
            out = 'Murphy'
        elif out == 'ECOS by Earth Friendly Products':
            out = 'Ecos'
        elif out == 'Eden Body Works':
            out = 'EDEN BodyWorks'
        elif out == 'BRÖÖ':
            out = 'BRöö'
        elif out == 'Love Beauty & Planet':
            out = 'Love Beauty And Planet'
        elif out == 'Hask':
            out = 'HASK'
        elif out == 'Palmers':
            out = "Palmer's"
        elif out == 'MacVoil':
            out = "Macvoil"
        elif out == 'Dear Clark,':
            out = "Dear Clark"
        elif out == 'Earth Science Naturals':
            out = "Earth Science"
        elif out == 'PAW Patrol':
            out = "Paw Patrol"
        elif out == 'up & up™':
            out = "Up&Up"
        elif out == 'Johnson & Johnson':
            out = "Johnson's"
        elif out == 'Earth Mama Angel Baby':
            out = "Earth Mama"
        elif out == 'Mielle Organics':
            out = "Mielle"
        elif out == 'EveryDay Coconut':
            out = "Alaffia"
        elif out == 'Olivina':
            out = "OLIVINA MEN"
        elif out == 'AVALON':
            out = "Avalon"
        elif out == 'Oxi Clean':
            out = "OxiClean"
        elif out == 'Village Naturals':
            out = "Nourishing Naturals"
        elif out == 'everyone':
            out = "everyone"
        elif out == 'Savannah Bee Company':
            out = 'Savannah Bee'
        elif out == 'Camille Rose Natural':
            out = 'Camille Rose'

        return out

    def _get_product_name(self, complete_product_name, brand_name):
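        # Cut the brand prefix and the " - size" suffix off the full product title.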
        if brand_name == 'Alaffia':
            return complete_product_name.split(' -')[0].strip()
        elif brand_name == 'SoCozy' and 'So Cfl ozy' in complete_product_name:
            return complete_product_name.split('So Cfl ozy')[1].split(' -')[0].strip()
        elif brand_name == 'Ecos' and 'ECOS' in complete_product_name:
            return complete_product_name.split('ECOS')[1].split(' -')[0].strip()
        elif brand_name == 'Clorox 2' and 'Clorox2' in complete_product_name:
            return complete_product_name.split('Clorox2')[1].split(' -')[0].strip()

        product_name = complete_product_name.split(brand_name)[1].split(' -')[0].strip()
        return product_name

    def scrape_product_page(self, url, category):
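        # Fetch one product page with requests and pull each field from the static
        # HTML; missing fields fall back to empty strings or title parsing.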
        r = requests.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        d = {}

        try:
            complete_product_name = soup('span',attrs={'data-test':'product-title'})[0].text
        except:
            print('ERROR')
            return None

        complete_product_name = self._format_product_name(complete_product_name)

        print(complete_product_name)

        brand_name = soup.select('div[class*="ProductDetailsTitle"]')[0].text.split('Shop all')[-1].strip()
        brand_name = self._format_brand_name(brand_name)

        d['Brand'] = brand_name

        #return (complete_product_name, brand_name)

        try:
            product_name = self._get_product_name(complete_product_name,brand_name)
        except:
            print('PRODUCT ERROR')
            return None

        d['Product'] = product_name

        try:
            d['Capacity'] = soup('b',text='Capacity (Volume):')[0].next.next.strip()
        except:
            d['Capacity'] = self._parse_capacity_from_title(complete_product_name)

        try:
            d['Scent'] = soup('b',text='Scent:')[0].next.next.strip()
        except:
            d['Scent'] = ''

        try:
            d['Price'] = soup('div',attrs={'data-test':'product-price'})[0].span.text
        except:
            d['Price'] = ''

        try:
            d['Product Form'] = soup('b',text='Product Form:')[0].next.next.strip()
        except:
            d['Product Form'] = ''

        try:
            star_rating =soup('div',attrs={'data-ref':'rating-mask'})[0].attrs['style'].split('width:')[1]
            d['Star Rating'] = round(float(star_rating.split('%')[0]) / 20, 2)
        except:
            d['Star Rating']=''

        try:
            d['Number of Ratings'] = soup('span',attrs={'data-test':'ratingCount'})[0].text.strip()
            if d['Number of Ratings'] == 'be the first!':
                d['Number of Ratings'] = 0
        except:
            d['Number of Ratings'] = ''

        try:
            d['Health Facts'] = soup('b',text='Health Facts:')[0].next.next.strip()
        except:
            d['Health Facts'] = ''

        try:
            d['Features'] = soup('b',text='Features:')[0].next.next.strip()
        except:
            d['Features'] = ''

        try:
            d['Wellness Standard'] = soup('b',text='Wellness Standard:')[0].next.next.strip()
        except:
            d['Wellness Standard'] = ''

        try:
            d['Sustainability Claims'] = soup('b',text='Sustainability Claims:')[0].next.next.strip()
        except:
            d['Sustainability Claims'] = ''


        try:
            d['Number of Uses'] = soup('b',text='Number of uses:')[0].next.next.strip()
        except:
            d['Number of Uses'] = self._parse_num_uses_from_title(complete_product_name)


        try:
            d['UPC Code'] = soup('b',text='UPC')[0].next.next.next.next.strip()
        except:
            d['UPC Code'] = ''

        d['URL'] = url
        d['Category'] = category
        d['Package Quantity'] = self._parse_quant_from_title(complete_product_name)

        return d

    def _parse_capacity_from_title(self,input):
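        # Fall back to pulling a capacity like "12.5 fl oz" out of the title.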
        m = re.search(r'\d+(\.\d)? ?(fl)? ?oz',input,re.IGNORECASE)

        if m:
            return m.group()
        return ''

    def _parse_quant_from_title(self,input):
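        # Parse a package quantity like "3pk" from the title; default is 1.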
        m = re.search(r'\d+ ?pk',input)

        if m:
            return m.group().split('pk')[0].strip()
        return 1

    def _parse_num_uses_from_title(self,input):
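        # Parse a use count like "64 ct" from the title; empty string if absent.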
        m = re.search(r'\d+ ?ct',input)
        if m:
            return m.group().split('ct')[0]
        return ''

    def scrape_cat(self, cat_name, url):
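        # Scrape a whole category: load the listing, collect every product link
        # across all pages, then scrape each product page.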
        h = []
        self.wd.get(url)
        links = self.gather_all_product_links()
        for l in links:
            print (l)
            res = self.scrape_product_page(l, cat_name)
            h.append(res)
        return h

    def gather_all_product_links(self):
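        # Page through the listing until "next page" stops existing, then
        # de-duplicate the links and drop category links.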
        links = self.gather_product_links()
        while True:
            try:
                self.goto_next_page()
                links.extend(self.gather_product_links())
            except:
                return [l for l in list(set(links)) if '-category-' not in l]

def main():
    h = []
    targ = TargetClient()
    for cat_name, url in cats:
        data = targ.scrape_cat(cat_name, url)
        h.extend(data)
    return h
    write_csv(data)

def write_csv(data):
    data = [x for x in data if x]
    f = open('output.csv','w')
    fields = ['Category','Brand', 'Product', 'Scent', 'Price','Package Quantity','Product Form', 'Capacity', 'Number of Uses', 'Star Rating', 'Number of Ratings', 'Health Facts', 'Features', 'Wellness Standard', 'Sustainability Claims', 'UPC Code', 'URL'] 
    dw = csv.DictWriter(f,fieldnames=fields)
    dw.writeheader()
    dw.writerows(data)


if __name__ == '__main__':
    main()

1 Answer:

Answer 0 (score: 1)

You have to place the write_csv call before the main() function returns. As written, return h executes first, so the write_csv(data) line below it is unreachable. Also pass it the accumulated list h, since data only holds the results of the last category.

Like this:

def main():
    h = []
    targ = TargetClient()
    for cat_name, url in cats:
        data = targ.scrape_cat(cat_name, url)
        h.extend(data)
    write_csv(h)
    return h
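One small tweak worth making while you're at it (a sketch, not required for the fix): your write_csv opens output.csv but never closes it, and the csv module docs recommend opening the file with newline='' so you don't get blank rows between records on Windows. I've also added encoding='utf-8' as an assumption, since the product titles contain non-ASCII characters. Something like:

def write_csv(data):
    # Drop the None entries returned when scrape_product_page hits an error.
    data = [x for x in data if x]
    fields = ['Category', 'Brand', 'Product', 'Scent', 'Price', 'Package Quantity',
              'Product Form', 'Capacity', 'Number of Uses', 'Star Rating',
              'Number of Ratings', 'Health Facts', 'Features', 'Wellness Standard',
              'Sustainability Claims', 'UPC Code', 'URL']
    # newline='' avoids blank rows on Windows; the with-block guarantees the
    # file is flushed and closed even if a write fails partway through.
    with open('output.csv', 'w', newline='', encoding='utf-8') as f:
        dw = csv.DictWriter(f, fieldnames=fields)
        dw.writeheader()
        dw.writerows(data)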

Hope it helps.