为什么我的脚本在抓取数据时创建重复项

时间:2019-04-06 13:03:36

标签: python-3.x web-scraping beautifulsoup multiprocessing openpyxl

我写了一个Python脚本来从网站上获取书籍的价格。我为此使用了requestsBeautifulSoup

我正在从excel文件中获取ISBN,并将输出写入CSV文件。我的代码的问题在于它会生成大量重复项。如果我的excel文件具有1000个ISBN,则它将获得100,000个以上的ISBN。

import requests
    from bs4 import BeautifulSoup as bs
    import time
    from fake_useragent import UserAgent
    fav = 'Mozilla/5.0 (compatible; MSIE 10.0; Macintosh; Intel Mac OS X 10_7_3; Trident/6.0)'
    ua = UserAgent(fallback = fav)
    import openpyxl
    import csv
    import os
    import random
    import logging
    logging.basicConfig(filename='ProgramLog.log',level=logging.WARNING, format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M %p', filemode='w')
    import multiprocessing


    class Sonya:

        def write(self, isbn : str, price : str):
            outputFile = open('output.csv', 'a+', newline='')
            outputWriter = csv.writer(outputFile)
            outputWriter.writerow([f'{isbn}', f'{price}'])
            outputFile.close()

    ##    def getProxy(self, proxy):
    ##
    ##        htp = 'http://' + proxy
    ##        htps = 'https://' + proxy
    ##
    ##        proxies = {
    ##                  'http': htp,
    ##                  'https': htps,
    ##                }
    ##
    ##        return proxies



        def main(self, isbn : str):

            url = 'https://m.directtextbook.com/ourbuybacktable.php?ean=' + isbn



            header = {'User-Agent':str(ua.random)} #randomizes the user-agent

            #proxies = self.getProxy(proxy)

            response = requests.get(url, headers=header)
            res = response.text
            soup = bs(res, 'html.parser')

            try_again = True
            tries=0
            while try_again:
                try:
                    d,c=str(soup.find('div',{'class':'totalint'}).text).split('\xa0')
                    try_again = False
                except AttributeError:
                    if tries<=2:
                        print("Try again")
                        try_again = True
                        tries+=1
                    else:
                        try_again = False
                        return

            price = d+'.'+c
            price = price.replace('$','')
            print(price)

            self.write(isbn, price)



    #proxy_list = open('proxy.txt').read().split('\n')


    def mainn(li):

        try:

            #proxy = random.choice(proxy_list)


            for isbn in li:
                try:
                    #isbn = str(sheet.cell(row=i+1, column=1).value)

                    s=Sonya()
                    s.main(isbn)
                    print(li.index(isbn))
                except Exception as e:
                    print(e)
                    logging.warning(e)
        except Exception as e:
            print(e)
            logging.warning(e)


    if __name__=='__main__':

        isbns = []
        isbn_wb = openpyxl.load_workbook('isbn.xlsx')
        sheet = isbn_wb.active

        for i in range(sheet.max_row):
            if sheet.cell(row=i+1, column=1).value != '':
                isbns.append(str(sheet.cell(row=i+1, column=1).value))

    ##    print(isbns)
        print(len(isbns))

        d={}

        for i in range(20):
            d[f'is_{i+1}'] = isbns[i*len(isbns)//20:(i+1)*len(isbns)//20]
    ##        print('--------')
    ##        print(isbns[i*len(isbns)//20:(i+1)*len(isbns)//20])
    ##        print('--------')
    ##        print(d[f'is_{i+1}'])


        for i in range(20):

            d[f'th_{i+1}']=multiprocessing.Process(target=mainn, args=(d[f'is_{i+1}'],))



        for i in range(20):
            d[f'th_{i+1}'].start()

当我使用具有1000个ISBN的excel电子表格进行测试时,代码应写成等于或小于1000个ISBN,但它会创建许多重复项,最多可达100,000个。

0 个答案:

没有答案