Web scraper: invalid literal for int() with base 10

Time: 2019-07-07 00:29:25

Tags: python web

I need some help. I'm building a web scraper for MyAnimeList and I keep getting "invalid literal for int() with base 10".

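For context, this is exactly the error int() raises whenever it is handed a non-numeric string, so it can be reproduced in isolation:

>>> int('anime_id')
Traceback (most recent call last):
  ...
ValueError: invalid literal for int() with base 10: 'anime_id'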

import requests

from bs4 import BeautifulSoup
try:
    import queue
except ImportError:
    import Queue as queue
import threading
import time
import re
import csv

class MALCharacterCrawler(object):

    def __init__(self):

        self._anime_ids = queue.Queue()
        self._char_ids = queue.Queue()

        self.request_headers = {"User-Agent":"Mozilla/5.0"}


        self._load_anime_ids()

        self._processed = 0

        output_file = open('mal_character2.csv', 'w', newline='')
        self.file_writer = csv.writer(output_file)
        self.file_writer.writerow([
                'anime_id',
                'name',
                'main',
                'favorites',
                'gender_guess',
                'age',
                'birthday',
                'height',
                'weight',
                'zodiac',
                'blood',
                'desc'
                ])


        workers = []
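        # note: threading is effectively disabled here, since range(0) never iterates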

        for i in range(0):
            break
            worker = threading.Thread(target=self._get_animes, args=[i])
            worker.start()
            workers.append(worker)

        self._get_animes(0)

        for worker in workers:
            worker.join()

        output_file.close()

    def _load_anime_ids(self):

        with open('animes_dump.csv', 'r') as animes_file:
            anime_reader = csv.reader(animes_file, delimiter=',', quotechar='"')
            for row in anime_reader:
                if int(row[0]) < 56:
                    continue
                self._anime_ids.put(row[0])

    def _get_animes(self, worker_id):

        print('starting worker {}'.format(worker_id))

        while self._anime_ids.qsize() > 0:
            anime_id = self._anime_ids.get()
            print('worker {} retrieving anime {}'.format(worker_id, anime_id))
            self._get_characters(anime_id)
            self._process_characters(anime_id)


    def _get_characters(self, anime_id):

        url = self._get_page_url(anime_id)
        soup = self._get_page_content(url)

        char_container = soup.find('div', {'class' : 'detail-characters-list'})

        if not char_container:
            print('characters not found for anime: {}'.format(anime_id))
            return

        anchors = char_container.find_all('a', {'class' : 'fw-n'})

        # get links formatted as character/<charid>/<charname>
        for anchor in anchors:
            link = anchor['href']
            if '/character/' in link:
                link_tokens = link.split('/')
                char_id = link_tokens[-2]
                print(char_id)

                role = anchor.parent.parent.find_next('small').contents[0]
                self._char_ids.put((char_id, role, link))
            else:
                print('not a character link!')


    def _process_characters(self, anime_id):

        while not self._char_ids.empty():
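            # drain every character queued by _get_characters for this anime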
            char_info = self._char_ids.get()
            char_id, role, link = char_info

            soup = self._get_page_content(link)

            name = soup.find('h1', {'class' : 'h1'}).contents[0]
            column = soup.find('td', {'class' : 'borderClass'})
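            # the favorites count sits in the last fragment after the final <br/>, e.g. "Member Favorites: 1,234"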
            members_raw = str(column).split('<br/>')[-1]
            members = re.sub(',', '', members_raw.split(' ')[-1].split('\n')[0])


            breadcrumbs = soup.find('div', {'class' : 'breadcrumb'})

            td = breadcrumbs.parent
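            # strip nav, table, and heading tags so td.text reduces to the character bio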
            for tag in td.find_all('div'):
                tag.replace_with('')
            for tag in td.find_all('table'):
                tag.replace_with('')
            for tag in td.find_all('h2'):
                tag.replace_with('')

            segments = td.text.split('\n')

            data_tags = (
                    'Gender',
                    'Age',
                    'Birthday',
                    'Height',
                    'Weight',
                    'Zodiac',
                    'Blood type',
                    'Desc'
            )

            data = {}
            desc_blocks = []

            for seg in segments:
                if seg.strip() == '':
                    continue

                is_tag = False
                for tag in data_tags:
                    if seg.find(tag) == 0:
                        data[tag] = self._extract_char_data(seg, tag)
                        is_tag = True
                        break

                if is_tag:
                    continue

                desc_blocks.append(seg)
                # print('Line: {}'.format(seg))

            desc = ' '.join(desc_blocks)
            data[data_tags[-1]] = desc.encode('ascii', 'ignore').decode('ascii')  # keep str, not bytes, for csv
            data[data_tags[0]] = self._predict_gender(desc)

            row = [anime_id, name.encode('ascii', 'ignore').decode('ascii'), role, members]
            row.extend([data.get(tag) for tag in data_tags])

            self.file_writer.writerow(row)

            print(anime_id, char_id, name)
            self._processed += 1

            time.sleep(0.8)
            if self._processed % 100 == 0:
                print('PROCESSED {}!!!'.format(self._processed))

    def _extract_char_data(self, seg, tag):

        return seg[seg.index(tag) + len(tag) + 2:].strip()


    def _predict_gender(self, desc):

        male_keywords = len(re.findall(r'(\W|^|\.)(he|him)(\W|$)', desc))
        female_keywords = len(re.findall(r'(\W|^|\.)(she|her)(\W|$)', desc))

        if male_keywords >= female_keywords:
            return 'm'
        return 'f'


    def _get_page_url(self, anime_id):

        url = 'https://myanimelist.net/anime/{}'.format(anime_id)
        return url

    def _get_page_content(self, url):

        while True:
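            # treat any non-200 response as throttling and retry after a short pause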
            anime_page = requests.get(url, headers=self.request_headers)
            if anime_page.status_code == 200:
                break
            print('throttled')
            time.sleep(0.8)

        return BeautifulSoup(anime_page.text, 'html.parser')


    def get_name(self, soup):

        name_tag = soup.find('span', itemprop='name')

        if not name_tag:
            return ''

        name = name_tag.string.strip()
        return name

    def get_desc(self, soup):

        desc_tag = soup.find('span', itemprop='description')

        if not desc_tag:
            return ''

        desc = re.sub('\n', ' ', desc_tag.text.strip())
        return desc

    def get_score(self, soup):

        score_tag = soup.find('div', {'data-title' : 'score'})
        if not score_tag:
            return ''

        score = score_tag.string.strip()
        users = re.sub(',', '', score_tag['data-user'].strip().split(' ')[0])
        return score, users


    def get_side_data(self, soup, label):

        label_tag = soup.find('span', string=label)
        if not label_tag:
            print('missing {}'.format(label))
            return ''
        data = label_tag.next_sibling.strip()

        if data == '':
            entries = []
            anchor_tags = label_tag.parent.find_all('a')
            for anchor_tag in anchor_tags:
                entries.append(anchor_tag.text)
            data = ','.join(entries)

        return data


if __name__ == '__main__':
    crawler = MALCharacterCrawler()

I get this traceback:

  File "mal_character_scraper.py", line 261, in <module>
    crawler = MALCharacterCrawler()
  File "mal_character_scraper.py", line 22, in __init__
    self._load_anime_ids()
  File "mal_character_scraper.py", line 64, in _load_anime_ids
    if int(row[0]) < 56:
ValueError: invalid literal for int() with base 10: 'anime_id'
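The string in the error message, 'anime_id', strongly suggests the first row of animes_dump.csv is a header line, so int() is being fed a header cell rather than a numeric id. A minimal sketch of a tolerant loader, assuming the dump's first line is a header such as 'anime_id,...' (load_anime_ids here is a hypothetical standalone counterpart of _load_anime_ids):

import csv
try:
    import queue
except ImportError:
    import Queue as queue

def load_anime_ids(path='animes_dump.csv'):
    # sketch: same logic as _load_anime_ids, but skips the header row
    ids = queue.Queue()
    with open(path, 'r') as animes_file:
        anime_reader = csv.reader(animes_file, delimiter=',', quotechar='"')
        next(anime_reader, None)  # consume the header line before parsing ids
        for row in anime_reader:
            if not row or not row[0].isdigit():
                continue  # also skip blank lines and non-numeric first columns
            if int(row[0]) < 56:
                continue
            ids.put(row[0])
    return ids

With the header consumed up front, int(row[0]) only ever sees numeric strings.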

0 Answers:

No answers