Analyzing a large amount of text with the Google Cloud Natural Language API - receiving ssl.SSLError: ('The read operation timed out',)

Asked: 2018-07-27 15:25:56

Tags: python ssl nlp google-cloud-platform

I'm running into a problem while extending the Google Cloud Natural Language sample script for my own use.

I have a large amount of text that I'm trying to run entity and sentiment analysis on, but I'm currently receiving the following error:

Traceback (most recent call last):
  File "main.py", line 329, in <module>
    analyze(args.inp, sout, eout, args.sample, args.log_file)
  File "main.py", line 288, in analyze
    process_movie_reviews(service, reader, sentiment_writer, entity_writer)
  File "main.py", line 193, in process_movie_reviews
    service, document)
  File "main.py", line 170, in get_sentiment_entities
    sentiments, entities = analyze_document(service, document)
  File "main.py", line 31, in analyze_document
    sentiments, entities = document.extract_sentiment_entities(service)
  File "main.py", line 80, in extract_sentiment_entities
    response = request.execute()
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/oauth2client/_helpers.py", line 133, in positional_wrapper
    return wrapped(*args, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/googleapiclient/http.py", line 836, in execute
    method=str(self.method), body=self.body, headers=self.headers)
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/googleapiclient/http.py", line 180, in _retry_request
    raise exception
ssl.SSLError: ('The read operation timed out',)

The sample script (main.py) is shown below and can also be found here (running on Python 2.7.15):

import argparse
import codecs
import glob
import json
import logging
import os

import googleapiclient.discovery
from googleapiclient.errors import HttpError
import requests

def analyze_document(service, document):
    """Analyze the document and get the distribution of sentiments and
    the movie name."""
    logging.info('Analyzing {}'.format(document.doc_id))

    sentiments, entities = document.extract_sentiment_entities(service)
    return sentiments, entities


def get_request_body(text, syntax=True, entities=True, sentiment=True):
    """Creates the body of the request to the language api in
    order to get an appropriate api response."""
    body = {
        'document': {
            'type': 'PLAIN_TEXT',
            'content': text,
        },
        'features': {
            'extract_syntax': syntax,
            'extract_entities': entities,
            'extract_document_sentiment': sentiment,
        },
        'encoding_type': 'UTF32'
    }

    return body


class Document(object):
    """Document class captures a single document of movie reviews."""

    def __init__(self, text, doc_id, doc_path):
        self.text = text
        self.doc_id = doc_id
        self.doc_path = doc_path
        self.sentiment_entity_pair = None
        self.label = None

    def extract_sentiment_entities(self, service):
        """Extract the sentences in a document."""

        if self.sentiment_entity_pair is not None:
            return self.sentiment_entity_pair

        docs = service.documents()
        request_body = get_request_body(
            self.text,
            syntax=False,
            entities=True,
            sentiment=True)
        request = docs.annotateText(body=request_body)

        ent_list = []

        response = request.execute()
        entities = response.get('entities', [])
        documentSentiment = response.get('documentSentiment', {})

        for entity in entities:
            ent_type = entity.get('type')
            wiki_url = entity.get('metadata', {}).get('wikipedia_url')

            if ent_type == 'PERSON' and wiki_url is not None:
                ent_list.append(wiki_url)

        self.sentiment_entity_pair = (documentSentiment, ent_list)

        return self.sentiment_entity_pair


def to_sentiment_json(doc_id, sent, label):
    """Convert the sentiment info to json.

    Args:
        doc_id: Document id
        sent: Overall Sentiment for the document
        label: Actual label +1, 0, -1 for the document

    Returns:
        String json representation of the input

    """
    json_doc = {}

    json_doc['doc_id'] = doc_id
    json_doc['sentiment'] = float('%.3f' % sent)
    json_doc['label'] = label

    return json.dumps(json_doc)


def get_wiki_title(wiki_url):
    """Get the wikipedia page title for a given wikipedia URL.

    Args:
        wiki_url: Wikipedia URL e.g., http://en.wikipedia.org/wiki/Sean_Connery

    Returns:
        Wikipedia canonical name e.g., Sean Connery

    """
    try:
        content = requests.get(wiki_url).text
        return content.split('title')[1].split('-')[0].split('>')[1].strip()
    except (KeyError, IndexError):
        return os.path.basename(wiki_url).replace('_', ' ')


def to_entity_json(entity, entity_sentiment, entity_frequency):
    """Convert entities and their associated sentiment to json.

    Args:
        entity: Wikipedia entity name
        entity_sentiment: Sentiment associated with the entity
        entity_frequency: Frequency of the entity in the corpus

    Returns:
       Json string representation of input

    """
    json_doc = {}

    avg_sentiment = float(entity_sentiment) / float(entity_frequency)

    json_doc['wiki_url'] = entity
    json_doc['name'] = get_wiki_title(entity)
    json_doc['sentiment'] = float('%.3f' % entity_sentiment)
    json_doc['avg_sentiment'] = float('%.3f' % avg_sentiment)

    return json.dumps(json_doc)


def get_sentiment_entities(service, document):
    """Compute the overall sentiment volume in the document.

    Args:
        service: Client to Google Natural Language API
        document: Movie review document (See Document object)

    Returns:
        Tuple of total sentiment and entities found in the document

    """

    sentiments, entities = analyze_document(service, document)
    score = sentiments.get('score')

    return (score, entities)


def get_sentiment_label(sentiment):
    """Return the sentiment label based on the sentiment quantity."""
    if sentiment < 0:
        return -1
    elif sentiment > 0:
        return 1
    else:
        return 0


def process_movie_reviews(service, reader, sentiment_writer, entity_writer):
    """Perform some sentiment math and come up with movie review."""
    collected_entities = {}

    for document in reader:
        try:
            sentiment_total, entities = get_sentiment_entities(
                service, document)
        except HttpError as e:
            logging.error('Error process_movie_reviews {}'.format(e.content))
            continue

        document.label = get_sentiment_label(sentiment_total)

        sentiment_writer.write(
            to_sentiment_json(
                document.doc_id,
                sentiment_total,
                document.label
            )
        )

        sentiment_writer.write('\n')

        for ent in entities:
            ent_sent, frequency = collected_entities.get(ent, (0, 0))
            ent_sent += sentiment_total
            frequency += 1

            collected_entities[ent] = (ent_sent, frequency)

    for entity, sentiment_frequency in collected_entities.items():
        entity_writer.write(to_entity_json(entity, sentiment_frequency[0],
                                           sentiment_frequency[1]))
        entity_writer.write('\n')

    sentiment_writer.flush()
    entity_writer.flush()


def document_generator(dir_path_pattern, count=None):
    """Generator for the input movie documents.

    Args:
        dir_path_pattern: Input dir pattern e.g., "foo/bar/*/*"
        count: Number of documents to read else everything if None

    Returns:
        Generator which contains Document (See above)

    """
    for running_count, item in enumerate(glob.iglob(dir_path_pattern)):
        if count and running_count >= count:
            raise StopIteration()

        doc_id = os.path.basename(item)

        with codecs.open(item, encoding='utf-8') as f:
            try:
                text = f.read()
            except UnicodeDecodeError:
                continue

            yield Document(text, doc_id, item)


def rank_entities(reader, sentiment=None, topn=None, reverse_bool=False):
    """Rank the entities (actors) based on their sentiment
    assigned from the movie."""

    items = []
    for item in reader:
        json_item = json.loads(item)
        sent = json_item.get('sentiment')
        entity_item = (sent, json_item)

        if sentiment:
            if sentiment == 'pos' and sent > 0:
                items.append(entity_item)
            elif sentiment == 'neg' and sent < 0:
                items.append(entity_item)
        else:
            items.append(entity_item)

    items.sort(reverse=reverse_bool)
    items = [json.dumps(item[1]) for item in items]

    print('\n'.join(items[:topn]))


def analyze(input_dir, sentiment_writer, entity_writer, sample, log_file):
    """Analyze the document for sentiment and entities"""

    # Create logger settings
    logging.basicConfig(filename=log_file, level=logging.DEBUG)

    # Create a Google Service object
    service = googleapiclient.discovery.build('language', 'v1')

    reader = document_generator(input_dir, sample)

    # Process the movie documents
    process_movie_reviews(service, reader, sentiment_writer, entity_writer)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    subparsers = parser.add_subparsers(dest='command')

    rank_parser = subparsers.add_parser('rank')

    rank_parser.add_argument(
        '--entity_input', help='location of entity input')
    rank_parser.add_argument(
        '--sentiment', help='filter sentiment as "neg" or "pos"')
    rank_parser.add_argument(
        '--reverse', help='reverse the order of the items', type=bool,
        default=False
        )
    rank_parser.add_argument(
        '--sample', help='number of top items to process', type=int,
        default=None
        )

    analyze_parser = subparsers.add_parser('analyze')

    analyze_parser.add_argument(
        '--inp', help='location of the input', required=True)
    analyze_parser.add_argument(
        '--sout', help='location of the sentiment output', required=True)
    analyze_parser.add_argument(
        '--eout', help='location of the entity output', required=True)
    analyze_parser.add_argument(
        '--sample', help='number of top items to process', type=int)
    analyze_parser.add_argument('--log_file', default='movie.log')

    args = parser.parse_args()

    if args.command == 'analyze':
        with open(args.sout, 'w') as sout, open(args.eout, 'w') as eout:
            analyze(args.inp, sout, eout, args.sample, args.log_file)
    elif args.command == 'rank':
        with open(args.entity_input, 'r') as entity_input:
            rank_entities(
                entity_input, args.sentiment, args.sample, args.reverse)

It seems that the length of the text is what triggers the ssl.SSLError, so I'm trying to understand how to stop the script from timing out.
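
One direction I'm looking at (though I haven't confirmed it's the right one) is building the service with an httplib2 transport that has a longer read timeout, since googleapiclient uses httplib2 under the hood and that seems to be the layer the error comes from. A rough sketch of what I mean, assuming oauth2client application-default credentials and an arbitrary 300-second timeout:

    import httplib2
    import googleapiclient.discovery
    from oauth2client.client import GoogleCredentials

    # Application-default credentials, wrapping an httplib2.Http whose
    # timeout (in seconds) applies to the underlying socket reads.
    credentials = GoogleCredentials.get_application_default()
    http = credentials.authorize(httplib2.Http(timeout=300))

    # Hand the pre-configured transport to discovery.build() instead of
    # letting it create its own default Http object.
    service = googleapiclient.discovery.build('language', 'v1', http=http)

I don't know whether raising the transport timeout like this is the intended fix for large documents, or whether the text should simply be split into smaller chunks before being sent to annotateText.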

I have tried testing the following line (taken from another Google Cloud sample script), but since that script involves multiple stages, I'm not sure how to fit it into main.py:

response = operation.result(timeout=100000)

I have also tried setting a default socket timeout in the script with socket.setdefaulttimeout(60 * 60), but that did not solve the problem (and I'm not sure it is the right tool for the job).
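
For completeness, this is roughly how that attempt looked:

    import socket

    # Attempted workaround: raise the default socket timeout to one hour
    # before the service object (and its connections) is created.
    socket.setdefaulttimeout(60 * 60)

I also noticed that HttpRequest.execute() accepts a num_retries parameter, so inside extract_sentiment_entities the call could be changed along these lines (the value of 3 is just a guess, and I have not confirmed that this particular ssl.SSLError counts as a retryable failure):

    # Ask execute() to retry transient failures a few times; whether an
    # ssl.SSLError read timeout is treated as transient may depend on the
    # googleapiclient version installed.
    response = request.execute(num_retries=3)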

If anyone can point me in the right direction or offer any hints (rather than writing the code for me), it would be greatly appreciated.

0 Answers:

No answers