pymysql未安装,正在寻找用于产生所需结果的备用sql

时间:2019-05-06 13:04:25

标签: sql pymysql

我想请教如何让这段代码改用其他SQL库运行。pymysql在我的环境中无法使用(未安装),因此想请大家建议如何让这段代码配合其他SQL方案(例如SQLite或SQLAlchemy)运行。我希望只需对代码稍作修改,就能让它在其他类型的SQL库下正常工作。谢谢,非常感谢您的帮助。

代码:

import re
import six
import sys
import csv
import logging
import requests
import pymysql.cursors
from io import StringIO
from datetime import datetime
from unicodedata import normalize
from collections import OrderedDict
from requests.auth import AuthBase
from email.utils import parsedate
from tempfile import NamedTemporaryFile
from six.moves.urllib.parse import parse_qs
from requests_oauthlib import OAuth1Session, OAuth1


#########################################################
######### TWITTER CONSUMER KEY AND SECRET ###############
#########################################################

# DOjV7YFYSTlMeH6IWsHM4TfAJ (API key)
# fJFsdqK4bGhq9LoHNeuUXpWJROp84SStjJ1XHJlmr8iKJ0MjrY (API secret key)


#########################################################
######### TWITTER ACCESS TOKEN AND SECRET ###############
#########################################################

# 488695513-OugEeB4xrDhfBrIYhr6N0rgRI9MmyNJSPamY2HxL (Access token)
# zkxsmGoYL37BrCqESJexgEE9qW1QOXfaYAVIFssToDC0l (Access token secret)

# The Kaggle and Twitter tables share the same column layout, so each pair of
# INSERT statements is rendered from one template that varies only in the
# table name. The ``%s`` markers are left untouched by ``str.format`` and are
# filled in later by the database driver.
_USER_INSERT_TEMPLATE = (
    "INSERT INTO `{table}` "
    "(`unique_id`, `name`, `gender`, `description`, `link_color`, "
    "`sidebar_color`, `fav_number`, `retweet_count`, `image`, `created`, "
    "`user_timezone`) "
    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
)
_TWEET_INSERT_TEMPLATE = (
    "INSERT INTO `{table}` "
    "(`user_id`, `tweet_text`, `tweet_count`, `tweet_created`, "
    "`tweet_location`) "
    "VALUES (%s, %s, %s, %s, %s)"
)

KAGGLE_USER_INSERT = _USER_INSERT_TEMPLATE.format(table="kaggle_users")
KAGGLE_TWEET_INSERT = _TWEET_INSERT_TEMPLATE.format(table="kaggle_tweets")


TWITTER_USER_INSERT = _USER_INSERT_TEMPLATE.format(table="twitter_users")
TWITTER_TWEET_INSERT = _TWEET_INSERT_TEMPLATE.format(table="twitter_tweets")


# Used only when the module does not define ``URL_REGEXP``; without a fallback
# every call to ``is_url`` fails with NameError.
# NOTE(review): this is a deliberately simple http(s) matcher, not a full URL
# grammar -- replace it if a stricter pattern is intended.
_FALLBACK_URL_REGEXP = re.compile(r'https?://[^\s/$.?#][^\s]*', re.IGNORECASE)


def is_url(text):
    """Return True when *text* contains at least one URL.

    A module-level ``URL_REGEXP`` (pattern string or compiled pattern) is used
    when one exists; otherwise ``_FALLBACK_URL_REGEXP`` is used.

    Args:
        text: The string to search.

    Returns:
        bool: True if the pattern matches anywhere in *text*.
    """
    pattern = globals().get('URL_REGEXP', _FALLBACK_URL_REGEXP)
    # ``search`` stops at the first hit; the original built the full list of
    # matches only to test whether it was empty.
    return re.search(pattern, text) is not None


def http_to_file(http):
    """Download *http* into a named temporary file and return the open file.

    The body is streamed in 1 MiB chunks. The returned file object is left
    positioned at the end of the written data; callers that want to read it
    back must ``seek(0)`` first. The file is removed when it is closed.

    Args:
        http: URL to download.

    Returns:
        The open ``NamedTemporaryFile`` containing the response body.

    Raises:
        Whatever ``requests`` or the file write raises; the temporary file is
        closed before the exception propagates.
    """
    data_file = NamedTemporaryFile()
    try:
        req = requests.get(http, stream=True)
        try:
            for chunk in req.iter_content(chunk_size=1024 * 1024):
                data_file.write(chunk)
        finally:
            # With stream=True the connection stays checked out until the
            # response is closed or fully consumed.
            req.close()
        # Push buffered bytes to disk so the file can be read via its name.
        data_file.flush()
    except BaseException:
        # Do not leak the temporary file when the download fails.
        data_file.close()
        raise
    return data_file


def enf_type(field, _type, val):
    """Convert *val* by calling *_type* on it.

    Args:
        field: Name used in the message printed when conversion fails.
        _type: Callable (typically a type) applied to *val*.
        val: The value to convert.

    Returns:
        The converted value, or ``None`` after printing a message when
        ``_type(val)`` raises ``ValueError``. Other exceptions propagate.
    """
    try:
        converted = _type(val)
    except ValueError:
        print('"{0}" must be type {1}'.format(field, _type.__name__))
        return None
    return converted


def parse_datetime(string):
    """Parse an RFC 2822 style date string into a naive ``datetime``.

    Only the first six fields returned by ``email.utils.parsedate`` (year
    through second) are used.

    Raises:
        TypeError: If ``parsedate`` cannot parse *string* and returns ``None``.
    """
    time_fields = parsedate(string)
    year, month, day, hour, minute, second = time_fields[:6]
    return datetime(year, month, day, hour, minute, second)

def contains_valid_data(row):
    """Return True if any cell in *row* is non-empty after stripping whitespace."""
    # Strip every cell up front (no short-circuit) so each item is processed,
    # then check whether anything non-empty remains.
    stripped_cells = [cell.strip() for cell in row]
    return any(stripped_cells)


class CustomDictReader(csv.DictReader):
    """``csv.DictReader`` that also skips rows containing only whitespace."""

    def __next__(self):
        """Return the next row that holds data, as an ``OrderedDict``.

        Rows that are empty, or whose cells are all blank according to
        ``contains_valid_data``, are skipped. Extra cells are stored as a list
        under ``restkey``; missing cells are filled with ``restval``.
        """
        if self.line_num == 0:
            # Touching the attribute is what matters here; the value itself
            # is not needed yet.
            self.fieldnames
        values = next(self.reader)
        self.line_num = self.reader.line_num

        # A dict built from a blank line would hold nothing useful, so keep
        # reading until a row with real content shows up.
        while values == [] or not contains_valid_data(values):
            values = next(self.reader)

        names = self.fieldnames
        record = OrderedDict(zip(names, values))
        name_count, value_count = len(names), len(values)
        if value_count > name_count:
            record[self.restkey] = values[name_count:]
        elif value_count < name_count:
            for missing_name in names[value_count:]:
                record[missing_name] = self.restval
        return record


class TwitterOAuthHandler:
    """Holds OAuth 1 credentials and builds request auth for the Twitter API.

    The consumer key/secret are used to create an ``OAuth1Session`` for the
    authorization flow; ``apply_auth`` produces the ``OAuth1`` object that
    signs individual requests.
    """

    OAUTH_HOST = 'api.twitter.com'
    OAUTH_ROOT = '/oauth/'

    # Logged by get_authorization_url when ``access_type`` is passed together
    # with ``signin_with_twitter=True``.
    # NOTE(review): the original referenced an undefined WARNING_MESSAGE; this
    # wording is an assumption about the intended message -- confirm.
    WARNING_MESSAGE = (
        "Warning! signin_with_twitter and access_type don't always play "
        "nice together."
    )

    def __init__(self, consumer_key, consumer_secret, access_token=None, access_token_secret=None):
        """Store the credentials and create the OAuth session.

        Args:
            consumer_key: Application API key; text is encoded to ASCII bytes.
            consumer_secret: Application API secret; text is encoded to ASCII
                bytes.
            access_token: Optional user access token.
            access_token_secret: Optional user access token secret.
        """
        # isinstance (rather than an exact type comparison) also covers
        # subclasses of the text type.
        if isinstance(consumer_key, six.text_type):
            consumer_key = consumer_key.encode('ascii')

        if isinstance(consumer_secret, six.text_type):
            consumer_secret = consumer_secret.encode('ascii')

        self.consumer_key = consumer_key
        self.consumer_secret = consumer_secret
        self.access_token = access_token
        self.access_token_secret = access_token_secret
        self.username = None
        self.request_token = {}
        self.oauth = OAuth1Session(
            consumer_key,
            client_secret=consumer_secret
        )

    def _get_oauth_url(self, endpoint):
        """Return the full https URL of an OAuth *endpoint* name."""
        return 'https://' + self.OAUTH_HOST + self.OAUTH_ROOT + endpoint

    def apply_auth(self):
        """Return an ``OAuth1`` auth object built from the stored credentials."""
        return OAuth1(
            self.consumer_key,
            client_secret=self.consumer_secret,
            resource_owner_key=self.access_token,
            resource_owner_secret=self.access_token_secret,
            decoding=None
        )

    def _get_request_token(self, access_type=None):
        """Fetch a request token.

        Returns:
            The result of ``fetch_request_token``, or ``None`` if any
            exception occurred (the exception is printed, not raised).
        """
        try:
            url = self._get_oauth_url('request_token')
            if access_type:
                url += '?x_auth_access_type=%s' % access_type
            return self.oauth.fetch_request_token(url)
        except Exception as e:
            # Best-effort by design: report the failure and return None.
            print(e)
            return None

    def set_access_token(self, key, secret):
        """Replace the stored access token and access token secret."""
        self.access_token = key
        self.access_token_secret = secret

    def get_authorization_url(
        self,
        signin_with_twitter=False,
        access_type=None
    ):
        """Fetch a request token and return the URL the user must visit.

        Args:
            signin_with_twitter: Use the ``authenticate`` endpoint instead of
                ``authorize``.
            access_type: Optional value forwarded as ``x_auth_access_type``
                when requesting the token.

        Returns:
            The authorization URL, or ``None`` if any exception occurred (the
            exception is printed, not raised).
        """
        try:
            if signin_with_twitter:
                url = self._get_oauth_url('authenticate')
                if access_type:
                    logging.warning(self.WARNING_MESSAGE)
            else:
                url = self._get_oauth_url('authorize')
            self.request_token = self._get_request_token(access_type=access_type)
            return self.oauth.authorization_url(url)
        except Exception as e:
            # Best-effort by design: report the failure and return None.
            print(e)
            return None

    def get_username(self):
        """Return the screen name of the authenticated user.

        The name is looked up once via ``API.verify_credentials`` and cached
        on the instance.

        Raises:
            APIError: If the credentials check returns a falsy result.
        """
        if self.username is None:
            api = API(method="GET", auth_handler=self)
            user = api.verify_credentials()
            if user:
                self.username = user["screen_name"]
            else:
                # The original raised the undefined name ``TweepError``; the
                # exception type this file actually defines is APIError.
                raise APIError('Unable to get username,'
                               ' invalid oauth token!')
        return self.username



class APIError(Exception):
    """Error raised for a failed API call.

    Attributes are set in ``__init__``: ``reason`` (the reason converted to
    text), ``response`` and ``api_code`` (both stored as given).
    """

    def __init__(self, reason, response=None, api_code=None):
        super(APIError, self).__init__(reason)
        self.reason = six.text_type(reason)
        self.response = response
        self.api_code = api_code

    def __str__(self):
        # The text form of the error is just the stored reason.
        return self.reason



class API(object):
    """Minimal client for a few Twitter REST endpoints.

    Every request uses the HTTP method given at construction time and is sent
    through a single ``requests.Session``.
    """

    def __init__(
        self,
        method,
        auth_handler=None,
        host='api.twitter.com',
        search_host='search.twitter.com',
        upload_host='upload.twitter.com',
        api_root='/1.1',
        search_root='',
        upload_root='/1.1',
        timeout=60,
    ):
        # Request behaviour.
        self.method = method
        self.auth = auth_handler
        self.timeout = timeout

        # Hosts and path roots. Only ``host`` and ``api_root`` are read by
        # the methods of this class; the rest are stored as given.
        self.host = host
        self.search_host = search_host
        self.upload_host = upload_host
        self.api_root = api_root
        self.search_root = search_root
        self.upload_root = upload_root

        self.session = requests.Session()

    def full_url(self, request_path):
        """Return the https URL for *request_path* under the API root."""
        return 'https://' + self.host + (self.api_root + request_path)

    def make_request(self, request_path):
        """Send the request for *request_path* and return the decoded JSON.

        The target URL is printed before the request is sent.

        Raises:
            APIError: Wrapping any exception raised while sending the request
                or decoding the response.
        """
        auth = self.auth.apply_auth() if self.auth else None

        target = self.full_url(request_path)
        print(target)

        try:
            response = self.session.request(
                self.method,
                target,
                timeout=self.timeout,
                auth=auth,
            )
            return response.json()
        except Exception as e:
            six.reraise(APIError, APIError('Failed to send request: %s' % e), sys.exc_info()[2])

    def verify_credentials(self):
        """Request ``/account/verify_credentials.json``."""
        return self.make_request("/account/verify_credentials.json")

    def home_timeline(self):
        """Request ``/statuses/home_timeline.json``."""
        return self.make_request("/statuses/home_timeline.json")

    def user_timeline(self):
        """Request ``/statuses/user_timeline.json``."""
        return self.make_request("/statuses/user_timeline.json")

# _unit_id
# _golden
# _unit_state
# _trusted_judgments
# _last_judgment_at
# gender
# gender:confidence
# profile_yn
# profile_yn:confidence
# created
# description
# fav_number
# gender_gold
# link_color
# name
# profile_yn_gold
# profileimage
# retweet_count
# sidebar_color
# text
# tweet_coord
# tweet_count
# tweet_created
# tweet_id
# tweet_location
# user_timezone

def user_exists(db_cursor, user_id, table_name):
    """Return True if *table_name* has a row whose ``unique_id`` is *user_id*.

    Args:
        db_cursor: DB-API cursor whose driver uses ``%s`` placeholders.
        user_id: Id to look up; sent as a bound parameter, never interpolated
            into the SQL text.
        table_name: Table to query. Must be a plain identifier (letters,
            digits, underscore; not starting with a digit).

    Returns:
        bool: Whether a matching row was found.

    Raises:
        ValueError: If *table_name* is not a plain identifier.
    """
    # Table names cannot be bound as parameters, so validate the identifier
    # before placing it in the statement.
    if not re.fullmatch(r'[A-Za-z_][A-Za-z0-9_]*', table_name):
        raise ValueError('Invalid table name: {0!r}'.format(table_name))

    user_exists_sql = "SELECT unique_id FROM `{0}` WHERE unique_id=%s".format(table_name)
    # The id comes from external data (CSV rows / API responses); binding it
    # prevents SQL injection and also handles non-numeric ids correctly.
    db_cursor.execute(user_exists_sql, (user_id,))
    return db_cursor.fetchone() is not None

def connect_db():
    """Open and return a pymysql connection to ``twitter_gender_analysis``.

    The connection targets ``localhost`` as ``root`` with an empty password,
    uses the ``utf8mb4`` charset, has autocommit enabled and yields rows as
    dictionaries (``DictCursor``).
    """
    settings = {
        'host': 'localhost',
        'user': 'root',
        'password': '',
        'db': 'twitter_gender_analysis',
        'charset': 'utf8mb4',
        'autocommit': True,
        'cursorclass': pymysql.cursors.DictCursor,
    }
    return pymysql.connect(**settings)


def process_data_set(csv_file_path="/Users/prashuchaudhary/Downloads/twitter_gender_classification/gender_data.csv"):
    """Load the gender-classification CSV into the Kaggle tables.

    For every CSV row a ``kaggle_users`` record is inserted unless one with
    the same ``_unit_id`` already exists, and a ``kaggle_tweets`` record is
    always inserted. Non-ASCII runs in the name, description and tweet text
    are replaced by a single space. A row that fails is reported with
    ``print`` and skipped; processing continues with the next row.

    Args:
        csv_file_path: Path of the UTF-8 encoded CSV file to import.

    Raises:
        OSError: If the CSV file cannot be opened. The database connection is
            closed before the error propagates.
    """
    non_ascii = re.compile(r'[^\x00-\x7F]+')
    csv_date_format = "%m/%d/%y %H:%M"

    connection = connect_db()
    # Everything after connecting sits inside the try so the connection is
    # closed even when opening the CSV file fails.
    try:
        with open(csv_file_path, mode='r', encoding='utf-8') as csv_file:
            csv_reader = CustomDictReader(csv_file, skipinitialspace=True, restval="")
            with connection.cursor() as cursor:
                for row in csv_reader:
                    try:
                        if not user_exists(cursor, row['_unit_id'], 'kaggle_users'):
                            cursor.execute(
                                KAGGLE_USER_INSERT,
                                (
                                    row['_unit_id'],
                                    non_ascii.sub(' ', row['name']),
                                    row['gender'],
                                    non_ascii.sub(' ', row['description']),
                                    row['link_color'],
                                    row['sidebar_color'],
                                    row['fav_number'],
                                    row['retweet_count'],
                                    row['profileimage'],
                                    str(datetime.strptime(row['created'], csv_date_format)),
                                    row['user_timezone'],
                                )
                            )

                        cursor.execute(
                            KAGGLE_TWEET_INSERT,
                            (
                                row['_unit_id'],
                                non_ascii.sub(' ', row['text']),
                                row['tweet_count'],
                                str(datetime.strptime(row['tweet_created'], csv_date_format)),
                                row['tweet_location'],
                            )
                        )
                    except Exception as exc:
                        # Deliberate best-effort: one bad row must not abort
                        # the whole import.
                        print("Error while populating row {row} :: {err}".format(row=row, err=exc))
    finally:
        connection.close()


def process_user_timeline_tweets():
    """Fetch the authenticated user's timeline and store it in the Twitter tables.

    OAuth credentials are read from the environment variables
    ``TWITTER_CONSUMER_KEY``, ``TWITTER_CONSUMER_SECRET``,
    ``TWITTER_ACCESS_TOKEN`` and ``TWITTER_ACCESS_TOKEN_SECRET`` instead of
    being embedded in the source. Any secret that was ever committed or
    published with this script should be revoked and regenerated.

    For every tweet a ``twitter_users`` record is inserted unless one with the
    same id already exists, and a ``twitter_tweets`` record is always
    inserted. A tweet that fails is reported with ``print`` and skipped.

    Raises:
        RuntimeError: If any of the credential environment variables is unset
            or empty.
        APIError: If the timeline request fails (raised by ``API``).
    """
    import os  # local import: keeps this function independent of the module import block

    credential_sources = (
        ('consumer_key', 'TWITTER_CONSUMER_KEY'),
        ('consumer_secret', 'TWITTER_CONSUMER_SECRET'),
        ('access_token', 'TWITTER_ACCESS_TOKEN'),
        ('access_token_secret', 'TWITTER_ACCESS_TOKEN_SECRET'),
    )
    credentials = {arg: os.environ.get(var) for arg, var in credential_sources}
    missing = [var for arg, var in credential_sources if not credentials[arg]]
    if missing:
        raise RuntimeError(
            'Missing Twitter credentials; set environment variable(s): '
            + ', '.join(missing)
        )

    non_ascii = re.compile(r'[^\x00-\x7F]+')

    def to_mysql_datetime(value):
        # Convert Twitter's "Mon May 06 13:04:25 +0000 2019" timestamp form
        # into "YYYY-MM-DD HH:MM:SS".
        parsed = datetime.strptime(value, "%a %b %d %H:%M:%S %z %Y")
        return parsed.strftime("%Y-%m-%d %H:%M:%S")

    api = API(method="GET", auth_handler=TwitterOAuthHandler(**credentials))
    tweets = api.user_timeline()
    if not isinstance(tweets, list):
        # A non-list payload is not a timeline (presumably an error object
        # from the API -- verify); iterating it would yield keys, not tweets.
        print("Unexpected response from Twitter API: {0}".format(tweets))
        return

    connection = connect_db()
    try:
        with connection.cursor() as cursor:
            for tweet in tweets:
                try:
                    user = tweet['user']
                    if not user_exists(cursor, user['id'], 'twitter_users'):
                        cursor.execute(
                            TWITTER_USER_INSERT,
                            (
                                user['id'],
                                non_ascii.sub(' ', user['name']),
                                '',
                                # A missing description (None) is stored as
                                # an empty string instead of failing the row.
                                non_ascii.sub(' ', user['description'] or ''),
                                user['profile_link_color'],
                                user['profile_sidebar_border_color'],
                                user['favourites_count'],
                                0,
                                user['profile_image_url'],
                                to_mysql_datetime(user['created_at']),
                                # NOTE(review): assumes time_zone may be
                                # absent from the payload -- confirm.
                                user.get('time_zone'),
                            )
                        )

                    # ``place`` is an object or null in the API payload and
                    # cannot be bound as a SQL parameter; store its display
                    # name instead.
                    place = tweet['place']
                    cursor.execute(
                        TWITTER_TWEET_INSERT,
                        (
                            user['id'],
                            non_ascii.sub(' ', tweet['text']),
                            tweet['retweet_count'],
                            to_mysql_datetime(tweet['created_at']),
                            place.get('full_name') if place else None,
                        )
                    )
                except Exception as exc:
                    # The original formatted the undefined name ``row`` here,
                    # which raised NameError and hid the real error.
                    print("Error while populating row {row} :: {err}".format(row=tweet, err=exc))
    finally:
        connection.close()

if __name__ == "__main__":
    # Run the CSV import first, then the Twitter timeline import.
    for pipeline_step in (process_data_set, process_user_timeline_tweets):
        pipeline_step()

0 个答案:

没有答案