我想请教如何让这段代码改用另一种 SQL 数据库或库。pymysql 在我这里无法正常工作,因此想请大家建议如何把这段代码改为使用其他方案(例如 SQLite 或 SQLAlchemy)。我希望只需对代码做少量修改,就能让它在其他类型的 SQL 数据库上运行。谢谢,非常感谢您的帮助。
代码:
import re
import six
import sys
import csv
import logging
import requests
import pymysql.cursors
from io import StringIO
from datetime import datetime
from unicodedata import normalize
from collections import OrderedDict
from requests.auth import AuthBase
from email.utils import parsedate
from tempfile import NamedTemporaryFile
from six.moves.urllib.parse import parse_qs
from requests_oauthlib import OAuth1Session, OAuth1
#########################################################
######### TWITTER CONSUMER KEY AND SECRET ###############
#########################################################
# DOjV7YFYSTlMeH6IWsHM4TfAJ (API key)
# fJFsdqK4bGhq9LoHNeuUXpWJROp84SStjJ1XHJlmr8iKJ0MjrY (API secret key)
#########################################################
######### TWITTER ACCESS TOKEN AND SECRET ###############
#########################################################
# 488695513-OugEeB4xrDhfBrIYhr6N0rgRI9MmyNJSPamY2HxL (Access token)
# zkxsmGoYL37BrCqESJexgEE9qW1QOXfaYAVIFssToDC0l (Access token secret)
# INSERT statements executed through the database cursor. Values are always
# supplied separately as a parameter tuple via the "%s" placeholders.
KAGGLE_USER_INSERT = (
    "INSERT INTO `kaggle_users` "
    "(`unique_id`, `name`, `gender`, `description`, `link_color`, "
    "`sidebar_color`, `fav_number`, `retweet_count`, `image`, `created`, "
    "`user_timezone`) "
    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
)
KAGGLE_TWEET_INSERT = (
    "INSERT INTO `kaggle_tweets` "
    "(`user_id`, `tweet_text`, `tweet_count`, `tweet_created`, "
    "`tweet_location`) "
    "VALUES (%s, %s, %s, %s, %s)"
)
TWITTER_USER_INSERT = (
    "INSERT INTO `twitter_users` "
    "(`unique_id`, `name`, `gender`, `description`, `link_color`, "
    "`sidebar_color`, `fav_number`, `retweet_count`, `image`, `created`, "
    "`user_timezone`) "
    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
)
TWITTER_TWEET_INSERT = (
    "INSERT INTO `twitter_tweets` "
    "(`user_id`, `tweet_text`, `tweet_count`, `tweet_created`, "
    "`tweet_location`) "
    "VALUES (%s, %s, %s, %s, %s)"
)
def is_url(text):
    """Return True when *text* contains at least one match of ``URL_REGEXP``.

    ``URL_REGEXP`` is not defined in the visible part of this file; it has
    to exist at module level before this function is called.
    """
    return re.search(URL_REGEXP, text) is not None
def http_to_file(http):
    """Download the resource at URL *http* into a named temporary file.

    The body is streamed in 1 MiB chunks so it is never held in memory as
    a whole.

    Args:
        http: URL to fetch with an HTTP GET.

    Returns:
        The open ``NamedTemporaryFile`` holding the downloaded bytes. The
        data has been flushed; the file position is left at the end of the
        written data, so call ``seek(0)`` before reading from the handle.
        The file is deleted when the returned object is closed.

    Raises:
        Whatever ``requests`` or the file write raises; the temporary file
        is closed (and thereby removed) before the error propagates.
    """
    data_file = NamedTemporaryFile()
    try:
        # The context manager releases the streamed connection even when
        # iteration stops early because of an error.
        with requests.get(http, stream=True) as response:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                data_file.write(chunk)
        # Push buffered bytes to disk so readers opening the file by name
        # see the complete content.
        data_file.flush()
    except Exception:
        data_file.close()
        raise
    return data_file
def enf_type(field, _type, val):
    """Convert *val* with the callable *_type*.

    Args:
        field: Name of the field, used only in the diagnostic message.
        _type: Type (or callable with a ``__name__``) applied to *val*.
        val: Value to convert.

    Returns:
        The converted value, or ``None`` after printing a message when the
        conversion raises ``ValueError``. Other exceptions propagate.
    """
    try:
        converted = _type(val)
    except ValueError:
        print('"{0}" must be type {1}'.format(field, _type.__name__))
        return None
    return converted
def parse_datetime(string):
    """Parse an RFC 2822 style date string into a naive ``datetime``.

    Only the first six fields returned by ``email.utils.parsedate`` (year
    through second) are used; any timezone offset in *string* is ignored.
    A string that ``parsedate`` cannot parse results in a ``TypeError``.
    """
    time_fields = parsedate(string)
    year, month, day, hour, minute, second = time_fields[:6]
    return datetime(year, month, day, hour, minute, second)
def contains_valid_data(row):
    """Return True if at least one cell of *row* is non-blank.

    A cell counts as blank when it is empty after ``strip()``. Every cell
    is inspected, so each item must provide a ``strip`` method.
    """
    non_blank_cells = [cell.strip() for cell in row if cell.strip()]
    return bool(non_blank_cells)
class CustomDictReader(csv.DictReader):
    """``csv.DictReader`` variant that skips blank rows and keeps order.

    Rows that are empty or consist only of whitespace cells are skipped,
    and every record is returned as an ``OrderedDict`` keyed by field name.
    """

    def __next__(self):
        """Return the next non-blank row as an ``OrderedDict``.

        Extra cells beyond the known field names are stored as a list under
        ``restkey``; missing trailing cells are filled with ``restval``.
        """
        if self.line_num == 0:
            # Accessed purely for its side effect.
            self.fieldnames
        record = next(self.reader)
        self.line_num = self.reader.line_num

        # Keep pulling rows until one actually carries data; a row of only
        # blank cells would otherwise produce a dict of empty values.
        while not (record != [] and contains_valid_data(record)):
            record = next(self.reader)

        names = self.fieldnames
        result = OrderedDict(zip(names, record))
        name_count = len(names)
        cell_count = len(record)
        if name_count < cell_count:
            result[self.restkey] = record[name_count:]
        elif name_count > cell_count:
            for name in names[cell_count:]:
                result[name] = self.restval
        return result
class TwitterOAuthHandler:
    """OAuth 1 helper that holds Twitter consumer and access credentials.

    The handler builds the ``OAuth1`` signer applied to API requests
    (``apply_auth``) and drives the request-token / authorization-URL part
    of the sign-in flow through an ``OAuth1Session``.
    """

    OAUTH_HOST = 'api.twitter.com'
    OAUTH_ROOT = '/oauth/'
    # Logged when ``signin_with_twitter`` and ``access_type`` are combined.
    SIGNIN_ACCESS_TYPE_WARNING = (
        'signin_with_twitter and access_type were both requested; '
        'these options do not always work well together.'
    )

    def __init__(self, consumer_key, consumer_secret, access_token=None, access_token_secret=None):
        """Store the credentials and create the OAuth session.

        Args:
            consumer_key: Application (consumer) key.
            consumer_secret: Application (consumer) secret.
            access_token: Optional user access token.
            access_token_secret: Optional user access token secret.

        Text-type consumer credentials are encoded to ASCII bytes before
        being stored and handed to ``OAuth1Session``.
        """
        if isinstance(consumer_key, six.text_type):
            consumer_key = consumer_key.encode('ascii')
        if isinstance(consumer_secret, six.text_type):
            consumer_secret = consumer_secret.encode('ascii')
        self.consumer_key = consumer_key
        self.consumer_secret = consumer_secret
        self.access_token = access_token
        self.access_token_secret = access_token_secret
        self.username = None
        self.request_token = {}
        self.oauth = OAuth1Session(
            consumer_key,
            client_secret=consumer_secret
        )

    def _get_oauth_url(self, endpoint):
        """Return the absolute HTTPS URL of the OAuth *endpoint*."""
        return 'https://' + self.OAUTH_HOST + self.OAUTH_ROOT + endpoint

    def apply_auth(self):
        """Build the ``OAuth1`` object used to sign outgoing requests."""
        return OAuth1(
            self.consumer_key,
            client_secret=self.consumer_secret,
            resource_owner_key=self.access_token,
            resource_owner_secret=self.access_token_secret,
            decoding=None
        )

    def _get_request_token(self, access_type=None):
        """Fetch a request token from the ``request_token`` endpoint.

        Args:
            access_type: Optional value appended to the URL as the
                ``x_auth_access_type`` query parameter.

        Returns:
            Whatever ``OAuth1Session.fetch_request_token`` returns, or
            ``None`` when the request fails (the error is printed).
        """
        try:
            url = self._get_oauth_url('request_token')
            if access_type:
                url += '?x_auth_access_type=%s' % access_type
            return self.oauth.fetch_request_token(url)
        except Exception as e:
            print(e)

    def set_access_token(self, key, secret):
        """Replace the stored access token and access token secret."""
        self.access_token = key
        self.access_token_secret = secret

    def get_authorization_url(
        self,
        signin_with_twitter=False,
        access_type=None
    ):
        """Obtain a request token and return the URL the user must visit.

        Args:
            signin_with_twitter: Use the ``authenticate`` endpoint instead
                of ``authorize``.
            access_type: Optional access type forwarded to the request
                token call.

        Returns:
            The authorization URL, or ``None`` when any step raises (the
            error is printed). ``self.request_token`` is updated with the
            fetched token, which is itself ``None`` if that fetch failed.
        """
        try:
            if signin_with_twitter:
                url = self._get_oauth_url('authenticate')
                if access_type:
                    # Previously this referenced an undefined module-level
                    # name, so the whole call failed and returned None.
                    logging.warning(self.SIGNIN_ACCESS_TYPE_WARNING)
            else:
                url = self._get_oauth_url('authorize')
            self.request_token = self._get_request_token(access_type=access_type)
            return self.oauth.authorization_url(url)
        except Exception as e:
            print(e)

    def get_username(self):
        """Return the screen name of the authenticated user.

        The name is looked up once through ``API.verify_credentials`` and
        cached on the handler.

        Raises:
            APIError: If the credentials check returns an empty result.
        """
        if self.username is None:
            api = API(method="GET", auth_handler=self)
            user = api.verify_credentials()
            if user:
                self.username = user["screen_name"]
            else:
                # Use this module's own exception type; the name raised
                # here before was not defined anywhere in the file.
                raise APIError('Unable to get username,'
                               ' invalid oauth token!')
        return self.username
class APIError(Exception):
    """Error raised for failed API operations.

    Carries the textual ``reason`` plus the optional ``response`` object
    and ``api_code`` supplied by the raiser.
    """

    def __init__(self, reason, response=None, api_code=None):
        super(APIError, self).__init__(reason)
        self.api_code = api_code
        self.response = response
        # Stored as text so ``__str__`` can return it directly.
        self.reason = six.text_type(reason)

    def __str__(self):
        return self.reason
class API(object):
    """Small Twitter REST client bound to one HTTP method.

    Each endpoint helper issues a request for a fixed path using the HTTP
    method, host and API root given at construction time.
    """

    def __init__(
        self,
        method,
        auth_handler=None,
        host='api.twitter.com',
        search_host='search.twitter.com',
        upload_host='upload.twitter.com',
        api_root='/1.1',
        search_root='',
        upload_root='/1.1',
        timeout=60,
    ):
        """Remember the connection settings and open a ``requests`` session.

        The search/upload hosts and roots are stored but not used by any
        method visible in this class.
        """
        self.session = requests.Session()
        self.method = method
        self.auth = auth_handler
        self.timeout = timeout
        self.host, self.api_root = host, api_root
        self.search_host, self.search_root = search_host, search_root
        self.upload_host, self.upload_root = upload_host, upload_root

    def full_url(self, request_path):
        """Return the absolute HTTPS URL for *request_path*."""
        return ''.join(('https://', self.host, self.api_root, request_path))

    def make_request(self, request_path):
        """Send the request for *request_path* and return the decoded JSON.

        The target URL is printed before the request is sent. Any failure
        while sending or decoding is re-raised as ``APIError`` with the
        original traceback.
        """
        signer = self.auth.apply_auth() if self.auth else None
        print(self.full_url(request_path))
        try:
            response = self.session.request(
                self.method,
                self.full_url(request_path),
                timeout=self.timeout,
                auth=signer,
            )
            return response.json()
        except Exception as error:
            six.reraise(APIError, APIError('Failed to send request: %s' % error), sys.exc_info()[2])

    def verify_credentials(self):
        """Request ``/account/verify_credentials.json``."""
        return self.make_request("/account/verify_credentials.json")

    def home_timeline(self):
        """Request ``/statuses/home_timeline.json``."""
        return self.make_request("/statuses/home_timeline.json")

    def user_timeline(self):
        """Request ``/statuses/user_timeline.json``."""
        return self.make_request("/statuses/user_timeline.json")
# _unit_id
# _golden
# _unit_state
# _trusted_judgments
# _last_judgment_at
# gender
# gender:confidence
# profile_yn
# profile_yn:confidence
# created
# description
# fav_number
# gender_gold
# link_color
# name
# profile_yn_gold
# profileimage
# retweet_count
# sidebar_color
# text
# tweet_coord
# tweet_count
# tweet_created
# tweet_id
# tweet_location
# user_timezone
def user_exists(db_cursor, user_id, table_name):
    """Check whether a row with ``unique_id == user_id`` exists.

    Args:
        db_cursor: Open database cursor providing ``execute(sql, params)``
            with ``%s`` placeholders, and ``fetchone()``.
        user_id: Identifier to look up; passed as a bound parameter so it
            is escaped by the driver rather than pasted into the SQL text.
        table_name: Table to query. Must consist only of ASCII letters,
            digits and underscores because identifiers cannot be bound as
            parameters.

    Returns:
        True if at least one matching row is found, otherwise False.

    Raises:
        ValueError: If *table_name* is not a plain identifier.
    """
    if not re.fullmatch(r'[A-Za-z0-9_]+', str(table_name)):
        raise ValueError('Invalid table name: {0!r}'.format(table_name))
    user_exists_sql = "SELECT unique_id FROM `{0}` WHERE unique_id=%s".format(table_name)
    db_cursor.execute(user_exists_sql, (user_id,))
    return db_cursor.fetchone() is not None
def connect_db():
    """Open and return a pymysql connection with the fixed local settings.

    The connection targets the ``twitter_gender_analysis`` database on
    ``localhost`` as ``root`` with an empty password, uses ``utf8mb4``,
    has autocommit enabled and hands out ``DictCursor`` cursors.
    """
    settings = dict(
        host='localhost',
        user='root',
        password='',
        db='twitter_gender_analysis',
        charset='utf8mb4',
        autocommit=True,
        cursorclass=pymysql.cursors.DictCursor,
    )
    return pymysql.connect(**settings)
def process_data_set(csv_file_path="/Users/prashuchaudhary/Downloads/twitter_gender_classification/gender_data.csv"):
    """Load the gender-classification CSV into the Kaggle tables.

    For every non-blank CSV row the user is inserted into ``kaggle_users``
    unless a row with the same ``_unit_id`` already exists, and the row's
    tweet is inserted into ``kaggle_tweets``. A failure on a single row is
    printed and the import continues with the next row.

    NOTE(review): the original indentation of this function was lost; the
    tweet INSERT is assumed to run for every row, not only for new users.

    Args:
        csv_file_path: Path of the UTF-8 encoded CSV file to import.
    """
    with open(csv_file_path, mode='r', encoding='utf-8') as csv_file:
        csv_reader = CustomDictReader(csv_file, skipinitialspace=True, restval="")
        # Connect only once the file and reader are ready, so a bad path
        # cannot leave an open database connection behind.
        connection = connect_db()
        try:
            with connection.cursor() as cursor:
                for row in csv_reader:
                    try:
                        if not user_exists(cursor, row['_unit_id'], 'kaggle_users'):
                            cursor.execute(KAGGLE_USER_INSERT, _kaggle_user_params(row))
                        cursor.execute(KAGGLE_TWEET_INSERT, _kaggle_tweet_params(row))
                    except Exception as exc:
                        # Best effort: report the bad row and keep going.
                        print("Error while populating row {row} :: {err}".format(row=row, err=exc))
        finally:
            connection.close()


def _kaggle_user_params(row):
    """Build the parameter tuple for ``KAGGLE_USER_INSERT`` from a CSV row."""
    return (
        row['_unit_id'],
        # Runs of non-ASCII characters are replaced by a single space.
        re.sub(r'[^\x00-\x7F]+', ' ', row['name']),
        row['gender'],
        re.sub(r'[^\x00-\x7F]+', ' ', row['description']),
        row['link_color'],
        row['sidebar_color'],
        row['fav_number'],
        row['retweet_count'],
        row['profileimage'],
        str(datetime.strptime(row['created'], "%m/%d/%y %H:%M")),
        row['user_timezone'],
    )


def _kaggle_tweet_params(row):
    """Build the parameter tuple for ``KAGGLE_TWEET_INSERT`` from a CSV row."""
    return (
        row['_unit_id'],
        re.sub(r'[^\x00-\x7F]+', ' ', row['text']),
        row['tweet_count'],
        str(datetime.strptime(row['tweet_created'], "%m/%d/%y %H:%M")),
        row['tweet_location'],
    )
def process_user_timeline_tweets():
    """Fetch the user timeline and store its users and tweets.

    The timeline is requested through ``API.user_timeline``. For each
    tweet the author is inserted into ``twitter_users`` unless a row with
    the same id already exists, and the tweet is inserted into
    ``twitter_tweets``. A failure on a single tweet is printed and
    processing continues with the next one.

    NOTE(review): the original indentation of this function was lost; the
    tweet INSERT is assumed to run for every tweet, not only for new users.
    """
    # NOTE(review): these credentials are hard-coded in source control.
    # They should be rotated and loaded from the environment or a secrets
    # store instead.
    twitter_oauth = TwitterOAuthHandler(
        consumer_key="DOjV7YFYSTlMeH6IWsHM4TfAJ",
        consumer_secret="fJFsdqK4bGhq9LoHNeuUXpWJROp84SStjJ1XHJlmr8iKJ0MjrY",
        access_token="488695513-OugEeB4xrDhfBrIYhr6N0rgRI9MmyNJSPamY2HxL",
        access_token_secret="zkxsmGoYL37BrCqESJexgEE9qW1QOXfaYAVIFssToDC0l"
    )
    api = API(method="GET", auth_handler=twitter_oauth)
    tweets = api.user_timeline()
    connection = connect_db()
    try:
        with connection.cursor() as cursor:
            for tweet in tweets:
                try:
                    if not user_exists(cursor, tweet['user']['id'], 'twitter_users'):
                        cursor.execute(TWITTER_USER_INSERT, _twitter_user_params(tweet['user']))
                    cursor.execute(TWITTER_TWEET_INSERT, _twitter_tweet_params(tweet))
                except Exception as exc:
                    # The handler used to reference an undefined ``row``,
                    # which raised NameError and aborted the whole run.
                    print("Error while populating row {row} :: {err}".format(row=tweet, err=exc))
    finally:
        connection.close()


def _twitter_timestamp(created_at):
    """Convert a Twitter ``created_at`` string to ``YYYY-MM-DD HH:MM:SS``."""
    parsed = datetime.strptime(created_at, "%a %b %d %H:%M:%S %z %Y")
    return parsed.strftime("%Y-%m-%d %H:%M:%S")


def _twitter_user_params(user):
    """Build the parameter tuple for ``TWITTER_USER_INSERT`` from a user dict."""
    return (
        user['id'],
        # Runs of non-ASCII characters are replaced by a single space.
        re.sub(r'[^\x00-\x7F]+', ' ', user['name']),
        '',
        # A missing (null) description would make re.sub raise TypeError.
        re.sub(r'[^\x00-\x7F]+', ' ', user['description'] or ''),
        user['profile_link_color'],
        user['profile_sidebar_border_color'],
        user['favourites_count'],
        0,
        user['profile_image_url'],
        _twitter_timestamp(user['created_at']),
        user['time_zone'],
    )


def _twitter_tweet_params(tweet):
    """Build the parameter tuple for ``TWITTER_TWEET_INSERT`` from a tweet dict."""
    place = tweet['place']
    if isinstance(place, dict):
        # A dict cannot be bound as a SQL parameter; store its readable name.
        # Assumes the place object exposes ``full_name`` - TODO confirm
        # against the Twitter API payload.
        place = place.get('full_name')
    return (
        tweet['user']['id'],
        re.sub(r'[^\x00-\x7F]+', ' ', tweet['text']),
        tweet['retweet_count'],
        _twitter_timestamp(tweet['created_at']),
        place,
    )
def main():
    """Run the CSV import, then the timeline import, in that order."""
    process_data_set()
    process_user_timeline_tweets()


if __name__ == '__main__':
    main()