我有以下格式的约500万条推文:
console.error
我已将它们以文本(text)数据类型导入 MySQL 数据库,现在我想逐行把它们取出来并进行清理,以便只保留我需要的数据。
{"created_at":"Mon May 21 05:40:26 +0000 2018","id":998438346987683840,"id_str":"998438346987683840","text":"sometext","display_text_range":[0,0],"source":"u003ca href="someURL" rel="nofollow"u003eTwitter for iPhoneu003c/au003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1062745482,"id_str":"1062745482","name":"u3074u30fc","screen_name":"maga12171","location":null,"url":null,"description":"u3068u3063u3066u3082u30aau30c8u30cau3067u3059 u706bu661fu4ebauff08uff0buff09 u7652u3057u306fu95a2u30b8u30e3u30cb u6c34u66dcu3069u3046u3067u3057u3087u3046","translator_type":"none","protected":false,"verified":false,"followers_count":4,"friends_count":23,"listed_count":0,"favourites_count":977,"statuses_count":238,"created_at":"Sat Jan 05 11:09:11 +0000 2013","utc_offset":32400,"time_zone":"Tokyo","geo_enabled":false,"lang":"ja","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"someURL","profile_background_image_url_https":"https://abs.twimg.com/images/themes/theme1/bg.png","profile_background_tile":false,"profile_link_color":"0066FF","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"000000","profile_text_color":"000000","profile_use_background_image":false,"profile_image_url":"http://pbs.twimg.com/profile_images/874952211184222209/UZ8RcGuU_normal.jpg","profile_image_url_https":"someURL","profile_banner_url":"someURL","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"quote_count":0,"reply_count":0,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"symbols":[],"media":[{"id":998438345532325888,"id_str":"998438345532325888","indices":[0,23],"media_url":"someURL","
media_url_https":"someURL","url":"someURL","display_url":"pic.twitter.com/J1RJGazs8k","expanded_url":"someURL","type":"photo","sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":750,"h":1334,"resize":"fit"},"small":{"w":382,"h":680,"resize":"fit"},"medium":{"w":675,"h":1200,"resize":"fit"}}}]},"extended_entities":{"media":[{"id":998438345532325888,"id_str":"998438345532325888","indices":[0,23],"media_url":"someURL","media_url_https":"someURL","url":"someURL","display_url":"pic.twitter.com/J1RJGazs8k","expanded_url":"someURL","type":"photo","sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":750,"h":1334,"resize":"fit"},"small":{"w":382,"h":680,"resize":"fit"},"medium":{"w":675,"h":1200,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"und","timestamp_ms":"1526881226666"}
我使用的代码如下,运行后得到的错误信息附在代码之后:
import json
import MySQLdb.cursors

# Connection settings (left blank in the original post).
HOST = ""
USER = ""
PASSWD = ""
DB = ""

# SSCursor is MySQLdb's server-side cursor class, selected here via cursorclass.
conn = MySQLdb.connect(
    host=HOST,
    port=3306,
    user=USER,
    passwd=PASSWD,
    db=DB,
    cursorclass=MySQLdb.cursors.SSCursor,
)
try:
    x = conn.cursor()
    try:
        query = "SELECT txt_column FROM txt_data limit 1"
        x.execute(query)
        for row in x:
            print(row)
            # Each row is a 1-tuple holding the raw tweet text; json.loads
            # raises ValueError when that text is not valid JSON.
            res = json.loads(row[0])
    finally:
        # Release the cursor even if parsing fails.
        x.close()
finally:
    # Always close the connection, including on error.
    conn.close()
运行时得到如下错误:ValueError: Expecting ',' delimiter: line 1 column 185 (char 184)
该位置对应的是上面附加的 json 字符串中的一个 " 字符。stackoverflow 上的一些帖子建议使用 .replace() 去掉 " 字符,但这会破坏 json 格式。
我认为问题在于 python 期望的格式类似 "attribute_name":"data"。当它遇到 "attribute_name":"data "more data in quotes" more data" 这样的内容(即值内部还有带引号的数据)时,就会因为意外出现的 " 字符而抛出错误。
如果我的判断是对的,有什么办法可以解决这个问题吗?
请注意,由于 stackoverflow 不允许贴 URL,我不得不修改附加的 json 示例,把所有 URL 都替换成了 "someURL"。因此,您无法在示例的 char 184 处找到这个错误。在原始数据中,char 184 是 href="someURL" 里的第一个 " 字符。
答案 0 :(得分:0)