Extracting information from ~4.6 million tweet records: Python runtime problem

Date: 2018-11-27 23:12:35

Tags: python pandas dataframe

I have been trying to extract the URLs from tweets, count how many redirections each URL goes through, and fetch the meta content of the final page the URL redirects to. [A tweet may contain more than one URL.]
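
For context, this is roughly what resolving a single link involves; a minimal sketch assuming the requests and BeautifulSoup libraries (the example URL is illustrative):

import requests
from bs4 import BeautifulSoup

url = 'http://t.co/example'  # illustrative shortened URL

r = requests.get(url, timeout=10)  # follows redirects by default
num_redirects = len(r.history)     # one entry per redirect hop
final_url = r.url                  # address of the page actually served

soup = BeautifulSoup(r.text, 'html.parser')
desc = soup.find('meta', attrs={'name': 'description'})
title = soup.find('title')
print(num_redirects, final_url,
      desc.get('content', '') if desc else '',
      title.text.strip() if title else '')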

I am running the code below in Python with pandas, reading the CSV in chunks; it has now been executing for 9 days. Do you have any suggestions for speeding it up?

import re
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

pd.options.mode.chained_assignment = None  # default='warn'
BOT_Data = pd.DataFrame()  # accumulates the processed chunks

for chunk in pd.read_csv('BotData.csv', chunksize=3000):
    Bot_Data1 = chunk
    ## get urls from tweet text
    Bot_Data1['base_urls'] = Bot_Data1['text'].apply(
        lambda row: re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', str(row)))
    Bot_Data1['urls'] = Bot_Data1['text'].apply(
        lambda row: re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+[^ |^#|^https|^http]*', str(row)))

    ## clean urls: strip the ellipsis character from truncated links
    for i in Bot_Data1.index:
        Bot_Data1['urls'][i] = [s.replace('…', ' ') for s in Bot_Data1['urls'][i]]
    ## get the average number of redirections for the URLs present
    Bot_Data1['final_url'] = ''
    Bot_Data1['avg_redirections'] = 0.0
    for loop in Bot_Data1.index:
        numURL = 0
        i = 0
        finalurl = []
        for url in Bot_Data1['urls'][loop]:
            numURL += 1
            i -= 1  # net effect: each URL contributes len(r.history) - 1 to the count
            f_url = ''
            try:
                r = requests.get(url)
                for h in r.history:  # one entry per redirect hop
                    i += 1
                    f_url = h.url
            except Exception as e:  # catches malformed-URL errors
                print(e)
            finalurl.append(f_url)
        Bot_Data1['final_url'][loop] = np.array(finalurl, dtype=object)
        if numURL != 0:
            Bot_Data1['avg_redirections'][loop] = i / numURL
        else:
            Bot_Data1['avg_redirections'][loop] = 0

    ## reduce each final URL back to its base form
    Bot_Data1['final_base_url'] = ''
    for loop in Bot_Data1.index:
        final_base_url = []
        for index, x in np.ndenumerate(Bot_Data1['final_url'][loop]):
            final_base_url += re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', x)
        Bot_Data1['final_base_url'][loop] = np.array(final_base_url, dtype=object)
    ## get url meta description content and title
    Bot_Data1['url_meta_content'] = ''
    for loop in Bot_Data1.index:
        metacontent = ''
        for url in Bot_Data1['final_base_url'][loop]:
            try:
                response = requests.get(url)
                soup = BeautifulSoup(response.text, 'html.parser')
                metas = soup.find_all('meta')
                title = soup.find('title')
                metacontent += " "
                # join the list of description contents into a single string
                metacontent += ''.join(map(str, [meta.attrs['content'] for meta in metas
                                                 if 'name' in meta.attrs
                                                 and meta.attrs['name'] == 'description']))
                metacontent += " "
                try:
                    metacontent += title.text.strip()
                except AttributeError as error:  # page has no <title>
                    print(error)
            except Exception as e:
                print(e)
        Bot_Data1['url_meta_content'][loop] = metacontent

    ## strip newlines from the scraped meta content
    Bot_Data1['url_meta_content'] = Bot_Data1['url_meta_content'].str.replace('\n', '')


    ## append the processed chunk to the accumulated result
    BOT_Data = pd.concat([BOT_Data, Bot_Data1])
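
For reference, I suspect most of the nine days is network I/O: every URL is fetched twice, one request at a time, on a fresh connection each time. Would something along these lines be the right direction, a shared requests.Session plus a thread pool so the lookups overlap? (A rough sketch; the helper name, pool size, and timeout are illustrative, not tested at this scale.)

from concurrent.futures import ThreadPoolExecutor
import requests

session = requests.Session()  # reuses TCP connections across calls

def resolve(url):
    # hypothetical helper: returns (redirect count, final URL) for one link
    try:
        r = session.get(url, timeout=10)
        return len(r.history), r.url
    except Exception:
        return 0, ''

urls = ['http://t.co/a', 'http://t.co/b']  # illustrative input
with ThreadPoolExecutor(max_workers=50) as pool:
    results = list(pool.map(resolve, urls))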

0 Answers

No answers