我一直在尝试从tweet中提取URL,检查重定向的次数,以及URL重定向到的最终URL页面的元内容。 [一条推文可能包含多个URL]
我已经使用pandas在Python中编写了如下代码,并按块(chunk)拆分数据处理,但这段代码已经连续运行了9天还没有结束。您有什么建议可以加快速度吗?
for chunk in pd.read_csv('BotData.csv', chunksize=3000):
Bot_Data1 = chunk
pd.options.mode.chained_assignment = None # default='warn'
##get urls from tweet text
Bot_Data1['base_urls']=Bot_Data1['text'].apply(lambda
row:re.findall('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+',str(row)))
Bot_Data1['urls']=Bot_Data1['text'].apply(lambda
row:re.findall('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+[^
|^#|^https|^http]*',str(row)))
##get avg number of retweets for the number of URLs present
#clean urls
for i, value in enumerate(Bot_Data1['urls']):
Bot_Data1['urls'][i]= [s.replace('…', ' ') for s in value]
for loop in range(min(Bot_Data1.index),max(Bot_Data1.index)):
numURL=0
i=0
finalurl=[]
for url in Bot_Data1['urls'][loop][:]:
numURL=numURL+1
i=i-1
f_url=''
# print(url)
try:
r = requests.get(url)
for h in r.history:
i=i+1
f_url=h.url
except Exception as e : # Catches wrong url error
print(e)
finalurl.append(f_url)
Bot_Data1['final_url'][loop]=np.array(finalurl,dtype=object)
if numURL!=0:
Bot_Data1['avg_redirections'][loop]=i/numURL
else:
Bot_Data1['avg_redirections'][loop]=0
for loop in range(min(Bot_Data1.index),max(Bot_Data1.index)):
final_base_url=[]
for index, x in np.ndenumerate(Bot_Data1['final_url'][loop]):
final_base_url=final_base_url+(re.findall('https?://(?:[-\w.]|(?:%
[\da-fA-F]{2}))+',x))
Bot_Data1['final_base_url'][loop]=np.array(final_base_url,dtype=object)
##
##get url meta description content and title
import requests
import requests.exceptions
from bs4 import BeautifulSoup
Bot_Data1['url_meta_content']=''
for loop in range(min(Bot_Data1.index),max(Bot_Data1.index)):
metacontent=''
for i in range(0,len(Bot_Data1['final_base_url'][loop])):
url=Bot_Data1['final_base_url'][loop][i]
# print(url)
try:
response = requests.get(url)
soup = BeautifulSoup(response.text)
metas = soup.find_all('meta')
title = soup.find('title')
metacontent+=" "
metacontent+=''.join(map(str, [meta.attrs['content']
for meta in metas if 'name' in meta.attrs and meta.attrs['name']
== 'description'])) #converting the meta content to string as
the output is a list
# print([ meta.attrs['content'] for meta in metas if 'name' in
meta.attrs and meta.attrs['name'] == 'description' ])
# print(title.string)
metacontent+=" "
try:
metacontent+=title.text.strip()
except AttributeError as error:
# Output expected AttributeErrors.
print(error)
except Exception as e:
print(e)
i=i-1
Bot_Data1['url_meta_content'][loop]=metacontent
Bot_Data1['url_meta_content']=Bot_Data1['url_meta_content'].replace("/n","")
concatChunk=[BOT_Data,Bot_Data1]
BOT_Data=pd.concat(concatChunk)
##