问题在于:
我正在编写一个python程序,其目的是不断从RSS提要中收集新闻。我希望程序收集数据1周。问题是该程序永远不会到达本周末。有时它会在运行几天后冻结,有时几个小时甚至几分钟。它总是冻结,没有错误。当我说冻结时,我的意思是解释器似乎仍在运行,因为我无法给它任何额外的命令。我怎么解决这个问题?
我将发布以下代码。谢谢你们!
from goose import Goose
from requests import get
import urllib2
import feedparser
from urllib2 import urlopen
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import datetime as dt
import time
import os
Symbols=['AAPL','T','BA','XOM','GOOG','JPM','PG','WMT']
url='http://finance.yahoo.com/rss/headline?s='
for t in xrange(7):
AAPL=pd.DataFrame()
AAPL['Published']=""
AAPL['Title']=""
AAPL['link']=""
AAPL['ID']=""
AAPL['News']=""
T=pd.DataFrame()
T['Published']=""
T['Title']=""
T['link']=""
T['ID']=""
T['News']=""
BA=pd.DataFrame()
BA['Published']=""
BA['Title']=""
BA['link']=""
BA['ID']=""
BA['News']=""
XOM=pd.DataFrame()
XOM['Published']=""
XOM['Title']=""
XOM['link']=""
XOM['ID']=""
XOM['News']=""
GOOG=pd.DataFrame()
GOOG['Published']=""
GOOG['Title']=""
GOOG['link']=""
GOOG['ID']=""
GOOG['News']=""
JPM=pd.DataFrame()
JPM['Published']=""
JPM['Title']=""
JPM['link']=""
JPM['ID']=""
JPM['News']=""
PG=pd.DataFrame()
PG['Published']=""
PG['Title']=""
PG['link']=""
PG['ID']=""
PG['News']=""
WMT=pd.DataFrame()
WMT['Published']=""
WMT['Title']=""
WMT['link']=""
WMT['ID']=""
WMT['News']=""
DaysIDsAAPL=[]
DaysIDsT=[]
DaysIDsBA=[]
DaysIDsXOM=[]
DaysIDsGOOG=[]
DaysIDsJPM=[]
DaysIDsPG=[]
DaysIDsWMT=[]
count=0
AAPLCount=0
TCount=0
BACount=0
XOMCount=0
GOOGCount=0
JPMCount=0
PGCount=0
WMTCount=0
date=dt.date.today()
newpathAAPL = r'D:\News Data\AAPL\\'+str(t)
newpathT = r'D:\News Data\T\\'+str(t)
newpathBA = r'D:\News Data\BA\\'+str(t)
newpathXOM = r'D:\News Data\XOM\\'+str(t)
newpathGOOG = r'D:\News Data\GOOG\\'+str(t)
newpathJPM = r'D:\News Data\JPM\\'+str(t)
newpathPG = r'D:\News Data\PG\\'+str(t)
newpathWMT = r'D:\News Data\WMT\\'+str(t)
os.makedirs(newpathAAPL)
os.makedirs(newpathT)
os.makedirs(newpathBA)
os.makedirs(newpathXOM)
os.makedirs(newpathGOOG)
os.makedirs(newpathJPM)
os.makedirs(newpathPG)
os.makedirs(newpathWMT)
while dt.date.today()==date:
print "Loop"
try:
#AAPL inner most loop
d1=feedparser.parse(url+Symbols[0])
for x in xrange(len(d1['entries'])):
if int(d1.entries[x]['id'][14:]) not in DaysIDsAAPL:
DaysIDsAAPL.append(int(d1.entries[x]['id'][14:]))
y = len(AAPL.index.tolist())
m=re.search(r'\*(.*)',d1.entries[x]['link'])
z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
if type(z) is not None:
m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
AAPL.loc[y,'Title'] =d1.entries[x]['title'].encode('utf8')
AAPL.loc[y,'link'] =m.encode('utf8')
AAPL.loc[y,'Published'] =d1.entries[x]['published'].encode('utf8')
AAPL.loc[y,'ID'] =int(d1.entries[x]['id'][14:])
hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
page = get(m,headers=hdr)
extractor = Goose()
article = extractor.extract(raw_html=page.text)
text = article.cleaned_text.encode('utf8')
if text == '':
try:
url2 = m
req = urllib2.Request(url2, None, hdr)
html2 = urlopen(req).read().decode('utf8')
raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
Text_file = open(newpathAAPL+r"\\"+str(AAPLCount)+".txt", "w")
Text_file.write(raw)
Text_file.close()
AAPL.loc[y,'News'] = AAPLCount
AAPLCount+=1
AAPL=AAPL.fillna("")
AAPL.to_csv(newpathAAPL+r'\Key.csv')
except:
print m
print "AAPL"
else:
Text_file = open(newpathAAPL+r"\\"+str(AAPLCount)+".txt", "w")
Text_file.write(text)
Text_file.close()
AAPL.loc[y,'News'] =AAPLCount
AAPLCount+=1
AAPL=AAPL.fillna("")
AAPL.to_csv(newpathAAPL+r'\Key.csv')
print "AAPL"
#T inner most loop
d2=feedparser.parse(url+Symbols[1])
for x in xrange(len(d2['entries'])):
if int(d2.entries[x]['id'][14:]) not in DaysIDsT:
DaysIDsT.append(int(d2.entries[x]['id'][14:]))
y = len(T.index.tolist())
m=re.search(r'\*(.*)',d2.entries[x]['link'])
z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
if type(z) is not None:
m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
T.loc[y,'Title'] =d2.entries[x]['title'].encode('utf8')
T.loc[y,'link'] =m.encode('utf8')
T.loc[y,'Published'] =d2.entries[x]['published'].encode('utf8')
T.loc[y,'ID'] =int(d2.entries[x]['id'][14:])
hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
page = get(m,headers=hdr)
extractor = Goose()
article = extractor.extract(raw_html=page.text)
text = article.cleaned_text.encode('utf8')
if text == '':
try:
url2 = m
req = urllib2.Request(url2, None, hdr)
html2 = urlopen(req).read().decode('utf8')
raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
Text_file = open(newpathT+r"\\"+str(TCount)+".txt", "w")
Text_file.write(raw)
Text_file.close()
T.loc[y,'News'] = TCount
TCount+=1
T=T.fillna("")
T.to_csv(newpathT+r'\Key.csv')
except:
print m
print "T"
else:
Text_file = open(newpathT+r"\\"+str(TCount)+".txt", "w")
Text_file.write(text)
Text_file.close()
T.loc[y,'News'] =TCount
TCount+=1
T=T.fillna("")
T.to_csv(newpathT+r'\Key.csv')
print "T"
#BA inner most loop
d3=feedparser.parse(url+Symbols[2])
for x in xrange(len(d3['entries'])):
if int(d3.entries[x]['id'][14:]) not in DaysIDsBA:
DaysIDsBA.append(int(d3.entries[x]['id'][14:]))
y = len(BA.index.tolist())
m=re.search(r'\*(.*)',d3.entries[x]['link'])
z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
if type(z) is not None:
m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
BA.loc[y,'Title'] =d3.entries[x]['title'].encode('utf8')
BA.loc[y,'link'] =m.encode('utf8')
BA.loc[y,'Published'] =d3.entries[x]['published'].encode('utf8')
BA.loc[y,'ID'] =int(d3.entries[x]['id'][14:])
hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
page = get(m,headers=hdr)
extractor = Goose()
article = extractor.extract(raw_html=page.text)
text = article.cleaned_text.encode('utf8')
if text == '':
try:
url2 = m
req = urllib2.Request(url2, None, hdr)
html2 = urlopen(req).read().decode('utf8')
raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
Text_file = open(newpathBA+r"\\"+str(BACount)+".txt", "w")
Text_file.write(raw)
Text_file.close()
BA.loc[y,'News'] = BACount
BACount+=1
BA=BA.fillna("")
BA.to_csv(newpathBA+r'\Key.csv')
except:
print m
print "BA"
else:
Text_file = open(newpathBA+r"\\"+str(BACount)+".txt", "w")
Text_file.write(text)
Text_file.close()
BA.loc[y,'News'] =BACount
BACount+=1
BA=BA.fillna("")
BA.to_csv(newpathBA+r'\Key.csv')
print "BA"
#XOM inner most loop
d4=feedparser.parse(url+Symbols[3])
for x in xrange(len(d4['entries'])):
if int(d4.entries[x]['id'][14:]) not in DaysIDsXOM:
DaysIDsXOM.append(int(d4.entries[x]['id'][14:]))
y = len(XOM.index.tolist())
m=re.search(r'\*(.*)',d4.entries[x]['link'])
z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
if type(z) is not None:
m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
XOM.loc[y,'Title'] =d4.entries[x]['title'].encode('utf8')
XOM.loc[y,'link'] =m.encode('utf8')
XOM.loc[y,'Published'] =d4.entries[x]['published'].encode('utf8')
XOM.loc[y,'ID'] =int(d4.entries[x]['id'][14:])
hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
page = get(m,headers=hdr)
extractor = Goose()
article = extractor.extract(raw_html=page.text)
text = article.cleaned_text.encode('utf8')
if text == '':
try:
url2 = m
req = urllib2.Request(url2, None, hdr)
html2 = urlopen(req).read().decode('utf8')
raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
Text_file = open(newpathXOM+r"\\"+str(XOMCount)+".txt", "w")
Text_file.write(raw)
Text_file.close()
XOM.loc[y,'News'] = XOMCount
XOMCount+=1
XOM=XOM.fillna("")
XOM.to_csv(newpathXOM+r'\Key.csv')
except:
print m
print "XOM"
else:
Text_file = open(newpathXOM+r"\\"+str(XOMCount)+".txt", "w")
Text_file.write(text)
Text_file.close()
XOM.loc[y,'News'] =XOMCount
XOMCount+=1
XOM=XOM.fillna("")
XOM.to_csv(newpathXOM+r'\Key.csv')
#GOOG inner most loop
d5=feedparser.parse(url+Symbols[4])
for x in xrange(len(d5['entries'])):
if int(d5.entries[x]['id'][14:]) not in DaysIDsGOOG:
DaysIDsGOOG.append(int(d5.entries[x]['id'][14:]))
y = len(GOOG.index.tolist())
m=re.search(r'\*(.*)',d5.entries[x]['link'])
z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
if type(z) is not None:
m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
GOOG.loc[y,'Title'] =d5.entries[x]['title'].encode('utf8')
GOOG.loc[y,'link'] =m.encode('utf8')
GOOG.loc[y,'Published'] =d5.entries[x]['published'].encode('utf8')
GOOG.loc[y,'ID'] =int(d5.entries[x]['id'][14:])
hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
page = get(m,headers=hdr)
extractor = Goose()
article = extractor.extract(raw_html=page.text)
text = article.cleaned_text.encode('utf8')
if text == '':
try:
url2 = m
req = urllib2.Request(url2, None, hdr)
html2 = urlopen(req).read().decode('utf8')
raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
Text_file = open(newpathGOOG+r"\\"+str(GOOGCount)+".txt", "w")
Text_file.write(raw)
Text_file.close()
GOOG.loc[y,'News'] = GOOGCount
GOOGCount+=1
GOOG=GOOG.fillna("")
GOOG.to_csv(newpathGOOG+r'\Key.csv')
except:
print m
print "GOOG"
else:
Text_file = open(newpathGOOG+r"\\"+str(GOOGCount)+".txt", "w")
Text_file.write(text)
Text_file.close()
GOOG.loc[y,'News'] =GOOGCount
GOOGCount+=1
GOOG=GOOG.fillna("")
GOOG.to_csv(newpathGOOG+r'\Key.csv')
print "GOOG"
#JPM inner most loop
d6=feedparser.parse(url+Symbols[5])
for x in xrange(len(d6['entries'])):
if int(d6.entries[x]['id'][14:]) not in DaysIDsJPM:
DaysIDsJPM.append(int(d6.entries[x]['id'][14:]))
y = len(JPM.index.tolist())
m=re.search(r'\*(.*)',d6.entries[x]['link'])
z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
if type(z) is not None:
m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
JPM.loc[y,'Title'] =d6.entries[x]['title'].encode('utf8')
JPM.loc[y,'link'] =m.encode('utf8')
JPM.loc[y,'Published'] =d6.entries[x]['published'].encode('utf8')
JPM.loc[y,'ID'] =int(d6.entries[x]['id'][14:])
hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
page = get(m,headers=hdr)
extractor = Goose()
article = extractor.extract(raw_html=page.text)
text = article.cleaned_text.encode('utf8')
if text == '':
try:
url2 = m
req = urllib2.Request(url2, None, hdr)
html2 = urlopen(req).read().decode('utf8')
raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
Text_file = open(newpathJPM+r"\\"+str(JPMCount)+".txt", "w")
Text_file.write(raw)
Text_file.close()
JPM.loc[y,'News'] = JPMCount
JPMCount+=1
JPM=JPM.fillna("")
JPM.to_csv(newpathJPM+r'\Key.csv')
except:
print m
print "JPM"
else:
Text_file = open(newpathJPM+r"\\"+str(JPMCount)+".txt", "w")
Text_file.write(text)
Text_file.close()
JPM.loc[y,'News'] =JPMCount
JPMCount+=1
JPM=JPM.fillna("")
JPM.to_csv(newpathJPM+r'\Key.csv')
print "JPM"
#PG inner most loop
d7=feedparser.parse(url+Symbols[6])
for x in xrange(len(d7['entries'])):
if int(d7.entries[x]['id'][14:]) not in DaysIDsPG:
DaysIDsPG.append(int(d7.entries[x]['id'][14:]))
y = len(PG.index.tolist())
m=re.search(r'\*(.*)',d7.entries[x]['link'])
z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
if type(z) is not None:
m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
PG.loc[y,'Title'] =d7.entries[x]['title'].encode('utf8')
PG.loc[y,'link'] =m.encode('utf8')
PG.loc[y,'Published'] =d7.entries[x]['published'].encode('utf8')
PG.loc[y,'ID'] =int(d7.entries[x]['id'][14:])
hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
page = get(m,headers=hdr)
extractor = Goose()
article = extractor.extract(raw_html=page.text)
text = article.cleaned_text.encode('utf8')
if text == "":
try:
url2 = m
req = urllib2.Request(url2, None, hdr)
html2 = urlopen(req).read().decode('utf8')
raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
Text_file = open(newpathPG+r"\\"+str(PGCount)+".txt", "w")
Text_file.write(raw)
Text_file.close()
PG.loc[y,'News'] = PGCount
PGCount+=1
PG=PG.fillna("")
PG.to_csv(newpathPG+r'\Key.csv')
except:
print m
print "PG"
else:
Text_file = open(newpathPG+r"\\"+str(PGCount)+".txt", "w")
Text_file.write(text)
Text_file.close()
PG.loc[y,'News'] =PGCount
PGCount+=1
PG=PG.fillna("")
PG.to_csv(newpathPG+r'\Key.csv')
print "PG"
#WMT inner most loop
d8=feedparser.parse(url+Symbols[7])
for x in xrange(len(d8['entries'])):
if int(d8.entries[x]['id'][14:]) not in DaysIDsWMT:
DaysIDsWMT.append(int(d8.entries[x]['id'][14:]))
y = len(WMT.index.tolist())
m=re.search(r'\*(.*)',d8.entries[x]['link'])
z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
if type(z) is not None:
m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
WMT.loc[y,'Title'] =d8.entries[x]['title'].encode('utf8')
WMT.loc[y,'link'] =m.encode('utf8')
WMT.loc[y,'Published'] =d8.entries[x]['published'].encode('utf8')
WMT.loc[y,'ID'] =int(d8.entries[x]['id'][14:])
hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
page = get(m,headers=hdr)
extractor = Goose()
article = extractor.extract(raw_html=page.text)
text = article.cleaned_text.encode('utf8')
if text == "":
try:
url2 = m
req = urllib2.Request(url2, None, hdr)
html2 = urlopen(req).read().decode('utf8')
raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
Text_file = open(newpathWMT+r"\\"+str(WMTCount)+".txt", "w")
Text_file.write(raw)
Text_file.close()
WMT.loc[y,'News'] = WMTCount
WMTCount+=1
WMT=WMT.fillna("")
WMT.to_csv(newpathWMT+r'\Key.csv')
except:
print m
print "WMT"
else:
Text_file = open(newpathWMT+r"\\"+str(WMTCount)+".txt", "w")
Text_file.write(text)
Text_file.close()
WMT.loc[y,'News'] =WMTCount
WMTCount+=1
WMT=WMT.fillna("")
WMT.to_csv(newpathWMT+r'\Key.csv')
print "WMT"
count+=1
print count
time.sleep(1)
except:
print "Error"
AAPL=AAPL.fillna("")
AAPL.to_csv(newpathAAPL+r'\Key.csv')
T=T.fillna("")
T.to_csv(newpathT+r'\Key.csv')
BA=BA.fillna("")
BA.to_csv(newpathBA+r'\Key.csv')
XOM=XOM.fillna("")
XOM.to_csv(newpathXOM+r'\Key.csv')
GOOG=GOOG.fillna("")
GOOG.to_csv(newpathGOOG+r'\Key.csv')
JPM=JPM.fillna("")
JPM.to_csv(newpathJPM+r'\Key.csv')
PG=PG.fillna("")
PG.to_csv(newpathPG+r'\Key.csv')
WMT=WMT.fillna("")
WMT.to_csv(newpathWMT+r'\Key.csv')
答案 0 :(得分:1)
在收集过多的Feed或者系统上有其他活动进程时会消耗太多RAM(这就是冻结时间不同的原因),请参阅Why does a simple python script crash my system
程序运行的过程在进程内存中存储用于计算的数组和变量,这是ram
您可以通过强制程序使用硬盘内存来解决此问题。
对于变通方法(shelve
,定期将收集的Feed保存到文本文件(将信息从ram移动到rom并释放ram),...)请参阅以下链接
memory usage, how to free memory
Python large variable RAM usage
I need to free up RAM by storing a Python dictionary on the hard drive, not in RAM. Is it possible?