Question

问题在于：

我正在编写一个python程序，其目的是不断从RSS提要中收集新闻。我希望程序收集数据1周。问题是该程序永远不会到达本周末。有时它会在运行几天后冻结，有时几个小时甚至几分钟。它总是冻结，没有错误。当我说冻结时，我的意思是解释器似乎仍在运行，因为我无法给它任何额外的命令。我怎么解决这个问题？

我将发布以下代码。谢谢你们！

from goose import Goose
from requests import get
import urllib2
import feedparser
from urllib2 import urlopen
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import datetime as dt
import time
import os
Symbols=['AAPL','T','BA','XOM','GOOG','JPM','PG','WMT']
url='http://finance.yahoo.com/rss/headline?s='

for t in xrange(7):
    AAPL=pd.DataFrame()
    AAPL['Published']=""
    AAPL['Title']=""
    AAPL['link']=""
    AAPL['ID']=""
    AAPL['News']=""

    T=pd.DataFrame()
    T['Published']=""
    T['Title']=""
    T['link']=""
    T['ID']=""
    T['News']=""

    BA=pd.DataFrame()
    BA['Published']=""
    BA['Title']=""
    BA['link']=""
    BA['ID']=""
    BA['News']=""

    XOM=pd.DataFrame()
    XOM['Published']=""
    XOM['Title']=""
    XOM['link']=""
    XOM['ID']=""
    XOM['News']=""

    GOOG=pd.DataFrame()
    GOOG['Published']=""
    GOOG['Title']=""
    GOOG['link']=""
    GOOG['ID']=""
    GOOG['News']=""

    JPM=pd.DataFrame()
    JPM['Published']=""
    JPM['Title']=""
    JPM['link']=""
    JPM['ID']=""
    JPM['News']=""

    PG=pd.DataFrame()
    PG['Published']=""
    PG['Title']=""
    PG['link']=""
    PG['ID']=""
    PG['News']=""

    WMT=pd.DataFrame()
    WMT['Published']=""
    WMT['Title']=""
    WMT['link']=""
    WMT['ID']=""
    WMT['News']=""




    DaysIDsAAPL=[]
    DaysIDsT=[]
    DaysIDsBA=[]
    DaysIDsXOM=[]
    DaysIDsGOOG=[]
    DaysIDsJPM=[]
    DaysIDsPG=[]
    DaysIDsWMT=[]



    count=0

    AAPLCount=0
    TCount=0
    BACount=0
    XOMCount=0
    GOOGCount=0
    JPMCount=0
    PGCount=0
    WMTCount=0

    date=dt.date.today()

    newpathAAPL = r'D:\News Data\AAPL\\'+str(t)
    newpathT = r'D:\News Data\T\\'+str(t)
    newpathBA = r'D:\News Data\BA\\'+str(t)
    newpathXOM = r'D:\News Data\XOM\\'+str(t)
    newpathGOOG = r'D:\News Data\GOOG\\'+str(t)
    newpathJPM = r'D:\News Data\JPM\\'+str(t)
    newpathPG = r'D:\News Data\PG\\'+str(t)
    newpathWMT = r'D:\News Data\WMT\\'+str(t)
    os.makedirs(newpathAAPL)
    os.makedirs(newpathT)
    os.makedirs(newpathBA)
    os.makedirs(newpathXOM)
    os.makedirs(newpathGOOG)
    os.makedirs(newpathJPM)
    os.makedirs(newpathPG)
    os.makedirs(newpathWMT)
    while dt.date.today()==date:
        print "Loop"
        try:
        #AAPL inner most loop
            d1=feedparser.parse(url+Symbols[0])  
            for x in xrange(len(d1['entries'])):
                if int(d1.entries[x]['id'][14:]) not in DaysIDsAAPL:
                    DaysIDsAAPL.append(int(d1.entries[x]['id'][14:]))
                    y = len(AAPL.index.tolist())
                    m=re.search(r'\*(.*)',d1.entries[x]['link'])
                    z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
                    if type(z) is not None:
                        m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
                    AAPL.loc[y,'Title'] =d1.entries[x]['title'].encode('utf8')
                    AAPL.loc[y,'link'] =m.encode('utf8')
                    AAPL.loc[y,'Published'] =d1.entries[x]['published'].encode('utf8')
                    AAPL.loc[y,'ID'] =int(d1.entries[x]['id'][14:])
                    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
                    page = get(m,headers=hdr)
                    extractor = Goose()
                    article = extractor.extract(raw_html=page.text)
                    text = article.cleaned_text.encode('utf8')
                    if text == '':
                        try:
                            url2 = m
                            req = urllib2.Request(url2, None, hdr)
                            html2 = urlopen(req).read().decode('utf8')
                            raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
                            Text_file = open(newpathAAPL+r"\\"+str(AAPLCount)+".txt", "w")
                            Text_file.write(raw)
                            Text_file.close()
                            AAPL.loc[y,'News'] = AAPLCount
                            AAPLCount+=1
                            AAPL=AAPL.fillna("")
                            AAPL.to_csv(newpathAAPL+r'\Key.csv')
                        except:
                            print m
                            print "AAPL"
                    else:
                        Text_file = open(newpathAAPL+r"\\"+str(AAPLCount)+".txt", "w")
                        Text_file.write(text)
                        Text_file.close()
                        AAPL.loc[y,'News'] =AAPLCount
                        AAPLCount+=1
                        AAPL=AAPL.fillna("")
                        AAPL.to_csv(newpathAAPL+r'\Key.csv')
                    print "AAPL"

            #T inner most loop
            d2=feedparser.parse(url+Symbols[1])

            for x in xrange(len(d2['entries'])):
                if int(d2.entries[x]['id'][14:]) not in DaysIDsT:
                    DaysIDsT.append(int(d2.entries[x]['id'][14:]))
                    y = len(T.index.tolist())
                    m=re.search(r'\*(.*)',d2.entries[x]['link'])
                    z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
                    if type(z) is not None:
                        m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
                    T.loc[y,'Title'] =d2.entries[x]['title'].encode('utf8')
                    T.loc[y,'link'] =m.encode('utf8')
                    T.loc[y,'Published'] =d2.entries[x]['published'].encode('utf8')
                    T.loc[y,'ID'] =int(d2.entries[x]['id'][14:])
                    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
                    page = get(m,headers=hdr)
                    extractor = Goose()
                    article = extractor.extract(raw_html=page.text)
                    text = article.cleaned_text.encode('utf8')
                    if text == '':
                        try:
                            url2 = m
                            req = urllib2.Request(url2, None, hdr)
                            html2 = urlopen(req).read().decode('utf8')
                            raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
                            Text_file = open(newpathT+r"\\"+str(TCount)+".txt", "w")
                            Text_file.write(raw)
                            Text_file.close()
                            T.loc[y,'News'] = TCount
                            TCount+=1
                            T=T.fillna("")
                            T.to_csv(newpathT+r'\Key.csv')
                        except:
                            print m
                            print "T"
                    else:
                        Text_file = open(newpathT+r"\\"+str(TCount)+".txt", "w")
                        Text_file.write(text)
                        Text_file.close()
                        T.loc[y,'News'] =TCount
                        TCount+=1
                        T=T.fillna("")
                        T.to_csv(newpathT+r'\Key.csv')
                    print "T"

            #BA inner most loop
            d3=feedparser.parse(url+Symbols[2])

            for x in xrange(len(d3['entries'])):
                if int(d3.entries[x]['id'][14:]) not in DaysIDsBA:
                    DaysIDsBA.append(int(d3.entries[x]['id'][14:]))
                    y = len(BA.index.tolist())
                    m=re.search(r'\*(.*)',d3.entries[x]['link'])
                    z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
                    if type(z) is not None:
                        m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
                    BA.loc[y,'Title'] =d3.entries[x]['title'].encode('utf8')
                    BA.loc[y,'link'] =m.encode('utf8')
                    BA.loc[y,'Published'] =d3.entries[x]['published'].encode('utf8')
                    BA.loc[y,'ID'] =int(d3.entries[x]['id'][14:])
                    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
                    page = get(m,headers=hdr)
                    extractor = Goose()
                    article = extractor.extract(raw_html=page.text)
                    text = article.cleaned_text.encode('utf8')
                    if text == '':
                        try:
                            url2 = m
                            req = urllib2.Request(url2, None, hdr)
                            html2 = urlopen(req).read().decode('utf8')
                            raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
                            Text_file = open(newpathBA+r"\\"+str(BACount)+".txt", "w")
                            Text_file.write(raw)
                            Text_file.close()
                            BA.loc[y,'News'] = BACount
                            BACount+=1
                            BA=BA.fillna("")
                            BA.to_csv(newpathBA+r'\Key.csv')
                        except:
                            print m
                            print "BA"
                    else:
                        Text_file = open(newpathBA+r"\\"+str(BACount)+".txt", "w")
                        Text_file.write(text)
                        Text_file.close()
                        BA.loc[y,'News'] =BACount
                        BACount+=1
                        BA=BA.fillna("")
                        BA.to_csv(newpathBA+r'\Key.csv')
                    print "BA"

            #XOM inner most loop
            d4=feedparser.parse(url+Symbols[3])

            for x in xrange(len(d4['entries'])):
                if int(d4.entries[x]['id'][14:]) not in DaysIDsXOM:
                    DaysIDsXOM.append(int(d4.entries[x]['id'][14:]))
                    y = len(XOM.index.tolist())
                    m=re.search(r'\*(.*)',d4.entries[x]['link'])
                    z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
                    if type(z) is not None:
                        m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
                    XOM.loc[y,'Title'] =d4.entries[x]['title'].encode('utf8')
                    XOM.loc[y,'link'] =m.encode('utf8')
                    XOM.loc[y,'Published'] =d4.entries[x]['published'].encode('utf8')
                    XOM.loc[y,'ID'] =int(d4.entries[x]['id'][14:])
                    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
                    page = get(m,headers=hdr)
                    extractor = Goose()
                    article = extractor.extract(raw_html=page.text)
                    text = article.cleaned_text.encode('utf8')
                    if text == '':
                        try:
                            url2 = m
                            req = urllib2.Request(url2, None, hdr)
                            html2 = urlopen(req).read().decode('utf8')
                            raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
                            Text_file = open(newpathXOM+r"\\"+str(XOMCount)+".txt", "w")
                            Text_file.write(raw)
                            Text_file.close()
                            XOM.loc[y,'News'] = XOMCount
                            XOMCount+=1
                            XOM=XOM.fillna("")
                            XOM.to_csv(newpathXOM+r'\Key.csv')
                        except:
                            print m
                            print "XOM"
                    else:
                        Text_file = open(newpathXOM+r"\\"+str(XOMCount)+".txt", "w")
                        Text_file.write(text)
                        Text_file.close()
                        XOM.loc[y,'News'] =XOMCount
                        XOMCount+=1
                        XOM=XOM.fillna("")
                        XOM.to_csv(newpathXOM+r'\Key.csv')

            #GOOG inner most loop
            d5=feedparser.parse(url+Symbols[4])

            for x in xrange(len(d5['entries'])):
                if int(d5.entries[x]['id'][14:]) not in DaysIDsGOOG:
                    DaysIDsGOOG.append(int(d5.entries[x]['id'][14:]))
                    y = len(GOOG.index.tolist())
                    m=re.search(r'\*(.*)',d5.entries[x]['link'])
                    z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
                    if type(z) is not None:
                        m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
                    GOOG.loc[y,'Title'] =d5.entries[x]['title'].encode('utf8')
                    GOOG.loc[y,'link'] =m.encode('utf8')
                    GOOG.loc[y,'Published'] =d5.entries[x]['published'].encode('utf8')
                    GOOG.loc[y,'ID'] =int(d5.entries[x]['id'][14:])
                    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
                    page = get(m,headers=hdr)
                    extractor = Goose()
                    article = extractor.extract(raw_html=page.text)
                    text = article.cleaned_text.encode('utf8')
                    if text == '':
                        try:
                            url2 = m
                            req = urllib2.Request(url2, None, hdr)
                            html2 = urlopen(req).read().decode('utf8')
                            raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
                            Text_file = open(newpathGOOG+r"\\"+str(GOOGCount)+".txt", "w")
                            Text_file.write(raw)
                            Text_file.close()
                            GOOG.loc[y,'News'] = GOOGCount
                            GOOGCount+=1
                            GOOG=GOOG.fillna("")
                            GOOG.to_csv(newpathGOOG+r'\Key.csv')
                        except:
                            print m
                            print "GOOG"
                    else:
                        Text_file = open(newpathGOOG+r"\\"+str(GOOGCount)+".txt", "w")
                        Text_file.write(text)
                        Text_file.close()
                        GOOG.loc[y,'News'] =GOOGCount
                        GOOGCount+=1
                        GOOG=GOOG.fillna("")
                        GOOG.to_csv(newpathGOOG+r'\Key.csv')
                    print "GOOG"

            #JPM inner most loop
            d6=feedparser.parse(url+Symbols[5])

            for x in xrange(len(d6['entries'])):
                if int(d6.entries[x]['id'][14:]) not in DaysIDsJPM:
                    DaysIDsJPM.append(int(d6.entries[x]['id'][14:]))
                    y = len(JPM.index.tolist())
                    m=re.search(r'\*(.*)',d6.entries[x]['link'])
                    z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
                    if type(z) is not None:
                        m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
                    JPM.loc[y,'Title'] =d6.entries[x]['title'].encode('utf8')
                    JPM.loc[y,'link'] =m.encode('utf8')
                    JPM.loc[y,'Published'] =d6.entries[x]['published'].encode('utf8')
                    JPM.loc[y,'ID'] =int(d6.entries[x]['id'][14:])
                    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
                    page = get(m,headers=hdr)
                    extractor = Goose()
                    article = extractor.extract(raw_html=page.text)
                    text = article.cleaned_text.encode('utf8')
                    if text == '':
                        try:
                            url2 = m
                            req = urllib2.Request(url2, None, hdr)
                            html2 = urlopen(req).read().decode('utf8')
                            raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
                            Text_file = open(newpathJPM+r"\\"+str(JPMCount)+".txt", "w")
                            Text_file.write(raw)
                            Text_file.close()
                            JPM.loc[y,'News'] = JPMCount
                            JPMCount+=1
                            JPM=JPM.fillna("")
                            JPM.to_csv(newpathJPM+r'\Key.csv')
                        except:
                            print m
                            print "JPM"
                    else:
                        Text_file = open(newpathJPM+r"\\"+str(JPMCount)+".txt", "w")
                        Text_file.write(text)
                        Text_file.close()
                        JPM.loc[y,'News'] =JPMCount
                        JPMCount+=1
                        JPM=JPM.fillna("")
                        JPM.to_csv(newpathJPM+r'\Key.csv')
                    print "JPM"


            #PG inner most loop
            d7=feedparser.parse(url+Symbols[6])

            for x in xrange(len(d7['entries'])):
                if int(d7.entries[x]['id'][14:]) not in DaysIDsPG:
                    DaysIDsPG.append(int(d7.entries[x]['id'][14:]))
                    y = len(PG.index.tolist())
                    m=re.search(r'\*(.*)',d7.entries[x]['link'])
                    z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
                    if type(z) is not None:
                        m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
                    PG.loc[y,'Title'] =d7.entries[x]['title'].encode('utf8')
                    PG.loc[y,'link'] =m.encode('utf8')
                    PG.loc[y,'Published'] =d7.entries[x]['published'].encode('utf8')
                    PG.loc[y,'ID'] =int(d7.entries[x]['id'][14:])
                    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
                    page = get(m,headers=hdr)
                    extractor = Goose()
                    article = extractor.extract(raw_html=page.text)
                    text = article.cleaned_text.encode('utf8')
                    if text == "":
                        try:
                            url2 = m
                            req = urllib2.Request(url2, None, hdr)
                            html2 = urlopen(req).read().decode('utf8')
                            raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
                            Text_file = open(newpathPG+r"\\"+str(PGCount)+".txt", "w")
                            Text_file.write(raw)
                            Text_file.close()
                            PG.loc[y,'News'] = PGCount
                            PGCount+=1
                            PG=PG.fillna("")
                            PG.to_csv(newpathPG+r'\Key.csv')
                        except:
                            print m
                            print "PG"
                    else:
                        Text_file = open(newpathPG+r"\\"+str(PGCount)+".txt", "w")
                        Text_file.write(text)
                        Text_file.close()
                        PG.loc[y,'News'] =PGCount
                        PGCount+=1
                        PG=PG.fillna("")
                        PG.to_csv(newpathPG+r'\Key.csv')
                    print "PG"


            #WMT inner most loop
            d8=feedparser.parse(url+Symbols[7])

            for x in xrange(len(d8['entries'])):
                if int(d8.entries[x]['id'][14:]) not in DaysIDsWMT:
                    DaysIDsWMT.append(int(d8.entries[x]['id'][14:]))
                    y = len(WMT.index.tolist())
                    m=re.search(r'\*(.*)',d8.entries[x]['link'])
                    z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
                    if type(z) is not None:
                        m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
                    WMT.loc[y,'Title'] =d8.entries[x]['title'].encode('utf8')
                    WMT.loc[y,'link'] =m.encode('utf8')
                    WMT.loc[y,'Published'] =d8.entries[x]['published'].encode('utf8')
                    WMT.loc[y,'ID'] =int(d8.entries[x]['id'][14:])
                    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
                    page = get(m,headers=hdr)
                    extractor = Goose()
                    article = extractor.extract(raw_html=page.text)
                    text = article.cleaned_text.encode('utf8')
                    if text == "":
                        try:
                            url2 = m
                            req = urllib2.Request(url2, None, hdr)
                            html2 = urlopen(req).read().decode('utf8')
                            raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
                            Text_file = open(newpathWMT+r"\\"+str(WMTCount)+".txt", "w")
                            Text_file.write(raw)
                            Text_file.close()
                            WMT.loc[y,'News'] = WMTCount
                            WMTCount+=1
                            WMT=WMT.fillna("")
                            WMT.to_csv(newpathWMT+r'\Key.csv')
                        except:
                            print m
                            print "WMT"
                    else:
                        Text_file = open(newpathWMT+r"\\"+str(WMTCount)+".txt", "w")
                        Text_file.write(text)
                        Text_file.close()
                        WMT.loc[y,'News'] =WMTCount
                        WMTCount+=1
                        WMT=WMT.fillna("")
                        WMT.to_csv(newpathWMT+r'\Key.csv')
                    print "WMT"       
            count+=1
            print count
            time.sleep(1)
        except:
            print "Error"
    AAPL=AAPL.fillna("")
    AAPL.to_csv(newpathAAPL+r'\Key.csv')
    T=T.fillna("")
    T.to_csv(newpathT+r'\Key.csv')
    BA=BA.fillna("")
    BA.to_csv(newpathBA+r'\Key.csv')
    XOM=XOM.fillna("")
    XOM.to_csv(newpathXOM+r'\Key.csv')
    GOOG=GOOG.fillna("")
    GOOG.to_csv(newpathGOOG+r'\Key.csv')
    JPM=JPM.fillna("")
    JPM.to_csv(newpathJPM+r'\Key.csv')
    PG=PG.fillna("")
    PG.to_csv(newpathPG+r'\Key.csv')
    WMT=WMT.fillna("")
    WMT.to_csv(newpathWMT+r'\Key.csv')

Answer 1

程序中的

在收集过多的Feed或者系统上有其他活动进程时会消耗太多RAM（这就是冻结时间不同的原因），请参阅Why does a simple python script crash my system

程序运行的过程在进程内存中存储用于计算的数组和变量，这是ram

您可以通过强制程序使用硬盘内存来解决此问题。

对于变通方法（shelve，定期将收集的Feed保存到文本文件（将信息从ram移动到rom并释放ram），...）请参阅以下链接

memory usage, how to free memory

Python large variable RAM usage

I need to free up RAM by storing a Python dictionary on the hard drive, not in RAM. Is it possible?

运行几天后，python程序冻结

1 个答案: