我使用 pickle 通过转储 root 来保存对象图。当我加载根对象时,它具有所有实例变量以及相连的对象节点。但我把所有节点都保存在一个字典类型的类变量中。在保存之前,这个类变量是有内容的,但在反序列化(unpickle)数据之后,它变成了空的。
这是我正在使用的类:
class Page():
    """A crawled web page plus a shared url -> Page cache of everything crawled.

    NOTE(review): `__crawled` is a class attribute. pickle only serializes
    instance data (`self.__dict__`), never class attributes, which is why the
    cache is empty after unpickling.
    """

    # Class-level cache shared by ALL Page instances (not saved by pickle).
    __crawled = {}

    def __init__(self, title='', link='', relatedURLs=None):
        # Default to None instead of []: a mutable default list would be
        # shared by every Page created without an explicit relatedURLs.
        self.__title = title
        self.__link = link
        self.__relatedURLs = relatedURLs if relatedURLs is not None else []
        self.__related = []

    @property
    def relatedURLs(self):
        return self.__relatedURLs

    @property
    def title(self):
        return self.__title

    @property
    def related(self):
        # Page objects discovered by crawl()/crawlRelated().
        return self.__related

    @property
    def crawled(self):
        return self.__crawled

    def crawl(self, url):
        """Fetch url (unless already cached) and append the resulting Page to related."""
        if url not in self.__crawled:
            webpage = urlopen(url).read()
            patFinderTitle = re.compile('<title>(.*)</title>')
            patFinderLink = re.compile('<link rel="canonical" href="([^"]*)" />')
            patFinderRelated = re.compile('<li><a href="([^"]*)"')
            findPatTitle = re.findall(patFinderTitle, webpage)
            findPatLink = re.findall(patFinderLink, webpage)
            findPatRelated = re.findall(patFinderRelated, webpage)
            newPage = Page(findPatTitle, findPatLink, findPatRelated)
            self.__related.append(newPage)
            self.__crawled[url] = newPage
        else:
            # Already cached: link to the existing Page instead of refetching.
            self.__related.append(self.__crawled[url])

    def crawlRelated(self):
        """Crawl every URL in relatedURLs."""
        for link in self.__relatedURLs:
            self.crawl(link)
我这样保存:
# Pickle output is binary data: the file must be opened in 'wb', not 'w'.
with open('medTwiceGraph.dat','wb') as outf:
    pickle.dump(root,outf)
我加载它就像这样:
def loadGraph(filename):  # returns root
    """Unpickle and return the root object stored in `filename`.

    Pickle files are binary, so the file must be opened in 'rb' (text mode
    'r' corrupts the stream, especially on Windows / Python 3).
    """
    with open(filename, 'rb') as inf:
        return pickle.load(inf)
# NOTE(review): loads the pickled graph at import time; fails if the file is missing.
root = loadGraph('medTwiceGraph.dat')
除了类变量 __crawled 之外,所有数据都能正常加载。
我做错了什么?
答案 0 :(得分:5)
Python 并不会真正序列化(pickle)类对象,它只保存类的名字以及在哪里能找到它们。来自 pickle
的文档:
同样,类是通过命名引用来序列化的,因此同样的限制也适用于反序列化环境。请注意,类的代码和数据都不会被序列化,所以在下面的示例中,类属性在反序列化环境中不会被恢复
attr
:class Foo: attr = 'a class attr' picklestring = pickle.dumps(Foo)
这些限制正是可序列化的函数和类必须在模块顶层定义的原因。
类似地,当类的实例被序列化时,其类的代码和数据并不会随之一起保存,只有实例数据会被序列化。这是有意为之的,这样你就可以在类中修复错误或添加新方法,并仍然能够加载用该类早期版本创建的对象。如果你打算让对象长期存在并经历类的多个版本,那么在对象中存放一个版本号可能是值得的,以便通过类的
__setstate__()
方法进行适当的转换。
在您的示例中,您可以通过将 __crawled
更改为实例属性或全局变量来解决此问题。
答案 1 :(得分:3)
默认情况下,pickle 只会序列化 self.__dict__
的内容,而不会序列化你以为会被保存的 self.__class__.__dict__
。
我之所以说"你以为会被保存",是因为反序列化一个实例本来就不应该改变类的状态。
如果您想更改此行为,请查看__getstate__
和__setstate__
in the docs
答案 2 :(得分:1)
对于任何感兴趣的人:我所做的是创建一个超类 Graph,它包含实例变量 __crawled,并把爬取函数移到 Graph 中。Page 现在只包含描述页面及其相关页面的属性。我对 Graph 的实例进行 pickle 序列化,它包含了我所有的 Page 实例。以下是我的代码。
from urllib import urlopen
#from bs4 import BeautifulSoup
import re
import pickle
###################CLASS GRAPH####################
class Graph(object):
def __init__(self,roots = [],crawled = {}):
self.__roots = roots
self.__crawled = crawled
@property
def roots(self):
return self.__roots
@property
def crawled(self):
return self.__crawled
def crawl(self,page,url):
if url not in self.__crawled:
webpage = urlopen(url).read()
patFinderTitle = re.compile('<title>(.*)</title>')
patFinderLink = re.compile('<link rel="canonical" href="([^"]*)" />')
patFinderRelated = re.compile('<li><a href="([^"]*)"')
findPatTitle = re.findall(patFinderTitle, webpage)
findPatLink = re.findall(patFinderLink, webpage)
findPatRelated = re.findall(patFinderRelated, webpage)
newPage = Page(findPatTitle,findPatLink,findPatRelated)
page.related.append(newPage)
self.__crawled[url] = newPage
else:
page.related.append(self.__crawled[url])
def crawlRelated(self,page):
for link in page.relatedURLs:
self.crawl(page,link)
def crawlAll(self,obj,limit = 2,i = 0):
print 'number of crawled pages:', len(self.crawled)
i += 1
if i > limit:
return
else:
for rel in obj.related:
print 'crawling', rel.title
self.crawlRelated(rel)
for rel2 in obj.related:
self.crawlAll(rel2,limit,i)
def loadGraph(self,filename):
with open(filename,'r') as inf:
return pickle.load(inf)
def saveGraph(self,obj,filename):
with open(filename,'w') as outf:
pickle.dump(obj,outf)
###################CLASS PAGE#####################
class Page(Graph):
    """A single crawled page: title, canonical link, and related URLs/pages.

    NOTE(review): does not call Graph.__init__, so Page instances never get
    their own roots/crawled state -- crawling is driven by a Graph instance.
    """

    def __init__(self, title='', link='', relatedURLs=None):
        # None sentinel instead of a mutable default list, which would be
        # shared across every Page created without explicit relatedURLs.
        self.__title = title
        self.__link = link
        self.__relatedURLs = relatedURLs if relatedURLs is not None else []
        self.__related = []

    @property
    def relatedURLs(self):
        return self.__relatedURLs

    @property
    def title(self):
        return self.__title

    @property
    def related(self):
        # Page objects discovered while crawling self.relatedURLs.
        return self.__related
####################### MAIN ######################
def main(seed):
    # Fetch the seed page, build a Graph rooted at it, crawl two levels of
    # related pages, then pickle the whole Graph (Pages included) to disk.
    print 'doing some work...'
    webpage = urlopen(seed).read()
    # NOTE(review): same regex patterns as Graph.crawl; assumes fixed HTML layout.
    patFinderTitle = re.compile('<title>(.*)</title>')
    patFinderLink = re.compile('<link rel="canonical" href="([^"]*)" />')
    patFinderRelated = re.compile('<li><a href="([^"]*)"')
    findPatTitle = re.findall(patFinderTitle, webpage)
    findPatLink = re.findall(patFinderLink, webpage)
    findPatRelated = re.findall(patFinderRelated, webpage)
    print 'found the webpage', findPatTitle
    #root = Page(findPatTitle,findPatLink,findPatRelated)
    G = Graph([Page(findPatTitle,findPatLink,findPatRelated)])
    print 'crawling related...'
    G.crawlRelated(G.roots[0])
    G.crawlAll(G.roots[0])
    print 'now saving...'
    G.saveGraph(G, 'medTwiceGraph.dat')
    print 'done'
    return G
#####################END MAIN######################
#'http://medtwice.com/am-i-pregnant/'
#'medTwiceGraph.dat'
#G = main('http://medtwice.com/menopause-overview/')
#print G.crawled
def loadGraph(filename):
    """Unpickle and return the Graph stored in `filename`.

    Pickle files are binary; opening in text mode 'r' corrupts the stream,
    so 'rb' is required.
    """
    with open(filename, 'rb') as inf:
        return pickle.load(inf)
G = loadGraph('MedTwiceGraph.dat')
print G.roots[0].title
print G.roots[0].related
print G.crawled
for key in G.crawled:
print G.crawled[key].title
答案 3 :(得分:-2)
使用dill
可以解决此问题
dill
包裹:https://pypi.python.org/pypi/dill
参考:https://stackoverflow.com/a/28543378/6301132
基于提问者的代码,修改如下:
# NOTE: pickle/dill streams are binary -- always open the file in 'wb'/'rb'.
#save
with open('medTwiceGraph.dat','wb') as outf:
    dill.dump(root,outf)
#load
def loadGraph(filename): #returns root
    # dill extends pickle, so the same binary-mode rule applies here.
    with open(filename,'rb') as inf:
        return dill.load(inf)
root = loadGraph('medTwiceGraph.dat')
我写了另一个例子:
#Another example (with Python 3.x)
import dill
import os
class Employee:
    """A named employee with a dict of contact details."""

    def __init__(self, name='', contact=None):
        # None sentinel: a mutable default dict ({}) would be shared by
        # every Employee created without an explicit contact argument.
        self.name = name
        self.contact = contact if contact is not None else {}

    def print_self(self):
        """Print this employee's name and contact dict to stdout."""
        print(self.name, self.contact)
#save
def save_employees():
    """Serialize the global `emp` list to employees.dat using dill."""
    global emp
    with open('employees.dat', 'wb') as out_file:
        dill.dump(emp, out_file)
#load
def load_employees():
    """Replace the global `emp` list with the contents of employees.dat, if present."""
    global emp
    # Guard clause: nothing to do when no saved file exists yet.
    if not os.path.exists('employees.dat'):
        return
    with open('employees.dat', 'rb') as src:
        emp = dill.load(src)
#---
# Demo driver: load any previously saved employees, print them, then append
# one more (sample data chosen by how many already exist) and save the list.
emp=[]
load_employees()
print('loaded:')
for tmpe in emp:
    tmpe.print_self()
e=Employee() #new employee
# Each run adds a different sample employee based on the current count.
if len(emp)==0:
    e.name='Jack'
    e.contact={'phone':'+086-12345678'}
elif len(emp)==1:
    e.name='Jane'
    e.contact={'phone':'+01-15555555','email':'a@b.com'}
else:
    e.name='sb.'
    e.contact={'telegram':'x'}
emp.append(e)
save_employees()