This is a super weird bug I've run into. I have a Python script that generates a JSON object, which I want to return with JsonResponse. The response comes back successfully, but sometimes it contains multiple copies of parts of the object and sometimes just one; the number of copies it sends back just bounces all over the place. I have no idea how this is even possible.
It's a web crawler that sends back JSON for graph nodes and edges, and the duplicate nodes/edges seem to come back at random. The script that does the crawl and generates the JSON works correctly every time on its own; the problem only occurs when it is used to serve a Django request.
My views.py has a function like this that wraps the web crawler script:
def webcrawler(request):
    source = request.GET.get('source')
    method = request.GET.get('method')
    nodes = request.GET.get('nodes')
    depth = request.GET.get('depth')
    keyword = request.GET.get('keyword')
    webcrawler = WebCrawler(source, method, nodes, depth, keyword)
    data = webcrawler.jsonSerialize()
    return JsonResponse(data, safe=False)
My jsonSerialize() function looks like this:
def jsonSerialize(self):
    for n in self.graph.nodes:
        n.sourceNodes = []
    self.graph.edges = list(self.graph.edges)
    return json.dumps(self.graph, default=lambda o: o.__dict__)
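(For context, json.dumps only falls back to the default callable for objects it can't encode natively, so the whole graph gets reduced to nested __dict__s. Here is a minimal standalone sketch of just that behavior, using simplified stand-in classes rather than my real Graph/Node:)

import json

# Simplified stand-ins for illustration only, not the real Graph/Node classes.
class Edge(object):
    def __init__(self, source, target):
        self.source = source
        self.target = target

class Graph(object):
    def __init__(self):
        self.nodes = []
        self.edges = []

g = Graph()
g.edges.append(Edge(0, 1))

# json.dumps calls `default` for any object it can't serialize natively,
# so Graph and Edge are both replaced by their __dict__.
print(json.dumps(g, default=lambda o: o.__dict__))
# e.g. {"nodes": [], "edges": [{"source": 0, "target": 1}]}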
Why am I getting a random number of duplicates back from this response? I make a new request each time and a new object is created each time, yet the number of objects sent back keeps changing, sometimes growing. The same thing happens if I open a new tab and make the request from there. What could be causing this?
For example, if I keep making the same request, webcrawler?source=http://www.google.com&method=BFS&nodes=5&depth=0&keyword=google
sometimes it looks like this:
{
"nodes": [
{
"keyword": false,
"url": "http://www.google.com",
"sourceNodes": [],
"title": "Google",
"index": 0
},
{
"keyword": false,
"url": "http://www.google.com/imghp?hl=en&tab=wi",
"sourceNodes": [],
"title": "Google Images",
"index": 4
},
{
"keyword": false,
"url": "http://www.youtube.com/?tab=w1",
"sourceNodes": [],
"title": "YouTube",
"index": 3
},
{
"keyword": false,
"url": "http://news.google.com/nwshp?hl=en&tab=wn",
"sourceNodes": [],
"title": "Google News",
"index": 2
},
{
"keyword": false,
"url": "http://maps.google.com/maps?hl=en&tab=wl",
"sourceNodes": [],
"title": "Google Maps",
"index": 1
}
],
"edges": [
{
"source": 0,
"target": 1
},
{
"source": 0,
"target": 3
},
{
"source": 0,
"target": 2
},
{
"source": 0,
"target": 4
}
]
}
...and sometimes it looks like this, with duplicate nodes and edges:
{
"nodes": [
{
"keyword": false,
"url": "https://mail.google.com/mail/?tab=wm",
"sourceNodes": [],
"title": "Gmail",
"index": 1
},
{
"keyword": false,
"url": "https://www.google.com/intl/en/options/",
"sourceNodes": [],
"title": "\n Our Products | Google\n ",
"index": 7
},
{
"keyword": false,
"url": "http://www.google.com/imghp?hl=en&tab=wi",
"sourceNodes": [],
"title": "Google Images",
"index": 6
},
{
"keyword": false,
"url": "https://drive.google.com/?tab=wo",
"sourceNodes": [],
"title": "Meet Google Drive \u2013 One place for all your files",
"index": 2
},
{
"keyword": false,
"url": "http://news.google.com/nwshp?hl=en&tab=wn",
"sourceNodes": [],
"title": "Google News",
"index": 8
},
{
"keyword": false,
"url": "http://maps.google.com/maps?hl=en&tab=wl",
"sourceNodes": [],
"title": "Google Maps",
"index": 3
},
{
"keyword": true,
"url": "https://play.google.com/?hl=en&tab=w8",
"sourceNodes": [],
"title": "Google Play",
"index": 9
},
{
"keyword": false,
"url": "https://accounts.google.com/ServiceLogin?hl=en&passive=true&continue=http://www.google.com/",
"sourceNodes": [],
"title": "Sign in - Google Accounts",
"index": 4
},
{
"keyword": false,
"url": "http://www.google.com",
"sourceNodes": [],
"title": "Google",
"index": 0
},
{
"keyword": false,
"url": "http://www.google.com/history/optout?hl=en",
"sourceNodes": [],
"title": " Google - Search Customization ",
"index": 5
},
{
"keyword": false,
"url": "http://www.youtube.com/?tab=w1",
"sourceNodes": [],
"title": "YouTube",
"index": 3
}
],
"edges": [
{
"source": 0,
"target": 1
},
{
"source": 0,
"target": 7
},
{
"source": 0,
"target": 6
},
{
"source": 0,
"target": 5
},
{
"source": 0,
"target": 4
},
{
"source": 0,
"target": 9
},
{
"source": 0,
"target": 3
},
{
"source": 0,
"target": 8
},
{
"source": 0,
"target": 2
}
]
}
If I run the Python script locally several times with the same arguments, I don't see this behavior. That makes me think it has something to do with the request / Django / JsonResponse, since the script itself seems to run fine.
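(For reference, this is roughly how I exercise it locally -- a hypothetical driver script with the module name guessed; each run is a fresh interpreter process:)

# run_crawler.py -- hypothetical local test driver (module name assumed)
from WebCrawler import WebCrawler

crawler = WebCrawler("http://www.google.com", "BFS", 5, 0, "google")
print(crawler.jsonSerialize())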
Here is the code for my web crawler:
import urllib
# from urllib.request import urlopen
from urlparse import urlparse
from bs4 import BeautifulSoup
import requests
import collections
from Graph import Graph
from Node import Node
import sys
from time import gmtime, strftime
from timeout import timeout
from multiprocessing import Pool
from multiprocessing import Process
import json
import re
import pdb
class WebCrawler:
    def __init__(self, originUrl, method, totalNodes, depthLimit=None, keyword=None):
        self.originUrl = originUrl
        self.method = method
        self.totalNodes = int(totalNodes)
        self.nodeCount = 0
        self.depthLimit = int(depthLimit)
        self.currentDepth = 0
        self.keyword = keyword
        self.keywordUrls = []
        self.nodeUrlMap = {}
        self.nodesToVisit = []
        self.visitedUrls = set()
        self.graph = Graph()
        self.nodeIndex = 0
        self.storeCookie()
        originTitle = self.getTitle(originUrl)
        startNode = Node(originUrl, None, originTitle)
        self.crawl(startNode)
    def crawl(self, node):
        print("crawl(): " + strftime("%H:%M:%S", gmtime()))
        visited = node.url in self.visitedUrls
        if not visited:
            self.graph.addNode(node, self.nodeIndex)
            self.nodeIndex += 1
            self.nodeCount += 1
            self.visitedUrls.add(node.url)
        if node.sourceNodes:  # If this is not the starting node
            sourceNode = node.sourceNodes.pop()
            if sourceNode.index is not None and node.index is not None:
                self.graph.addEdge(sourceNode.index, node.index)  # Add an edge between sourceNode and node
        if not visited:
            soup = self.generateSoup(node.url)
            hasKeyword = self.checkForKeyword(soup, node.url)
            if hasKeyword:
                node.keyword = True
            links = self.findLinks(soup)
            links = self.validLinks(links)
            links = {l for l in links}  # Remove duplicate links
            if links:
                if self.method == "BFS":
                    self.bfs(node, links)
                else:  # DFS
                    self.currentDepth += 1
                    if self.currentDepth >= self.depthLimit:  # If depth limit reached, getNextNode (up a level)
                        self.currentDepth = 0  # Reset currentDepth
                        self.getNextNode()
                    else:  # Otherwise, keep going deeper
                        self.dfs(node, links)
            else:  # No links present
                self.getNextNode()
        else:  # Avoid infinite loop
            self.getNextNode()
    def validLinks(self, links):
        print("validLinks(): " + strftime("%H:%M:%S", gmtime()))
        validLinks = []
        for link in links:
            # Only add links while there is still room
            if self.nodeCount + len(validLinks) <= self.totalNodes:
                if self.isValidUrl(link):
                    validLinks.append(link)
        return validLinks

    def isValidUrl(self, url):
        print("isValidUrl(): " + strftime("%H:%M:%S", gmtime()))
        extensionBlacklist = ["zip", "dmg", "msi", "tar", "exe", "sisx"]
        for x in extensionBlacklist:
            if x in url:
                return False
        if "http" not in url: return False
        parsed_url = urlparse(url)
        if not bool(parsed_url.scheme): return False
        try:
            self.testRequest(url)
        except:
            return False
        return True

    @timeout(1)
    def testRequest(self, url):
        requests.head(url)
    def getNextNode(self):
        print("getNextNode(): " + strftime("%H:%M:%S", gmtime()))
        if len(self.nodesToVisit) is not 0 and not self.nodeLimitReached():
            # We use the same data structure to store urlsToVisit for BFS and DFS,
            # and pop elements off the same way. How the elements are added is
            # what's important.
            nextNode = self.nodesToVisit.pop()
            self.crawl(nextNode)
        else:  # Crawl is over
            return

    def printGraph(self):
        for node in self.graph.nodes:
            print("\nNode:")
            if node.title:
                print("Index: " + str(node.index))
                print("Title: " + node.title)
                print("URL: " + node.url)
                print("Keyword: " + str(node.keyword))
        if self.graph.edges:
            print("\nEdges:")
            edgeCount = 0
            for e in self.graph.edges:
                print("Source: " + str(e.source) + " Target: " + str(e.target))
        if self.keywordUrls:
            print("\nKeyword URLs:")
            for k in self.keywordUrls:
                print("URL: " + k)
        print("\nJSON:")
        print(self.jsonSerialize())

    def jsonSerialize(self):
        for n in self.graph.nodes:
            n.sourceNodes = []
        self.graph.edges = list(self.graph.edges)
        self.graph.nodes = list(self.graph.nodes)
        return json.dumps(self.graph, default=lambda o: o.__dict__)

    def storeCookie(self):
        # Store graph as cookie (do this one)
        pass

    def nodeLimitReached(self):
        return self.nodeCount >= self.totalNodes
    # Convert URL into soup
    def generateSoup(self, url):
        print("generateSoup(): " + strftime("%H:%M:%S", gmtime()))
        sourceCode = requests.get(url)
        plainText = sourceCode.text
        soup = BeautifulSoup(plainText, "html.parser")
        return soup

    # Parse soup to find links
    def findLinks(self, soup):
        print("findLinks(): " + strftime("%H:%M:%S", gmtime()))
        links = soup.findAll('a')
        hrefs = []
        for link in links:
            href = link.get('href', '')
            hrefs.append(href)
        return hrefs

    def getTitle(self, url):
        print("getTitle(): " + strftime("%H:%M:%S", gmtime()))
        soup = self.generateSoup(url)
        title = soup.title  # same as soup.find("title")
        if title is not None:
            return title.get_text()

    def bfs(self, currentNode, links):
        print("bfs(): " + strftime("%H:%M:%S", gmtime()))
        for link in links:
            # If url is not already visited, and nodesToVisit+nodeCount hasn't exceeded totalNodes
            if link not in self.visitedUrls and self.nodeCount + len(self.nodesToVisit) <= self.totalNodes:
                title = self.getTitle(link)
                newNode = Node(link, [currentNode], title)
                newNode.sourceNodes.insert(0, currentNode)
                self.nodesToVisit.insert(0, newNode)
                self.nodeUrlMap[link] = newNode
            elif link in self.nodeUrlMap:  # Repeat URL, get existing node
                existingNode = self.nodeUrlMap[link]
                existingNode.sourceNodes.insert(0, currentNode)
                self.nodesToVisit.insert(0, existingNode)
        self.getNextNode()

    def dfs(self, currentNode, links):
        print("dfs(): " + strftime("%H:%M:%S", gmtime()))
        for link in links:
            if link not in self.visitedUrls:
                title = self.getTitle(link)
                newNode = Node(link, [currentNode], title)
                newNode.sourceNodes.append(currentNode)
                self.nodesToVisit.append(newNode)
            elif link in self.nodeUrlMap:  # Repeat URL, get existing node
                existingNode = self.nodeUrlMap[link]
                existingNode.sourceNodes.append(currentNode)
                self.nodesToVisit.append(existingNode)
        self.getNextNode()

    def checkForKeyword(self, soup, url):
        if self.keyword != "":
            # If keyword found in soup, append url to keywordUrls
            if soup.body and soup.body.find_all(string=re.compile('.*{0}.*'.format(self.keyword)), recursive=True):
                self.keywordUrls.append(url)
                return True
Answer (score: 0)
My guess is that WebCrawler is initializing its graph attribute incorrectly. Instead of:
class WebCrawler:
    graph = Something()  # graph is a static class variable!!
try:
class WebCrawler:
    def __init__(self, ...):
        self.graph = Something()  # graph is an instance variable
(If that's not it, check how graph.edges / graph.nodes are initialized.)
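The crawler shown in the question does assign self.graph = Graph() in __init__, so if the duplicates come from shared state, the more likely culprit is inside the Graph class itself, which isn't shown. Here is a minimal sketch of the failure mode being described, using a hypothetical Graph with mutable class-level containers; anything defined at class level is created once when the module is imported and is shared by every instance:

# Hypothetical Graph with class-level containers -- NOT the real Graph from
# the question; this only illustrates the failure mode.
class BadGraph:
    nodes = []   # created once, when the class is defined...
    edges = []   # ...and shared by every BadGraph instance

    def addNode(self, node):
        self.nodes.append(node)          # mutates the shared class-level list

    def addEdge(self, source, target):
        self.edges.append((source, target))

# First "request" builds what looks like a fresh graph:
g1 = BadGraph()
g1.addNode("http://www.google.com")

# Second "request" handled by the same long-lived worker process:
g2 = BadGraph()
print(g2.nodes)   # ['http://www.google.com'] -- the old node is already there

# The fix is to make the containers per-instance:
class GoodGraph:
    def __init__(self):
        self.nodes = []   # one list per GoodGraph instance
        self.edges = []

Django's worker processes stay alive between requests, so anything accumulated at class (or module) level is never reset. With several workers, which request lands on which worker is essentially arbitrary, which would also explain why the number of duplicates seems to jump around rather than grow steadily.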