您好,我正在尝试使用 Google App Engine 创建一个学术项目。我希望通过 Google API 实现的主要功能是:搜索用户输入的查询并返回结果,以便我对结果做进一步分析。我首先尝试了 xgoogle,但它在 Google App Engine 上会出现导入错误(同样的代码在本地可以正常运行)。即使 xgoogle 能正常工作,我也不知道如何获取不同网站的内容。我想知道是否有办法将 Google 搜索结果以文件的形式返回。谢谢。
代码中有一部分使用正则表达式来查找 HTML 代码中包含的内容,但它不适用于 HTML 布局不同的网站。谢谢。
#!/usr/bin/env python
from html import HTML_PAGE
import webapp2
import jinja2
import os
import re
import sys
from os import walk
from google.appengine.api import search
from google.appengine.ext import ndb
from urllib import urlopen
from cgi import parse_qs
from xgoogle.search import GoogleSearch, SearchError
# Single shared HTML_PAGE instance; the request handlers below write the
# return value of its pageChange() method to the HTTP response.
page = HTML_PAGE()
class MainPage(webapp2.RequestHandler):
    """Request handler that serves the page produced by ``page.pageChange()``."""

    def get(self):
        """Write ``page.pageChange()`` (called with no arguments) to the response."""
        landingHtml = page.pageChange()
        self.response.out.write(landingHtml)
class SearchFile(object):
    """Find the first line containing a given text in each file of a folder.

    Only files located directly inside the data folder are considered;
    sub-directories are not descended into.
    """

    def __init__(self, userInput='', dataDir='dataFolder'):
        """Collect the files to be searched.

        Args:
            userInput: text to look for (plain, case-sensitive substring).
            dataDir: folder whose top-level files are searched. Defaults to
                "dataFolder", relative to the current working directory.
        """
        self.__input = userInput
        self.__result = {}
        self.__files = []
        self.__filenames = []
        # walk() yields the top directory first; stopping after that first
        # tuple restricts the search to files directly inside dataDir.
        for dirpath, _dirnames, filenames in walk(dataDir):
            for name in filenames:
                self.__files.append(os.path.join(dirpath, name))
                self.__filenames.append(name)
            break

    def outPutData(self):
        """Return ``{file name: first line containing the search text}``.

        Files without a matching line are left out. The returned line keeps
        its trailing newline, exactly as read from the file.
        """
        for name, path in zip(self.__filenames, self.__files):
            # The with-block closes the file; no explicit close() is needed.
            with open(path) as f:
                for line in f:
                    if self.__input in line:
                        self.__result[name] = line
                        break  # only the first matching line is kept
        return self.__result
class SearchFileHandle(webapp2.RequestHandler):
    """Handler for the search form POST (routed at '/searchFile').

    The response is built from four parts written in order: the page
    returned by ``page.pageChange(userInput)``, matches from the local data
    files, matching news headlines, and web search results.
    """

    def post(self):
        """Read the 'input' request field and write all result sections."""
        # NOTE(review): under Python 2, str() raises UnicodeEncodeError for
        # non-ASCII input -- confirm how such queries should be handled.
        userInput = str(self.request.get('input'))
        self.response.out.write(page.pageChange(userInput))
        self._writeFileMatches(userInput)
        self._writeNews(userInput)
        self._writeWebResults(userInput)

    def _writeFileMatches(self, userInput):
        """Write a link and a highlighted excerpt for every matching data file."""
        out = self.response.out
        cropper = CropText()
        # Whole-word keywords of the query. Exact membership here replaces
        # the previous substring test, which dropped excerpt words that were
        # merely a substring of the query (e.g. "a" for the query "apple").
        keywords = set(userInput.split(" "))
        fileSearch = SearchFile(userInput)
        for fileName, line in fileSearch.outPutData().items():
            fileBold = "<b>%s</b><br>" % (fileName)
            out.write("<a href = \"dataFolder/%s\" name =\"%s\"> %s </a>"
                      % (fileName, fileName, fileBold))
            excerpt = cropper.tram(line, userInput)
            for word in excerpt.split(" "):
                if word in keywords:
                    out.write(" <b>%s</b> " % (word))
                else:
                    out.write(" %s " % (word))
            out.write("<br><br><br>")

    def _writeNews(self, userInput):
        """Write one link per headline returned by TakeNews.websiteRead()."""
        out = self.response.out
        for title, url in TakeNews().websiteRead(userInput).items():
            titleBold = "<b>%s</b><br>" % (title)
            # The href value is quoted so URLs containing spaces or '>' do
            # not break the tag.
            out.write("<a href = \"%s\"> %s </a>" % (url, titleBold))
            out.write("<br><br><br>")

    def _writeWebResults(self, userInput):
        """Write every result returned by WebSearch.returnResult()."""
        out = self.response.out
        for res in WebSearch(userInput).returnResult():
            out.write(res)
            out.write("<br><br><br>")
class CropText(object):
    """Cut a one-sentence excerpt around a search word out of a text."""

    def tram(self, text, word):
        """Return the excerpt of ``text`` surrounding ``word``.

        The excerpt starts at a space, contains ``word`` (matched literally
        and case-insensitively, with a space on both sides) and ends at the
        next period. Because '.' in the pattern does not match a newline,
        the excerpt never spans lines.

        Args:
            text: text to search in.
            word: search word; regex metacharacters in it are escaped.

        Returns:
            The matched excerpt, or '' when there is no match.
        """
        pattern = r"( .*? )" + re.escape(word) + r"( .*?\.)"
        # Search once and reuse the match object instead of running the
        # same search a second time to extract the text.
        match = re.search(pattern, text, re.IGNORECASE)
        if match is None:
            return ''
        return match.group()
class TakeNews(object):
    """Fetch a web page and extract headline links matching a search text.

    Headlines are recognised with regular expressions that look for
    ``<a class="icon-article-headline" ...>`` anchors containing a
    ``<span class='headline'>`` element, so pages with different markup
    yield no results.
    """

    # Compiled once at class creation instead of on every websiteRead() call.
    # NOTE(review): '.*' is greedy, so when several headline anchors share
    # one line only the last one on that line is captured.
    _TITLE_RE = re.compile('<a class=\"icon-article-headline\".*<span class=\'headline\'>(.*)</span>')
    _LINK_RE = re.compile('<a class=\"icon-article-headline\" data-id=.* data-type=.* href=\"(.*)\"><span class=\'headline\'>')

    def __init__(self):
        """Start with http://www.bloomberg.com as the site to read."""
        self.__website = 'http://www.bloomberg.com'
        self.__topNews = ''
        self.__topNewsTitle = ''

    def setWebsite(self, website):
        """Set the site to read, completing the URL where needed.

        A value that already carries a scheme ("...://...") is stored
        unchanged. Otherwise "http://" and, if missing, the "www." host
        prefix are added, so ".example.com", "example.com" and
        "www.example.com" all become "http://www.example.com".
        """
        if '://' in website:
            pass  # already a full URL
        elif website.startswith('www.'):
            website = 'http://' + website
        elif website.startswith('.'):
            website = 'http://www' + website
        else:
            website = 'http://www.' + website
        self.__website = website

    def websiteRead(self, userInput):
        """Return ``{headline title: headline URL}`` for matching headlines.

        A headline matches when ``userInput`` occurs in its link (the href
        value), not in its title.

        Args:
            userInput: case-sensitive text to look for in each headline link.

        Returns:
            dict mapping headline title to its URL.
        """
        response = urlopen(self.__website)
        try:
            webpage = response.read()
        finally:
            response.close()  # release the connection even if read() fails

        self.__topNewsTitle = self._TITLE_RE.findall(webpage)
        self.__topNews = self._LINK_RE.findall(webpage)

        result = {}
        # The two patterns are matched independently and may find different
        # numbers of hits; zip() stops at the shorter list instead of
        # raising IndexError.
        for title, link in zip(self.__topNewsTitle, self.__topNews):
            if userInput not in link:
                continue
            if '://' in link:
                url = link  # already absolute
            else:
                # Avoid a doubled slash when the link is root-relative.
                url = self.__website.rstrip('/') + '/' + link.lstrip('/')
            result[title] = url
        return result
class WebSearch():
    """Thin wrapper that runs one query through xgoogle's GoogleSearch."""

    def __init__(self, word):
        """Remember ``word`` as the query to run."""
        self.__search = word

    def returnResult(self):
        """Run the stored query and return what ``get_results()`` yields.

        Errors raised by GoogleSearch are not caught here and propagate to
        the caller.
        """
        searcher = GoogleSearch(self.__search)
        # NOTE(review): confirm that 200 is a page size GoogleSearch accepts.
        searcher.results_per_page = 200
        hits = searcher.get_results()
        return hits
def main():
    """Script entry point: call run() on the module-level ``app``."""
    app.run()
# WSGI application object with the URL routing table: '/' is handled by
# MainPage and '/searchFile' by SearchFileHandle. Debug mode is enabled.
app = webapp2.WSGIApplication(
    [
        ('/', MainPage),
        ('/searchFile', SearchFileHandle),
    ],
    debug=True,
)

if __name__ == "__main__":
    main()
答案 0 :(得分:0)
将 bs4(BeautifulSoup 4)模块添加到应用的文件夹中。