I am using python 2.7.8. I dont know what really happend. Everything was going well but suddenly this error appeared. I really dont understand what is this. Search a lot but fail to resolve.
the full Error is:
IOError: [Errno 2] The system cannot find the path specified: '\\settings\\ads\\preferences?hl=en'
here is my code:
#!/usr/bin/env python
import re
import requests
import urllib
from bs4 import BeautifulSoup
def addtoindex(self, url, soup):
if self.isindexed (url): return
print 'Indexing ' + url
# Get the individual words
text = self.getTtextonly(url)
#print 't',text
words = self.separatewords(text)
#print 'words',words
if stem: words = pracstem.stem(words)
# Get the URL id
urlid = self.getentryid('googleurllist', 'url', url)
#print 'id',urlid
# Link each word to this url
for i in range(len(words)):
word = words[i]
# print 'w',word
if word in ignorewords: continue
wordid = self.getentryid('googlewordlist', 'word', word)
#print 'wordid',wordid
self.con.execute("insert into googlewordlocation(urlid, wordid, location) values('{0}', '{1}', '{2}')" .format(urlid, wordid, i))
self.con.commit()
def getTtextonly(self, soup):
url = soup
#url = "http://www.cplusplus.com/doc/tutorial/program_structure/"
html = urllib.urlopen(url).read() # compiler pointing error here
soup = BeautifulSoup(html)
# kill all script and style elements
for script in soup(["script", "style","a","<div id=\"bottom\" >"]):
script.extract() # rip it out
text = soup.findAll(text=True)
return text
def findfromGoogle(self,a):
page = requests.get("https://www.google.com/search?q="+a)
soup = BeautifulSoup(page.content)
links = soup.findAll("a")
for link in links:
if link['href'].startswith('/url?q=') \
and 'webcache.googleusercontent.com' not in link['href']:
q = link['href'].split('/url?q=')[1].split('&')[0]
#self.con.execute("insert into wordlocation(urlid, wordid, location) values(%i, %i, %i)" %(urlid, wordid, i))
# self.con.execute("insert into googleurllist (keyword,url,relevance,textcomplexity)VALUES('{0}','{1}','{2}','{3}')" .format(a,q,'',''))
# linkText = self.gettextonly(q)
#self.con.commit()
print "Records created successfully";
print q
self.addtoindex(q,soup)
linkText = self.getTtextonly(q)
Error:
File "C:\Users\DELL\Desktop\python\s\fyp\Relevancy\M\pyThinSearch\test.py", in getTtextonly
html = urllib.urlopen(url).read()
File "C:\Python27\lib\urllib.py", line 87, in urlopen
return opener.open(url)
File "C:\Python27\lib\urllib.py", line 208, in open
return getattr(self, name)(url)
File "C:\Python27\lib\urllib.py", line 463, in open_file
return self.open_local_file(url)
File "C:\Python27\lib\urllib.py", line 477, in open_local_file
raise IOError(e.errno, e.strerror, e.filename)
IOError: [Errno 2] The system cannot find the path specified: '\\settings\\ads\\preferences?hl=en'
I am getting nervous and i really dont understand what error is really asking for....
答案 0 :(得分:1)
q = link['href'].split('/url?q=')[1].split('&')[0]
q
can be a relative URL.
If you see advertisements in https://www.google.com/search?q=apple, there is an a
element whose href
attribute starts with '/url?q=/settings/ads/preferences'.
According to the documentation of urllib.urlopen
,
If the URL does not have a scheme identifier, or if it has file: as its scheme identifier, this opens a local file (without universal newlines); otherwise it opens a socket to a server somewhere on the network.
You should use urlparse.urljoin
to make a URL absolute before passing it to urllib.urlopen
.