我正在编写一个抓取网络数据的脚本。它会为每个页面的结果创建一个列表,然后把这些列表追加到一个总列表中。一切都运行得很好,直到我执行最后一步——把一个列表追加到另一个列表。下面是出问题的那部分代码:
result = makeSearch(item)
# Printing the freshly scraped list at this point still looks fine.
#printList(result)
# After appending it to the outer list, printing that outer list shows
# "\xa0" inside the strings.
# NOTE(review): list.append() does not change the strings; presumably the
# difference is that printing a list shows each element's repr(), which
# escapes a non-breaking space as "\xa0" -- confirm.
adresses.append(result)
例如,这会将 "Brückstr. 29" 变为 "Brückstr.\xa029"。我尝试用 string.replace('\\xa0', ' ') 将其替换掉,但没有任何效果。
我有一种感觉,它与数字和字符的组合有关,但这并不能解释为什么只有当你试图将它附加到另一个列表时才会发生。
如果您想运行我的程序,请在 "Enter location: " 提示处输入 Aachen 或其他德国城市。
这是完整的程序:
import urllib.request
import time
import csv
from bs4 import BeautifulSoup
#Performs a HTTP-'POST' request, passes it to BeautifulSoup and returns the result
def doRequest(request):
    """Send *request* and parse the response body with BeautifulSoup.

    Args:
        request: A ``urllib.request.Request`` (or URL string) as accepted
            by ``urllib.request.urlopen``.

    Returns:
        The ``BeautifulSoup`` document built from the response, with the
        markup decoded as ISO-8859-1.

    Raises:
        urllib.error.URLError: Propagated unchanged from ``urlopen``
            (``HTTPError`` is a subclass).
    """
    # The context manager closes the HTTP response deterministically.
    # BeautifulSoup reads the whole stream inside its constructor, so the
    # returned soup does not need the connection afterwards.
    with urllib.request.urlopen(request) as requestResult:
        soup = BeautifulSoup(requestResult, from_encoding='iso-8859-1')
    return soup
#Returns all the result links from the given search parameters
def getLinksFromSearch(location):
    """Return the unique detail-page links found for a taxi search.

    Sends the search form for subject "Taxi" and the given *location* to
    gelbeseiten.de and collects every anchor whose ``href`` starts with
    ``http://adresse.gelbeseiten.de/``.

    Args:
        location: The place to search in, as typed by the user.

    Returns:
        A list of link strings without duplicates, in the order in which
        they first appear on the result page.

    Raises:
        urllib.error.URLError: Propagated from ``doRequest`` (callers in
            this file catch the ``HTTPError`` subclass).
    """
    #The search parameters
    params = {
        'subject': 'Taxi',
        'location': location,
        #'distance': '-1',
        #'execute': 'Suchen',
        #'suggest_choose': 'on',
        #'radial_check': 'on',
    }
    # Percent-encode the parameters once and reuse them for both the query
    # string and the POST body, so locations containing spaces or umlauts
    # produce a valid URL instead of a raw, unescaped one.
    query = urllib.parse.urlencode(params)
    request = urllib.request.Request(
        "http://www.gelbeseiten.de/yp/search.yp?" + query,
        query.encode('iso-8859-1'))
    # adding charset parameter to the Content-Type header.
    request.add_header("Content-Type", "application/x-www-form-urlencoded;charset=utf-8")
    request.add_header("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:33.0) Gecko/20100101 Firefox/33.0")

    #The search request
    soup = doRequest(request)

    prefix = "http://adresse.gelbeseiten.de/"
    hrefs = (link.get('href') for link in soup.find_all('a'))
    # Anchors without an href yield None; skip them instead of crashing on
    # None.startswith().
    matching = [href for href in hrefs if href and href.startswith(prefix)]
    # dict.fromkeys removes duplicates while keeping first-seen order, which
    # makes the result deterministic (a set would not).
    return list(dict.fromkeys(matching))
#Performs a search on the link results
def searchOnLinks(links):
    """Scrape every detail page in *links* and collect the address rows.

    For each link the page is fetched via ``makeSearch``, the raw result is
    printed, and a copy with non-breaking spaces (``\\xa0``) replaced by
    regular spaces is stored. A short pause is made between requests.

    Args:
        links: Sequence of detail-page URLs.

    Returns:
        A list with one row (list of strings) per link, in input order.
    """
    adresses = []
    total = len(links)
    print("Gathering information, please wait...")
    for index, item in enumerate(links, start=1):
        print("(" , index , "/" , total , ") Making request...")
        result = makeSearch(item)
        printList(result)
        # Strings are immutable and replace() returns a new string, so the
        # cleaned values have to be stored; rebinding a loop variable (as
        # the previous version did) leaves the list untouched.
        # Assumes every element is a string -- TODO confirm for all pages.
        adresses.append([element.replace('\xa0', ' ') for element in result])
        # Small delay between requests.
        time.sleep(0.3)
    print("All done.")
    return adresses
def makeSearch(link):
    """Fetch one detail page and extract its address fields.

    Looks up the elements marked with ``itemprop`` "name",
    "streetAddress", "postalCode" and "addressLocality" (in that order)
    and collects the node that directly follows each of them.

    Args:
        link: URL of the detail page.

    Returns:
        A list with the values that were found, in the order above. A
        field that cannot be found is reported on stdout and left out of
        the list.
    """
    request = urllib.request.Request(link)
    #Adding charset parameter to the Content-Type header.
    request.add_header("Content-Type", "application/x-www-form-urlencoded;charset=utf-8")
    request.add_header("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:33.0) Gecko/20100101 Firefox/33.0")

    #The search request
    soup = doRequest(request)

    # (itemprop value, message printed when the field is missing),
    # listed in output order.
    wanted_fields = (
        ("name", "Name not found!"),
        ("streetAddress", "Street not found!"),
        ("postalCode", "Zipcode not found!"),
        ("addressLocality", "City not found!"),
    )

    data = []
    for prop, missing_message in wanted_fields:
        try:
            # find() returns None when nothing matches, which makes the
            # attribute access below raise AttributeError.
            fieldValue = soup.find(itemprop=prop)
            data.append(fieldValue.next_element)
        except AttributeError:
            print(missing_message)
    return data
def printList(liste):
    """Print every item of *liste* on its own line.

    Args:
        liste: Any iterable; each item is passed to ``print`` unchanged.
    """
    for entry in liste:
        print(entry)
#The main input/output function
def inputOutput():
    """Run one interactive search/scrape/save cycle on the console.

    Asks for a location until the search request succeeds without an
    ``HTTPError``, reports the number of hits, and -- if the user confirms
    with "go" -- scrapes the hits and prints them. If the user then enters
    "save", the rows are written to ``<name>.csv``.

    Returns:
        None.
    """
    # Keep asking until the search itself goes through.
    while True:
        location = input("Enter location: ")
        try:
            links = getLinksFromSearch(location)
        except urllib.error.HTTPError:
            print("Error! Input raised an HTTP-Exception. Please enter valid input.")
        else:
            break

    #Checks if the search yielded any results
    if not links:
        print("The search returned no results.")
        return

    print("The search returned", len(links), "result(s).")
    print('To proceed, enter "go".')
    if input('To do a new search, enter any key: ') != 'go':
        return

    data = searchOnLinks(links)
    printList(data)

    if input('Enter "save" if you want to save: ') != 'save':
        return

    file_name = input("Save as: ")
    print("Writing to file...")
    # newline='' lets the csv module control line endings itself.
    with open(file_name + '.csv', 'w', newline='') as fp:
        writer = csv.writer(fp, delimiter=',')
        writer.writerows(data)
#Program entry point
def main():
    """Run the interactive search repeatedly until the user declines.

    After each ``inputOutput`` cycle the user is asked whether to run
    again; any answer other than "y" ends the loop.
    """
    while True:
        inputOutput()
        inputVar = input('If you want to run the application again, enter "y". To exit, enter any key: ')
        if inputVar != 'y':
            break


# Start the interactive loop only when executed as a script, so that
# importing this module does not block on input().
if __name__ == "__main__":
    main()
答案 0 :(得分:0)
事实证明,当我把数据保存到 .csv 文件时,空格会正确显示,所以这个问题可以忽略了。