以下脚本包含while循环。首次运行正常。在尝试将对象添加到Render类时,第二次运行会强制脚本重新启动。我已经在第1次运行结束时尝试del (r)
,但它仍然无效。
我对编程很新,并且知道我的代码完全混乱,所以请善待。
对于2个输入,请使用:
1)'测试'和2)'Schluesselfertigbau'
U可能需要在运行之前更改已打开文件的路径。
import sys, re, pyperclip, requests, csv, fileinput, os
import bs4 as bs
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from lxml import html
from itertools import zip_longest
#Rendering the Webpage
class Render(QWebPage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
self.app.exec_()
def _loadFinished(self, result):
self.frame = self.mainFrame()
self.app.quit()
def scraping_yellow(inputUrl):
url = str(inputUrl)
#THIS IS WHERE IT CRASHS IN THE SECOND RUN
r = Render(url)
result = r.frame.toHtml()
#Converting QString to Ascii for lxml to process
formatted_result = str(result.encode('utf-8'))
#Next build lxml tree from formatted_result
tree = html.fromstring(formatted_result)
treeNoUTF8 = html.fromstring(result)
#getContent
name = treeNoUTF8.xpath('//span[@itemprop="name"]/text()')
street = treeNoUTF8.xpath('//span[@itemprop="streetAddress"]/text()')
zipcode = treeNoUTF8.xpath('//span[@itemprop="postalCode"]/text()')
town = treeNoUTF8.xpath('//span[@itemprop="addressLocality"]/text()')
distance = treeNoUTF8.xpath('//span[@class="teilnehmerentfernung"]/text()')
phone = treeNoUTF8.xpath('//span[@class="text nummer_ganz"]//span/text()')
#ListComprehension to make it clean
street = [str(w).replace('\xa0', ' ') for w in street]
distance = [str(w).replace('\xa0',' ') for w in distance]
def concatenate_list_data(list):
result2= ''
for element in list:
result2 += str(element)
return result2
#print('String: '+concatenate_list_data(phone))
suffix = "alltext"
with open("C://Python//Zwischenablage_{}.txt".format(suffix), "w") as out_f:
out_f.write(concatenate_list_data(phone))
fo = open("C://Python//Zwischenablage_{}.txt".format(suffix), 'r').read()
#patterns to search for
phoneRegex = re.compile(r"([\(][0-9]{4,5}[\)][\s]?[0-9]{1,10}[\s]?[0-9]{1,10}[\s]?[0-9]{1,3}[-]?[0-9]?)")
#emailRegex = re.compile(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+(.[a-zA-Z]{2,4}))",re.VERBOSE)
#create lists out of matches
matches = []
for groups in phoneRegex.findall(fo):
matches.append(groups)
#for groups in emailRegex.findall(text):
# matches.append(groups)
if len(matches) > 0:
pyperclip.copy('\n'.join(matches))
else:
print('\n'+'No phone numbers or email addresses found.')
def remove_duplicates(values):
output = []
seen = set()
for value in values:
# If value has not been encountered yet,
# ... add it to both list and set.
if value not in seen:
output.append(value)
seen.add(value)
return output
matches1 = remove_duplicates(matches)
#show lists
print(name,'\n','\n', street,'\n','\n', zipcode,'\n','\n', town, '\n','\n', distance,'\n','\n', matches1)
#to check if lists are complete
print( '\n' + 'Prüfsumme - Elemente pro Liste:')
print(len(name),len(street),len(zipcode),len(town),len(distance),len(matches1))
#to write a csv
d = [name, street, zipcode, town, distance, matches1]
export_data = zip_longest(*d, fillvalue = '')
with open('C://Python/'+ending+'.csv', 'a', encoding="ISO-8859-1", newline='') as myfile:
wr = csv.writer(myfile, delimiter=';')
#wr.writerow(("Name", "Str.", "PLZ", "Ort","Entfernung","Telefon","Email"))
wr.writerows(export_data)
#to skip duplicates
seen = set() # set for fast O(1) amortized lookup
for line in fileinput.FileInput('C://Python/'+ending+'.csv', inplace=1):
if line in seen: continue # skip duplicate
seen.add(line)
print(line), # standard output is now redirected to the file
#to remove blanks and delete the temporary file
with open('C://Python/'+ending+'.csv') as input, open('C://Python/'+ending+' noblank.csv', 'w') as output:
non_blank = (line for line in input if line.strip())
output.writelines(non_blank)
os.remove('C://Python/'+ending+'.csv')
print(link1)
################# Start - Programm #################
ending = input('Bitte geben Sie den gewünschten Dateinamen ein: ')
link = 'https://www.gelbeseiten.de/'+input('Bitte geben Sie das gesuchte Gewerk ein!(ohne Umlaute)')+'/bergheim,,,,,umkreis-50000/s'
i=0
n = 4
#n = int(input('Bitte gib die Seitenanzahl ein: '))
while i<=n :
link1 = link+str(1+i)
print('Link: '+link1)
scraping_yellow(link1)
i = i + 1
else:
#to confirm printing
print('>>>> csv-Datei wurde beschrieben! <<<<')
当我使用输入时,我发现了另一个问题:1)'Test2'和2)'Fliesenleger'。由于某些数字未在源页面上注册,因此将其余数字分配错误(例如,phonenumbers不再与公司名称匹配)
我将不胜感激。