while循环第二次运行强制shell重启

时间:2018-01-05 13:35:09

标签: python while-loop web-scraping pyqt4 lxml

以下脚本包含while循环。首次运行正常。在尝试将对象添加到Render类时,第二次运行会强制脚本重新启动。我已经在第1次运行结束时尝试del (r),但它仍然无效。 我对编程很新,并且知道我的代码完全混乱,所以请善待。

对于2个输入,请使用:

1)'测试'和2)'Schluesselfertigbau'

U可能需要在运行之前更改已打开文件的路径。

import sys, re, pyperclip, requests, csv, fileinput, os
import bs4 as bs
from PyQt4.QtGui import *  
from PyQt4.QtCore import *  
from PyQt4.QtWebKit import *  
from lxml import html
from itertools import zip_longest


#Rendering the Webpage                                   
class Render(QWebPage):  
  def __init__(self, url):  
    self.app = QApplication(sys.argv)  
    QWebPage.__init__(self)  
    self.loadFinished.connect(self._loadFinished)  
    self.mainFrame().load(QUrl(url))  
    self.app.exec_()  

  def _loadFinished(self, result):  
    self.frame = self.mainFrame()  
    self.app.quit()  

def scraping_yellow(inputUrl):
  url = str(inputUrl)
  #THIS IS WHERE IT CRASHS IN THE SECOND RUN
  r = Render(url)  
  result = r.frame.toHtml()

  #Converting QString to Ascii for lxml to process
  formatted_result = str(result.encode('utf-8'))

  #Next build lxml tree from formatted_result
  tree = html.fromstring(formatted_result)
  treeNoUTF8 = html.fromstring(result)

  #getContent
  name = treeNoUTF8.xpath('//span[@itemprop="name"]/text()')
  street = treeNoUTF8.xpath('//span[@itemprop="streetAddress"]/text()')
  zipcode = treeNoUTF8.xpath('//span[@itemprop="postalCode"]/text()')
  town = treeNoUTF8.xpath('//span[@itemprop="addressLocality"]/text()')
  distance = treeNoUTF8.xpath('//span[@class="teilnehmerentfernung"]/text()')
  phone = treeNoUTF8.xpath('//span[@class="text nummer_ganz"]//span/text()')

  #ListComprehension to make it clean
  street = [str(w).replace('\xa0', ' ') for w in street]
  distance = [str(w).replace('\xa0',' ') for w in distance]

  def concatenate_list_data(list):
      result2= ''
      for element in list:
          result2 += str(element)
      return result2

  #print('String: '+concatenate_list_data(phone))

  suffix = "alltext"
  with open("C://Python//Zwischenablage_{}.txt".format(suffix), "w") as out_f:
      out_f.write(concatenate_list_data(phone))
  fo = open("C://Python//Zwischenablage_{}.txt".format(suffix), 'r').read()

  #patterns to search for
  phoneRegex = re.compile(r"([\(][0-9]{4,5}[\)][\s]?[0-9]{1,10}[\s]?[0-9]{1,10}[\s]?[0-9]{1,3}[-]?[0-9]?)")                    
  #emailRegex = re.compile(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+(.[a-zA-Z]{2,4}))",re.VERBOSE)

  #create lists out of matches
  matches = []   
  for groups in phoneRegex.findall(fo):
     matches.append(groups)
  #for groups in emailRegex.findall(text):   
  #   matches.append(groups)

  if len(matches) > 0:
     pyperclip.copy('\n'.join(matches))

  else:
     print('\n'+'No phone numbers or email addresses found.')

  def remove_duplicates(values):
      output = []
      seen = set()
      for value in values:
          # If value has not been encountered yet,
          # ... add it to both list and set.
          if value not in seen:
              output.append(value)
              seen.add(value)
      return output

  matches1 = remove_duplicates(matches)


  #show lists
  print(name,'\n','\n', street,'\n','\n', zipcode,'\n','\n', town, '\n','\n', distance,'\n','\n', matches1)

  #to check if lists are complete
  print( '\n' + 'Prüfsumme - Elemente pro Liste:')
  print(len(name),len(street),len(zipcode),len(town),len(distance),len(matches1))


  #to write a csv
  d = [name, street, zipcode, town, distance, matches1]
  export_data = zip_longest(*d, fillvalue = '')
  with open('C://Python/'+ending+'.csv', 'a', encoding="ISO-8859-1", newline='') as myfile:
        wr = csv.writer(myfile, delimiter=';')
        #wr.writerow(("Name", "Str.", "PLZ", "Ort","Entfernung","Telefon","Email"))
        wr.writerows(export_data)

  #to skip duplicates  
  seen = set() # set for fast O(1) amortized lookup
  for line in fileinput.FileInput('C://Python/'+ending+'.csv', inplace=1):
      if line in seen: continue # skip duplicate

      seen.add(line)
      print(line), # standard output is now redirected to the file

  #to remove blanks and delete the temporary file
  with open('C://Python/'+ending+'.csv') as input, open('C://Python/'+ending+' noblank.csv', 'w') as output:
      non_blank = (line for line in input if line.strip())
      output.writelines(non_blank)

  os.remove('C://Python/'+ending+'.csv')

  print(link1)

################# Start - Programm #################

ending = input('Bitte geben Sie den gewünschten Dateinamen ein: ')
link = 'https://www.gelbeseiten.de/'+input('Bitte geben Sie das gesuchte Gewerk ein!(ohne Umlaute)')+'/bergheim,,,,,umkreis-50000/s'

i=0
n = 4 
#n = int(input('Bitte gib die Seitenanzahl ein: '))
while i<=n :
  link1 = link+str(1+i)
  print('Link: '+link1)
  scraping_yellow(link1)
  i = i + 1
else:
  #to confirm printing    
  print('>>>> csv-Datei wurde beschrieben! <<<<')

当我使用输入时,我发现了另一个问题:1)'Test2'和2)'Fliesenleger'。由于某些数字未在源页面上注册,因此将其余数字分配错误(例如,phonenumbers不再与公司名称匹配)

我将不胜感激。

0 个答案:

没有答案