我正在打开一个Excel电子表格,读取b列中的数据并使用该数据在页面上执行URL查找和正则表达式,然后将找到的数据保存到同一电子表格中的第3列和第4列。一切正常,但它只写了最后一行。我想因为我的for循环是如何构建的,我已经在这里搜索了很多主题但是还没有能够理解它。
import os
from openpyxl import load_workbook
import urllib
import re
from urllib2 import urlopen
directoryPath=r'C:\Python27' #The main folder
os.chdir(directoryPath)
folder_list=os.listdir(directoryPath)
for folders, sub_folders, file in os.walk(directoryPath): #Traversing the sub folders
for name in file:
if name.endswith(".xlsx"):
filename = os.path.join(folders, name)
wb=load_workbook(filename, data_only=True)
ws=wb.active
cell_range = ws['B2':'B4']
f = urllib.urlopen(cell.value)
s = f.read()
for row in cell_range: # This is iterating through rows 1-7
for cell in row: # This iterates through the columns(cells) in that row
ws.cell(row=ws._current_row, column=3).value= str(re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,4}",s))
ws.cell(row=ws._current_row, column=4).value= str(re.findall(r"\(?(?:(?:0(?:0|11)\)?[\s-]?\(?|\+)44\)?[\s-]?\(?(?:0\)?[\s-]?\(?)?|0)(?:\d{2}\)?[\s-]?\d{4}[\s-]?\d{4}|\d{3}\)?[\s-]?\d{3}[\s-]?\d{3,4}|\d{4}\)?[\s-]?(?:\d{5}|\d{3}[\s-]?\d{3})|\d{5}\)?[\s-]?\d{4,5}|8(?:00[\s-]?11[\s-]?11|45[\s-]?46[\s-]?4\d))(?:(?:[\s-]?(?:x|ext\.?\s?|\#)\d+)?)",s))
wb.save('scraper.xlsx')
使用
进行测试的示例网址http://www.knaptoninsurance.co.uk/contact-us
http://www.sterlingsafetywear.co.uk/contact
http://www.ilu.org.uk/contact.html