我的任务需要从印度2011年人口普查中提取数据。我正在使用Selenium,并且已经有一个可以正常工作的脚本(如下所示),现在我想用joblib库的Parallel把任务并行化。运行这个脚本时没有报错,在任务管理器(Windows 10)中也能看到处理器处于活动状态,但我没有看到任何文件被保存,而且运行时间早已超过非并行版本完成所需的时间。任何帮助我都非常感激,非常感谢。
顺便说一下,这里是该程序输入数据集前四条记录的链接。
import time
import re
import string
import urllib.parse
import pandas
import numpy
import os
import csv
import joblib
from selenium import webdriver
from joblib import Parallel, delayed
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
# Switch the working directory to the project folder; the relative workbook
# path below is resolved against it.
path = 'C:/Users/d.wm.mclaughlin/Dropbox/research/india'
os.chdir(path)
# Load sheet "Sheet1" of the input workbook. downloadFunction indexes this
# module-level frame by row label to find the village to scrape.
input_df = pandas.read_excel("file_path/villages_3109_UTTAR PRADESH_12_003.xlsx", "Sheet1")
def _read_count(driver, column):
    """Return the integer shown in cell ``column`` (1-based) of the summary row; a blank cell counts as 0."""
    selector = ('#form1 > div:nth-child(4) > center:nth-child(2) > table'
                ' > tbody > tr:nth-child(3) > td:nth-child(' + str(column) + ')')
    text = driver.find_element_by_css_selector(selector).text
    return int(text) if text != '' else 0


def _scrape_tables(driver, record_count):
    """Return one list of ``td`` texts per result table found under ``#Div1``."""
    # NOTE(review): round() yields 0 tables when record_count < 23 and rounds
    # down for counts just above a multiple of 45 -- presumably 45 is the number
    # of records per table; confirm whether a ceiling was intended.
    table_count = round(record_count / 45)
    tables = []
    # The tables that are read sit at every third child of #Div1: 1, 4, 7, ...
    for child_index in range(1, 1 + table_count * 3, 3):
        table_address = '#Div1 > table:nth-child(' + str(child_index) + ')'
        for table in driver.find_elements_by_css_selector(table_address):
            cells = table.find_elements_by_tag_name('td')
            if cells:
                tables.append([td.text for td in cells])
    return tables


def downloadFunction(x):
    """Scrape the records for row ``x`` of ``input_df`` and save them to an Excel file.

    A dedicated PhantomJS browser is started for the call, the state /
    district / block / panchayat drop-downs are set from the row's codes, the
    first radio button is submitted, and the resulting tables are flattened
    into a nine-column DataFrame written to ``secc_<state>__<district>__
    <block>__<pan>__<button>.xlsx``.  If the page reports
    ``'No Record Found !!!'`` nothing is written.

    Args:
        x: Row label in the module-level ``input_df``.

    Returns:
        None.  ``x`` is printed once the row has been processed.

    Raises:
        ValueError: If the number of scraped cells is not a multiple of 9
            (raised by ``reshape``).
    """
    driver = webdriver.PhantomJS('C:/phantomjs/bin/phantomjs.exe')
    try:
        url = "url"
        driver.get(url)

        selected_state = str(input_df['state_no'][x])
        selected_district = str(input_df['dist_no'][x])
        # Block and panchayat codes are zero-padded to 3 and 4 characters.
        selected_block = str(input_df['block_no'][x]).zfill(3)
        selected_pan = str(input_df['pan'][x]).zfill(4)

        # Each drop-down is looked up only after the previous one has been
        # set, in the same order as the page lists them.
        Select(driver.find_element_by_css_selector("#ddl_state")).select_by_value(selected_state)
        Select(driver.find_element_by_css_selector("#ddl_dist")).select_by_value(selected_district)
        Select(driver.find_element_by_css_selector("#ddl_blk")).select_by_value(selected_block)
        Select(driver.find_element_by_css_selector("#ddl_pan")).select_by_value(selected_pan)

        button_list = ['#RadioButtonList1_0', '#RadioButtonList1_1', '#RadioButtonList1_2']
        button_names = ['auto_inclusion', 'auto_exclusion', 'other']

        # Only the first radio button is processed (range(0, 1)).
        for b in range(0, 1):
            selected_button_name = button_names[b]
            driver.find_element_by_css_selector(button_list[b]).click()
            driver.find_element_by_css_selector('#Button1').click()

            if 'No Record Found !!!' in driver.page_source:
                print('No Record Found !!!')
                continue

            ae = _read_count(driver, 1)
            ai = _read_count(driver, 2)
            oth = _read_count(driver, 3)
            dep = _read_count(driver, 4)
            ai_dep = ai + dep
            records = [ai_dep, ae, oth]
            selected_record = records[b]

            data = _scrape_tables(driver, selected_record)
            # Every table after the first loses its first 18 cells
            # (presumably a repeated header block -- TODO confirm).
            data[1:] = [cells[18:] for cells in data[1:]]

            flat_data = [item for sublist in data for item in sublist]
            newArray = numpy.array(flat_data)
            dataRows = newArray.size // 9
            test = pandas.DataFrame(
                newArray.reshape(dataRows, 9),
                columns=['no', 'hh_name', 'gender', 'age', 'sc', 'fm_name',
                         'depriv_count', 'ai_d_code', 'total_mem'])

            file_path = ('C:/Users/d.wm.mclaughlin/Dropbox/research/lpg_india/data/secc/secc_'
                         + '__'.join([selected_state, selected_district, selected_block,
                                      selected_pan, selected_button_name])
                         + '.xlsx')
            test.to_excel(file_path, 'Sheet1')
    finally:
        # Always shut the browser down; otherwise every call (and every
        # parallel worker) leaves a phantomjs.exe process running.
        driver.quit()

    print(x)
if __name__ == '__main__':
    # On Windows, joblib's worker processes import this module again; without
    # this guard each import would re-run the Parallel call and spawn
    # subprocesses recursively instead of doing the work.
    # Processes row labels 1 through 9 with three workers.
    tester = Parallel(n_jobs=3)(delayed(downloadFunction)(in_val) for in_val in range(1, 10))
答案 0 :(得分:0)
假设你有足够的内存来运行它而不使用swap,你应该看看文档。以下内容引自 https://pythonhosted.org/joblib/parallel.html ,请特别注意最后一行。
**警告**
在Windows下,务必保护代码的主循环,以避免在使用joblib.Parallel时递归生成子进程。换句话说,您应该编写如下代码:
import ....
def function1(...):
    ...
def function2(...):
    ...
...
if __name__ == '__main__':
    # do stuff with imports and functions defined above
    ...
任何代码都不应在 `if __name__ == '__main__'` 块之外运行,块外只应有导入和定义。
如果是内存问题,请阅读该页面的其余部分。你可以从下面这行导入开始:
from joblib.pool import has_shareable_memory
并将您的最后一行更改为:
if __name__ == '__main__':
    # max_nbytes sets the size threshold above which joblib memory-maps arrays
    # passed to the workers.
    # NOTE(review): has_shareable_memory is passed as the second argument of
    # delayed() exactly as the answer wrote it -- confirm against the installed
    # joblib version that this argument is accepted.
    tester = Parallel(n_jobs=3, max_nbytes=1e2)(delayed(downloadFunction, has_shareable_memory)(in_val) for in_val in range(1, 10))
但我猜你的内存消耗量不会太多。
您还可以添加一些垃圾收集来节省内存:
import gc
在返回语句之前删除所有不必要的变量并添加
# Fragment intended to go inside downloadFunction, just before its return:
# drop the function's references to its large objects.
del driver
del test
del newArray
del data
# ...and likewise for any other locals that are no longer needed
# Run a garbage-collection pass; its return value is discarded.
_ = gc.collect()
但请注意,这不会回收底层可执行程序(例如PhantomJS)所占用的内存。