我正在尝试构建一个使用 Tor 代理的多线程抓取工具。我使用以下代码来建立连接:
def get_soup(url):
    """Fetch ``url`` and return the page parsed with BeautifulSoup.

    The request is retried in an endless loop: whenever the response body
    contains the word "captcha" (case-insensitive) ``renew_tor()`` is called
    and the URL is fetched again; whenever any exception is raised it is
    printed and the URL is fetched again.

    Args:
        url: address of the page to download.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend
        from the first response that does not contain "captcha".
    """
    while True:
        try:
            connectTor()
            # The context manager closes the session (and its pooled
            # connections) on every attempt instead of leaking one session
            # per retry.
            with requests.Session() as r:
                response = r.get(url, headers=request_headers)
                the_page = response.content.decode('utf-8', errors='ignore')
            if "captcha" in the_page.lower():
                print("flag condition matched while url: ", url)
                renew_tor()
            else:
                # Parse only pages that are actually returned; captcha pages
                # are thrown away, so parsing them was wasted work.
                return BeautifulSoup(the_page, 'html.parser')
        except Exception as e:
            # Deliberate best-effort: report the failure and retry.
            print("Error while URL :", url, str(e))
这是url fetcher:
# Submit one fetchjob per URL to a pool of 200 worker threads, then inspect
# every future so that an exception raised inside fetchjob is reported
# instead of being silently dropped with the discarded future.
with futures.ThreadPoolExecutor(200) as executor:
    pending = {executor.submit(fetchjob, url): url for url in zurls}
    for future in futures.as_completed(pending):
        error = future.exception()
        if error is not None:
            print("Error while URL :", pending[future], str(error))
然后我创建了多线程获取作业:
Socket connection failed (Socket error: 0x01: General SOCKS server failure)
然后我收到以下错误,我在使用多处理时没有看到错误:
def postprocess_image(img, in_shape):
    """Build the ops that turn a per-class score tensor into PNG bytes.

    The arg-max of ``img`` along axis 2 is passed to
    ``utils.class_image_to_image_tensor`` together with ``[HEIGHT, WIDTH]``,
    the result is resized bilinearly to ``in_shape`` and encoded as PNG.

    Args:
        img: tensor whose axis 2 is reduced with ``tf.argmax``
            (presumably laid out as height x width x classes — confirm
            against the caller).
        in_shape: target size handed to ``tf.image.resize_bilinear``.

    Returns:
        The tensor produced by ``tf.image.encode_png``.
    """
    labels = tf.argmax(img, axis=2)
    colored = utils.class_image_to_image_tensor(labels, [HEIGHT, WIDTH])
    # resize_bilinear is given a tensor with an extra leading axis; that
    # axis is dropped again below with [0].
    batched = tf.expand_dims(colored, 0)
    resized = tf.image.resize_bilinear(batched, in_shape, align_corners=False)
    # Cast to int8, then reinterpret the same bits as uint8 for encode_png.
    as_uint8 = tf.bitcast(tf.cast(resized[0], tf.int8), tf.uint8)
    return tf.image.encode_png(as_uint8)
# Take the graph of the Keras backend session and convert its variables to
# constants, keeping the model's output node (tensor name without ":0").
sess = K.get_session()
g = sess.graph
g_def = graph_util.convert_variables_to_constants(
    sess,
    g.as_graph_def(),
    [model.output.name.replace(':0', '')],
)
# Stand-alone pre-processing graph: string placeholder "b64" -> decoded image
# -> resized to [HEIGHT, WIDTH] -> preprocess_image -> tensor "input_image".
# NOTE(review): nesting reconstructed from a paste that had lost its
# indentation — confirm against the original script.
with tf.Graph().as_default() as g_input:
    input_b64 = tf.placeholder(tf.string, shape=(1,), name='b64')
    tf.logging.info('input b64 {}'.format(input_b64))

    image = tf.image.decode_image(input_b64[0])
    image_f = tf.image.convert_image_dtype(image, dtype=tf.uint8)
    # resize_bilinear is given a tensor with an extra leading axis.
    input_image = tf.expand_dims(image_f, 0)
    image_r = tf.image.resize_bilinear(
        input_image,
        [HEIGHT, WIDTH],
        align_corners=False,
    )

    input_data = preprocess_image(image_r[0])
    output = tf.identity(input_data, name='input_image')
# Stand-alone post-processing graph: a float placeholder named
# "activation_58/div" plus a 2-element "in_shape" placeholder are fed to
# postprocess_image; the result is exposed as tensor "out".
# NOTE(review): nesting reconstructed from a paste that had lost its
# indentation — confirm against the original script.
with tf.Graph().as_default() as g_output:
    first = tf.placeholder(
        tf.float32,
        shape=[1, 473, 473, 150],
        name='activation_58/div',
    )
    i_shape = tf.placeholder(tf.int32, shape=[2], name='in_shape')
    post_image = postprocess_image(first[0], i_shape)
    output_data = tf.identity(post_image, name='out')

# Serialise both helper graphs so they can be imported into another graph.
g_input_def = g_input.as_graph_def()
g_output_def = g_output.as_graph_def()
# Import the pre-processing graph, the frozen model graph and the
# post-processing graph into one graph, wire them together through
# input_map, and export the result with a SavedModelBuilder.
# NOTE(review): nesting reconstructed from a paste that had lost its
# indentation — confirm against the original script.
with tf.Graph().as_default() as g_combined:
    x = tf.placeholder(tf.string, name="b64")
    in_shape = tf.placeholder(tf.int32, shape=[1, 2], name="original_shape")

    # "b64:0" of the pre-processing graph is replaced by x.
    [im] = tf.import_graph_def(
        g_input_def,
        input_map={'b64:0': x},
        return_elements=["input_image:0"],
    )
    # The model's input is replaced by the pre-processed image.
    [pred] = tf.import_graph_def(
        g_def,
        input_map={model.input.name: im},
        return_elements=[model.output.name],
    )
    # The post-processing graph receives the prediction and the first row
    # of the "original_shape" placeholder.
    [y] = tf.import_graph_def(
        g_output_def,
        input_map={
            model.output.name: pred,
            'in_shape:0': in_shape[0],
        },
        return_elements=["out:0"],
    )

    with tf.Session() as session:
        inputs = {
            "image_bytes": tf.saved_model.utils.build_tensor_info(x),
            "original_shape": tf.saved_model.utils.build_tensor_info(in_shape),
        }
        outputs = {
            "output_bytes": tf.saved_model.utils.build_tensor_info(y),
        }
        signature = tf.saved_model.signature_def_utils.build_signature_def(
            inputs=inputs,
            outputs=outputs,
            method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME,
        )

        # Convert the Keras HDF5 model into TensorFlow SavedModel.
        # Any previous export at export_path is removed first.
        if os.path.exists(export_path):
            shutil.rmtree(export_path)

        # NOTE(review): legacy_init_op is created here but is not passed to
        # the builder below — confirm whether that is intended.
        legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')
        builder = saved_model_builder.SavedModelBuilder(export_path)
        builder.add_meta_graph_and_variables(
            sess=session,
            tags=[tag_constants.SERVING],
            signature_def_map={
                signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signature,
            },
        )
        builder.save()
如果有任何建议能帮助我避免 SOCKS 错误,并提高抓取方法的性能、使其支持多线程,我将不胜感激。
答案 0 :(得分:1)
这是一个很好的例子,说明了为什么对 socket.socket 做猴子补丁(monkey patching)是不好的做法。
这样做会把所有 socket 连接所使用的套接字(这一点最为关键)都替换成 SOCKS 套接字。
当您稍后连接到控制器时,它会尝试使用 SOCKS 协议进行通信,而不是建立直接连接。
由于您已经在使用 requests,我建议删除 SocksiPy 以及 socks.socket = socks.socksocket 这段代码,改用 requests 内置的 SOCKS 代理功能:
# Send both http and https traffic through the SOCKS proxy listening on
# 127.0.0.1:9050 (presumably the local Tor client — confirm).
tor_proxy = 'socks5h://127.0.0.1:9050'
proxies = {'http': tor_proxy, 'https': tor_proxy}
response = r.get(url, headers=request_headers, proxies=proxies)