我用 Python 编写了一个脚本,用来抓取谷歌搜索结果(标题文本、链接、描述)。代码运行正常,但我需要对代码做一些小调整,以免谷歌通过 HTTP 请求的规律识别出这是脚本。代码如下:
#import requests
#import json
#from os.path import exists
from selenium import webdriver
#from selenium.webdriver.support.ui import WebDriverWait
#from selenium.common.exceptions import TimeoutException
#from selenium.webdriver.common.keys import Keys
#from selenium.webdriver.common.by import By
import time
#from lxml import html
from scrapy import Selector as s
#import os
import csv
import itertools
# Search-result URLs to visit, in this order.
lister = [
    'https://www.google.co.uk/search?q=MOT+in+Godmanchester&num=10',
    'https://www.google.co.uk/search?q=MOT+in+Godmanchester&num=10&start=10',
    'https://www.google.co.uk/search?q=MOT+in+Hanley+Grange&num=10',
    'https://www.google.co.uk/search?q=MOT+in+Hanley+Grange&num=10&start=10',
    'https://www.google.co.uk/search?q=MOT+in+Huntingdon&num=10',
    'https://www.google.co.uk/search?q=MOT+in+Huntingdon&num=10&start=10',
    'https://www.google.co.uk/search?q=MOT+in+March&num=10',
]

# XPath prefix shared by the title-text and href queries: <h3> elements whose
# class contains "r" and whose style does not contain "line-height:normal".
result_heading = ('//h3[(contains(@class, "r")) and '
                  'not(contains(@style, "line-height:normal"))]')

driver = webdriver.Firefox()
try:
    # Raw string: the backslashes in the Windows path must never be read as
    # escape sequences. "ab" (binary append) is the Python 2 csv convention.
    # NOTE: because the file is opened for append and writeheader() is called
    # unconditionally, every run appends another header row.
    with open(r"C:\Drive F data\Google\output.csv", "ab") as export:
        fieldnames = ['link', 'text1', 'text2', 'text3']
        writer = csv.DictWriter(export, fieldnames=fieldnames)
        writer.writeheader()

        for serial, link in enumerate(lister, start=1):
            time.sleep(6)   # pause before each request
            driver.get(link)
            time.sleep(3)   # pause after the request, before reading the page
            # Same text as the Python 2 statement `print serial,'.'+link`.
            print('%d .%s' % (serial, link))

            page = s(text=driver.page_source, type="html")
            titles = page.xpath(result_heading + '//text()').extract()
            hrefs = page.xpath(result_heading + '//@href').extract()
            # Selects the span elements themselves (no //text() step).
            snippets = page.xpath('//span[@class="st"]').extract()

            # izip stops at the shortest of the three lists, so results that
            # lack one of the three parts can shift or drop rows.
            for title, href, snippet in itertools.izip(titles, hrefs, snippets):
                writer.writerow({
                    'link': link,
                    # Encode to UTF-8 bytes before handing values to csv.
                    'text1': title.encode('utf8'),
                    'text2': href.encode('utf8'),
                    'text3': snippet.encode('utf8'),
                })
finally:
    # Always close the browser, even when a page load or the CSV write fails;
    # otherwise the Firefox process is left running.
    driver.quit()

# Earlier requests/lxml attempt, kept for reference only (never executed):
#   r = requests.get("https://www.google.co.uk/search?q=MOT+in+Ampthill&num=10")
#   source1 = html.fromstring(r.text)
#   text1 = source1.xpath("//h3[@class='r']")
#   print text1
在第34行(即 `driver.get(link)` 之后的 `time.sleep(3)`),我插入了3秒的延迟,但我希望这个延迟是一个在10到30之间变化的值,步长为2,类似 range(10, 30, 2)。
也就是说,脚本第一次延迟10秒,之后依次是12、14、16秒……一直到30秒;达到30秒之后,再从10秒重新开始,然后是12、14秒,依此类推。
请查看脚本,并提供有用的建议或修改。
答案 0 :(得分:0)
为什么不给 sleep() 传入一个随机数呢?你那种按固定序列递增的做法,谷歌可能会识别出来。
from random import randint
# ..your code..
# Draw a whole number of seconds from 10 to 30 (both ends included), report
# it, then pause for that long.
pause_seconds = randint(10, 30)
message = 'Sleeping for {} seconds'.format(pause_seconds)
print(message)
time.sleep(pause_seconds)
现在每次请求都会休眠一段随机时长,这样更难被检测到。
答案 1 :(得分:0)
import random
# ... your existing code ...
# Sleep for one of 10, 12, 14, ..., 30 seconds, chosen at random.
# The lower bound is 10 (the minimum delay asked for in the question); the
# stop value 31 is exclusive, so 30 is the largest delay that can be drawn.
time.sleep(random.randrange(10, 31, 2))