Question

我用Python编写了一个脚本，用来抓取谷歌搜索结果中的[标题文本、链接、描述]。代码运行良好，但我需要对代码做一些小调整，以避免谷歌识别出HTTP请求的规律。代码如下：

        #import requests
        #import json
        #from os.path import exists
        from selenium import webdriver
        #from selenium.webdriver.support.ui import WebDriverWait
        #from selenium.common.exceptions import TimeoutException
        #from selenium.webdriver.common.keys import Keys
        #from selenium.webdriver.common.by import By
        import time
        #from lxml import html
        from scrapy import Selector as s
        #import os
        import csv
        import itertools

        lister = ['https://www.google.co.uk/search?q=MOT+in+Godmanchester&num=10',  # search-result URLs to visit, in this order
'https://www.google.co.uk/search?q=MOT+in+Godmanchester&num=10&start=10',
'https://www.google.co.uk/search?q=MOT+in+Hanley+Grange&num=10',
'https://www.google.co.uk/search?q=MOT+in+Hanley+Grange&num=10&start=10',
'https://www.google.co.uk/search?q=MOT+in+Huntingdon&num=10',
'https://www.google.co.uk/search?q=MOT+in+Huntingdon&num=10&start=10',
'https://www.google.co.uk/search?q=MOT+in+March&num=10']
        #a = range(1,3348,1)
        #start = 0
        driver = webdriver.Firefox()  # one browser session, created once and reused for every URL
        with open("C:\Drive F data\Google\output.csv", "ab")as export:  # "ab" appends, so each run adds rows (and another header row) to the same file
            fieldnames = ['link','text1','text2','text3']
            writer = csv.DictWriter(export, fieldnames=fieldnames)
            writer.writeheader()
            for serial,eacher in enumerate(lister,start=1):  # serial is a 1-based counter used only in the progress print
                link = (eacher)
                time.sleep(6)  # fixed pause before each request
                driver.get(link)
                time.sleep(3)  # fixed pause after driver.get returns; this is the delay the question wants to vary
                print serial,'.'+link  # Python 2 print statement
                source = driver.page_source
                source1 = s(text=source,type="html")  # wrap the page HTML in a scrapy Selector for the XPath queries below
                text1 = source1.xpath('//h3[(contains(@class, "r")) and not(contains(@style, "line-height:normal"))]//text()').extract()  # text nodes under the matching <h3> headings
                text2 = source1.xpath('//h3[(contains(@class, "r")) and not(contains(@style, "line-height:normal"))]//@href').extract()  # href attributes under the same headings
                text3 = source1.xpath('//span[@class="st"]').extract()  # selects the <span class="st"> elements themselves (no text() step)
                for each,each1,each2 in itertools.izip(text1,text2,text3):  # pairs the three lists by position and stops at the shortest one
                    each = each.encode('utf8')  # encode to UTF-8 byte strings before handing them to the csv writer
                    each1 = each1.encode('utf8')
                    each2 = each2.encode('utf8')
                    #print each, each1, each2
                    writer.writerow({'link':link,'text1':each,'text2':each1,'text3':each2})  # one CSV row per result; 'link' is the search URL it came from
                #writer.writerow({'link':link,'text1':text1,'text2':text2})
            """
            r = requests.get("https://www.google.co.uk/search?q=MOT+in+Ampthill&num=10")
            source1 = html.fromstring(r.text)
            text1 = source1.xpath("//h3[@class='r']")
            print text1
    """  # the string literal above is never used; it holds a requests/lxml variant of the fetch

在第34行，我插入了3秒的延迟，但我希望这个延迟是一个在10到30之间变化的值，步长为2，即 range(10,30,2)。

这样脚本第一次延迟是10秒，然后依次是12、14、16……依此类推直到30；达到30之后，再从10重新开始，然后是12、14，以此类推。

请查看脚本并提供有用的建议/修改。

Answer 1

为什么不在您的sleep()中使用一个随机数呢？谷歌可能会察觉到你那种按固定序列延迟的做法。

from random import randint

# ..your code..
# Pick a fresh whole-second delay for this request; randint includes both
# endpoints, so the value is anywhere from 10 to 30.
delay_seconds = randint(10, 30)
print('Sleeping for {} seconds'.format(delay_seconds))
time.sleep(delay_seconds)

现在每个请求之前都会休眠一段随机的时间，因此更难被检测到。

Answer 2

import random
...
# Sleep for a random even number of seconds from 10 to 30 inclusive.
# The start must be 10 (not 12) to cover the full range the question asks for;
# the stop of 31 is exclusive, so 30 is still a possible value.
time.sleep(random.randrange(10, 31, 2))

Python时间间隔延迟变量

2 个答案: