Question

仅仅因为软件是自动化的，并不意味着它会遵守你的robots.txt。有哪些方法可以检测到有人正在抓取您的网站？假设您的网站有成千上万个页面，值得被抓取或遭受DDoS攻击。

这是一个愚蠢的想法，很可能行不通：给每个用户一个带有唯一值的cookie，并通过该cookie识别某人发出的第二次、第三次等后续请求。这可能行不通，因为爬虫可能不接受cookie，因此在这个方案中，机器人的每个请求看起来都像来自一个新用户。

有没有人有更好的想法？

Answer 1

您可以在页面中放置最终用户看不见、也无法点击的链接。很多机器人会直接跟随所有链接。一旦有人请求了其中某个链接，您几乎可以肯定对方是爬虫/机器人。

Answer 2

项目Honey Pot保留了一个“坏”机器人列表。

这是我编写的用于调用他们的Web服务的类。你需要对它进行修改，因为我用了一些专有的库，但大部分代码应该可以直接使用。有时他们的服务会返回错误，但它确实有助于减少一些不良流量。

using System;
using System.Linq;
using System.Net;
using System.Xml.Linq;
using SeaRisenLib2.Text;
using XmlLib;

/// <summary>
/// Client for the Project Honey Pot http:BL DNS blacklist.
/// A lookup is a DNS query for "{key}.{reversed IPv4 octets}.dnsbl.httpbl.org";
/// a listed address resolves to "127.{days}.{threat}.{type}", which is exposed
/// as a <see cref="Score"/>.
/// </summary>
public class HoneyPot
{
    private const string KEY = "blacklistkey"; // http:BL access key - register at httpbl.org to obtain one
    private const string HTTPBL = "dnsbl.httpbl.org"; // blacklist lookup host

    public HoneyPot()
    {
    }

    /// <summary>
    /// Looks up a visitor IP address in the http:BL blacklist.
    /// </summary>
    /// <param name="ip">Visitor address in dotted IPv4 notation.</param>
    /// <returns>
    /// The score reported by http:BL, or null when the address is loopback,
    /// is not IPv4, is not listed, or the lookup failed (failures are written
    /// to the "VisitorLog/HoneyPotErrors" log; this method does not throw).
    /// </returns>
    public static Score GetScore_ByIP(string ip)
    {
        string sendMsg = "", receiveMsg = "";
        int errorCount = 0; // track where in try/catch we fail for debugging
        try
        {
            if ("127.0.0.1" == ip) return null; // localhost development computer

            IPAddress address;
            if (!IPAddress.TryParse(ip, out address))
                throw new ArgumentException("Invalid IP address to HoneyPot.GetScore_ByIP:" + ip);

            // Any loopback form (not just the literal 127.0.0.1) is a local request.
            if (IPAddress.IsLoopback(address)) return null;

            // http:BL queries are built from four IPv4 octets; an IPv6 address
            // cannot be expressed in that form, so there is nothing to look up.
            if (address.AddressFamily != System.Net.Sockets.AddressFamily.InterNetwork)
                return null;
            errorCount++; // 1

            // Build the query from the parsed octets rather than the raw text, so
            // shorthand inputs that TryParse accepts (e.g. "1") still produce a
            // well-formed four-octet name.
            byte[] octets = address.GetAddressBytes();
            sendMsg = string.Format("{0}.{1}.{2}.{3}.{4}.{5}",
                KEY, octets[3], octets[2], octets[1], octets[0], HTTPBL);
            errorCount++; // 2

            IPHostEntry entry;
            try
            {
                entry = Dns.GetHostEntry(sendMsg);
            }
            catch (System.Net.Sockets.SocketException dnsError)
            {
                // Per the http:BL API an unlisted address has no DNS record, so
                // "not found" is the normal clean-visitor answer, not an error.
                if (dnsError.SocketErrorCode == System.Net.Sockets.SocketError.HostNotFound ||
                    dnsError.SocketErrorCode == System.Net.Sockets.SocketError.NoData)
                    return null;
                throw; // real DNS failure: handled (logged) by the outer catch
            }
            errorCount++; // 3

            if (entry.AddressList.Length == 0)
                throw new InvalidOperationException("HoneyPot error");
            address = entry.AddressList[0];
            errorCount++; // 4

            receiveMsg = address.ToString();
            errorCount++; // 5

            byte[] reply = address.GetAddressBytes();
            errorCount++; // 6

            // A valid answer is an IPv4 address whose first octet is 127.
            if (4 != reply.Length || 127 != reply[0]) // error
                throw new InvalidOperationException("HoneyPot error");
            errorCount++; // 7

            Score score = new Score()
            {
                DaysSinceLastSeen = reply[1],
                Threat = reply[2],
                BotType = reply[3]
            };
            errorCount++; // 8
            return score;
        }
        catch (Exception ex)
        {
            // Best effort: a blacklist failure must never break the request,
            // so record the details and report "no score".
            Log.Using("VisitorLog/HoneyPotErrors", log =>
            {
                log.SetString("IPrequest", ip);
                log.SetString("SendMsg", sendMsg, XmlFile.ELEMENT);
                log.SetString("RecvMsg", receiveMsg, XmlFile.ELEMENT);
                log.SetString("Exception", ex.Message, XmlFile.ELEMENT);
                log.SetString("ErrorCount", errorCount.ToString());
            });
        }
        return null;
    }

    /// <summary>
    /// Visitor categories reported in the last octet of an http:BL answer.
    /// The non-zero members are bit flags and may be combined; zero means
    /// search engine.
    /// </summary>
    [Flags]
    public enum BotTypeEnum : int
    {
        SearchEngine = 0,
        Suspicious = 1,
        Harvester = 2,
        CommentSpammer = 4
    }

    /// <summary>
    /// The three values carried by an http:BL answer. Every value is -1 until set.
    /// </summary>
    public class Score
    {
        public Score()
        {
            BotType = -1;
            DaysSinceLastSeen = -1;
            Threat = -1;
        }

        /// <summary>Second octet of the http:BL answer; -1 when unset.</summary>
        public int DaysSinceLastSeen { get; internal set; }

        /// <summary>Third octet of the http:BL answer; -1 when unset.</summary>
        public int Threat { get; internal set; }

        /// <summary>
        /// Fourth octet of the http:BL answer; -1 when unset.
        /// Use BotTypeEnum to understand value.
        /// </summary>
        public int BotType { get; internal set; }

        /// <summary>
        /// Convert HoneyPot Score values to String (DaysSinceLastSeen.Threat.BotType)
        /// </summary>
        /// <returns>The three values joined with dots.</returns>
        public override string ToString()
        {
            return string.Format("{0}.{1}.{2}",
                DaysSinceLastSeen,
                Threat,
                BotType);
        }

        /// <summary>
        /// Serializes a score to a "HoneyPot" element. Unset (negative) values are
        /// omitted; a null score yields an empty element.
        /// </summary>
        public static explicit operator XElement(Score score)
        {
            XElement xpot = new XElement("HoneyPot");
            if (null != score)
            {
                if (score.DaysSinceLastSeen >= 0)
                    xpot.SetString("Days", score.DaysSinceLastSeen);
                if (score.Threat >= 0)
                    xpot.SetString("Threat", score.Threat);
                if (score.BotType >= 0)
                {
                    xpot.SetString("Type", score.BotType);

                    // Only decode a real value: the -1 "unset" marker has every bit
                    // set and would otherwise match every non-zero category.
                    foreach (BotTypeEnum t in Enum.GetValues(typeof(BotTypeEnum)))
                    {
                        // Log enum values as string for each bitwise value represented in score.BotType
                        int value = (int)t;
                        if ((value == score.BotType) || ((value & score.BotType) > 0))
                            xpot.GetCategory(t.ToString());
                    }
                }
            }
            return xpot;
        }

        /// <summary>
        /// Reads a score back from an element's "Days", "Threat" and "Type" values.
        /// Returns null for a null element.
        /// </summary>
        public static explicit operator Score(XElement xpot)
        {
            Score score = null;
            if (null != xpot)
                score = new Score()
                {
                    DaysSinceLastSeen = xpot.GetInt("Days"),
                    Threat = xpot.GetInt("Threat"),
                    BotType = xpot.GetInt("Type")
                };
            return score;
        }
    }

    /// <summary>
    /// Log score value to HoneyPot child Element (if score not null).
    /// </summary>
    /// <param name="score">Score to record; nothing is added when null.</param>
    /// <param name="parent">Element that receives the child; nothing is added when null.</param>
    public static void LogScore(HoneyPot.Score score, XElement parent)
    {
        if ((null != score) && (null != parent))
        {
            parent.Add((XElement)score);
        }
    }

}

Answer 3

虽然从技术上讲，这并不能“检测”机器人爬虫，但我有一种有趣的方法来阻止它们。我的方法是创建一个IIS过滤器或Apache插件。你要做的是加密所有的html、asp、php等页面。唯一未加密的页面是索引页面。索引页面只是设置一个带有加密公钥的cookie，然后重定向到第二个索引页面。然后，IIS过滤器或Apache插件将检查每个访问者，以确保他们拥有此cookie。如果有，则过滤器将解密所请求的页面，然后将页面传递给Web服务器进行处理。

此方法允许普通游客查看您的网页，但如果拒绝cookie的机器人试图读取您的网页，则会全部加密。

Answer 4

黑名单可能不是一个很好的方法，最好维护一个已知机器人的白名单，允许它们每秒发出一定数量的请求。如果不在该白名单上的某人每秒请求次数过多，则开始将他们的连接断开几秒钟。这将有助于防止DDoS攻击，并且仍然允许未知的机器人扫描您的网站（尽管比您想象的要慢得多）。

你可以记录罪犯的日志，看看谁反复违反规则：）

我可以用什么方法来检测机器人？

4 个答案: