我有一个python脚本,它从日志文件中提取唯一的IP地址,并显示这些IP被ping的次数,代码如下。
import sys
def extract_ip(line):
    """Return the first whitespace-delimited token of *line*.

    For the Apache-style log format at hand this is the client IP address.
    """
    fields = line.split()
    return fields[0]
def increase_count(ip_dict, ip_addr):
    """Bump the tally for *ip_addr* in *ip_dict*, starting new entries at 1."""
    ip_dict[ip_addr] = ip_dict.get(ip_addr, 0) + 1
def read_ips(infilename):
    """Read the log file and return a dict mapping IP address -> hit count.

    Blank lines are skipped.  The Python 2-only ``file()`` builtin was
    replaced by ``open()`` inside a ``with`` block, so the handle is
    closed deterministically (the original leaked it).
    """
    res_dict = {}
    with open(infilename) as log_file:
        for line in log_file:
            if line.isspace():
                continue
            increase_count(res_dict, extract_ip(line))
    return res_dict
def write_ips(outfilename, ip_dict):
    """Write one ``count<TAB>ip`` line per entry of *ip_dict* to *outfilename*.

    ``file()`` and ``dict.iteritems()`` exist only in Python 2; ``open()``
    and ``items()`` work on both 2 and 3, and the ``with`` block guarantees
    the file is flushed and closed even if a write raises.
    """
    with open(outfilename, "w") as out_file:
        for ip_addr, count in ip_dict.items():
            out_file.write("%5d\t%s\n" % (count, ip_addr))
def parse_cmd_line_args():
    """Return (infilename, outfilename) from argv, or print usage and exit."""
    args = sys.argv
    if len(args) != 3:
        print("Usage: %s [infilename] [outfilename]" % args[0])
        sys.exit(1)
    return args[1], args[2]
def main():
    """Wire the pieces together: parse args, count IPs, write the report."""
    in_name, out_name = parse_cmd_line_args()
    write_ips(out_name, read_ips(in_name))


if __name__ == "__main__":
    main()
日志文件采用以下格式,共约 20 万(2 lakh)行。以下是日志文件的前 30 行:
220.227.40.118 - - [06/Mar/2012:00:00:00 -0800] "GET /mysidebars/newtab.html HTTP/1.1" 404 0 - -
220.227.40.118 - - [06/Mar/2012:00:00:00 -0800] "GET /hrefadd.xml HTTP/1.1" 204 214 - -
59.95.13.217 - - [06/Mar/2012:00:00:00 -0800] "GET /dbupdates2.xml HTTP/1.1" 404 0 - -
111.92.9.222 - - [06/Mar/2012:00:00:00 -0800] "GET /mysidebars/newtab.html HTTP/1.1" 404 0 - -
120.56.236.46 - - [06/Mar/2012:00:00:00 -0800] "GET /hrefadd.xml HTTP/1.1" 204 214 - -
49.138.106.21 - - [06/Mar/2012:00:00:00 -0800] "GET /add.txt HTTP/1.1" 204 214 - -
117.195.185.130 - - [06/Mar/2012:00:00:00 -0800] "GET /mysidebars/newtab.html HTTP/1.1" 404 0 - -
122.160.166.220 - - [06/Mar/2012:00:00:00 -0800] "GET /mysidebars/newtab.html HTTP/1.1" 404 0 - -
117.214.20.28 - - [06/Mar/2012:00:00:00 -0800] "GET /welcome.html HTTP/1.1" 204 212 - -
117.18.231.5 - - [06/Mar/2012:00:00:00 -0800] "GET /mysidebars/newtab.html HTTP/1.1" 404 0 - -
117.18.231.5 - - [06/Mar/2012:00:00:00 -0800] "GET /mysidebars/newtab.html HTTP/1.1" 404 0 - -
122.169.136.211 - - [06/Mar/2012:00:00:00 -0800] "GET /mysidebars/newtab.html HTTP/1.1" 404 0 - -
203.217.145.10 - - [06/Mar/2012:00:00:00 -0800] "GET /mysidebars/newtab.html HTTP/1.1" 404 0 - -
117.18.231.5 - - [06/Mar/2012:00:00:00 -0800] "GET /hrefadd.xml HTTP/1.1" 204 214 - -
59.95.13.217 - - [06/Mar/2012:00:00:00 -0800] "GET /dbupdates2.xml HTTP/1.1" 404 0 - -
203.217.145.10 - - [06/Mar/2012:00:00:00 -0800] "GET /mysidebars/newtab.html HTTP/1.1" 404 0 - -
117.206.70.4 - - [06/Mar/2012:00:00:00 -0800] "GET /mysidebars/newtab.html HTTP/1.1" 404 0 - -
117.214.20.28 - - [06/Mar/2012:00:00:00 -0800] "GET /css/epic.css HTTP/1.1" 204 214 "http://www.epicbrowser.com/welcome.html" -
117.206.70.4 - - [06/Mar/2012:00:00:00 -0800] "GET /add.txt HTTP/1.1" 204 214 - -
117.206.70.4 - - [06/Mar/2012:00:00:00 -0800] "GET /hrefadd.xml HTTP/1.1" 204 214 - -
118.97.38.130 - - [06/Mar/2012:00:00:00 -0800] "GET /mysidebars/newtab.html HTTP/1.1" 404 0 - -
117.214.20.28 - - [06/Mar/2012:00:00:00 -0800] "GET /js/flash_detect_min.js HTTP/1.1" 304 0 "http://www.epicbrowser.com/welcome.html" -
117.214.20.28 - - [06/Mar/2012:00:00:00 -0800] "GET /images/home-page-bottom.jpg HTTP/1.1" 304 0 "http://www.epicbrowser.com/welcome.html" -
117.214.20.28 - - [06/Mar/2012:00:00:00 -0800] "GET /images/Facebook_Like.png HTTP/1.1" 204 214 "http://www.epicbrowser.com/welcome.html" -
117.214.20.28 - - [06/Mar/2012:00:00:00 -0800] "GET /images/Twitter_Follow.png HTTP/1.1" 204 214 "http://www.epicbrowser.com/welcome.html" -
117.214.20.28 - - [06/Mar/2012:00:00:00 -0800] "GET /images/home-page-top.jpg HTTP/1.1" 304 0 "http://www.epicbrowser.com/welcome.html" -
49.138.106.21 - - [06/Mar/2012:00:00:01 -0800] "GET /dbupdates2.xml HTTP/1.1" 404 0 - -
117.18.231.5 - - [06/Mar/2012:00:00:01 -0800] "GET /mysidebars/newtab.html HTTP/1.1" 404 0 - -
117.18.231.5 - - [06/Mar/2012:00:00:01 -0800] "GET /hrefadd.xml HTTP/1.1" 204 214 - -
120.61.182.186 - - [06/Mar/2012:00:00:01 -0800] "GET /mysidebars/newtab.html HTTP/1.1" 404 0 - -
文件的输出格式如下
Number of Times IPS 158 111.92.9.222 11 58.97.187.231 30 212.57.209.41 5 119.235.51.66 3 122.168.134.106 5 180.234.220.75 13 115.252.223.243
例如,IP 111.92.9.222(对应日志行 111.92.9.222 - - [06/Mar/2012:00:00:00 -0800] "GET /mysidebars/newtab.html HTTP/1.1" 404 0 - -)在日志中总共出现了 158 次。
现在我想为代码添加一项功能:如果我传入一个特定的 URL,它应该返回每个 IP 地址(来自日志文件或输出文件)访问该 URL 的次数。
例如,如果我将该网址作为输入传递:http://www.epicbrowser.com/hrefadd.xml
输出应采用以下格式
10.10.128.134 4 10.134.222.232 6
答案 0 :(得分:1)
我假设您要求只需要一个给定URL的IP就行了。在这种情况下,您只需要为程序添加一个额外的过滤器,过滤掉不需要的行。程序的结构可以保持不变。
由于日志文件对主机一无所知,因此您只需指定URL的路径部分作为第三个参数;例如:“/ hrefadd.xml”
#!/usr/bin/env python
#
# Counts the IP addresses of a log file.
#
# Assumption: the IP address is logged in the first column.
# Example line: 117.195.185.130 - - [06/Mar/2012:00:00:00 -0800] \
# "GET /mysidebars/newtab.html HTTP/1.1" 404 0 - -
#
import sys
def urlcheck(line, url):
    '''Return True when the request path (7th field) of the log line
    equals *url*; lines with fewer than seven fields never match.'''
    fields = line.split()
    return len(fields) >= 7 and url == fields[6]
def extract_ip(line):
    '''Extract the IP address from the line.

    Assumes the IP address is logged in the first of the
    space-separated columns.'''
    return line.split(None, 1)[0]
def increase_count(ip_dict, ip_addr):
    '''Add one to the tally for *ip_addr*, creating the entry at 1
    the first time the address is seen.'''
    try:
        ip_dict[ip_addr] += 1
    except KeyError:
        ip_dict[ip_addr] = 1
def read_ips(infilename, url):
    '''Count, per IP address, the log lines whose request path equals *url*.

    Returns a dict mapping IP address -> count.  Blank lines and lines
    for other URLs are skipped.  Uses ``open()`` in a ``with`` block:
    the Python 2-only ``file()`` builtin leaked the handle and does not
    exist in Python 3.
    '''
    res_dict = {}
    with open(infilename) as log_file:
        for line in log_file:
            if line.isspace() or not urlcheck(line, url):
                continue
            increase_count(res_dict, extract_ip(line))
    return res_dict
def write_ips(outfilename, ip_dict):
    '''Write one ``ip<TAB>count`` line per entry to *outfilename*.

    ``file()`` and ``iteritems()`` are Python 2 only; ``open()`` with a
    ``with`` block and ``items()`` behave the same and also guarantee the
    file is closed even if a write raises.
    '''
    with open(outfilename, "w") as out_file:
        for ip_addr, count in ip_dict.items():
            out_file.write("%s\t%5d\n" % (ip_addr, count))
def parse_cmd_line_args():
    '''Return the input file name, the output file name and the URL path.

    If there are not exactly three parameters, a usage message is
    printed and the program exits with status 1.  (The original
    docstring said "two parameters", contradicting the check below.)
    '''
    if len(sys.argv) != 4:
        print("Usage: %s [infilename] [outfilename] [url]" % sys.argv[0])
        sys.exit(1)
    return sys.argv[1], sys.argv[2], sys.argv[3]
def main():
    '''Entry point: parse the command line, count the matching IPs,
    and write the report file.'''
    in_name, out_name, url = parse_cmd_line_args()
    write_ips(out_name, read_ips(in_name, url))


if __name__ == "__main__":
    main()
恕我直言,如果原始的post被引用也会有所帮助。
恕我直言,你应该留下评论。
答案 1 :(得分:0)
您的问题迫切需要使用关系数据库。
使用数据库可以构建一些查询,例如“我从每个URL获得了多少次点击?”像SELECT ip, COUNT(ip) as hits FROM requests GROUP BY ip
之类的SQL查询。然后,数据库将负责循环数据并对事物进行计数。
使用下面给出的内存中SQLite数据库的完整解决方案。我测试了这个并且它有效。 'logfile.txt'应该是您在上面示例中提供的格式的文件。
编辑:修改为适应未精确指定的数据格式——现在唯一的要求是每行必须至少包含七个以空格分隔的字段,其中第一个字段必须是点分四段格式的 IP,第七个字段必须是以 '/' 开头的路径。
(注意使用防御性编程技术 - 检查您获得的数据是否与您期望的样式相同,如果数据格式错误则会引发错误。这可以防止错误数据导致您的整个程序后来爆炸了。)
import os
import re
import sqlite3

# Load the Apache-style log into an in-memory SQLite table so hit counts
# can be computed with SQL GROUP BY queries instead of hand-rolled loops.
db = sqlite3.connect(':memory:')  # create temporary SQLite database in memory
db.execute("""
    CREATE TABLE requests (
        ip TEXT,
        url TEXT
    )
""")

# The with-block closes the log file deterministically (the original
# handle was never closed).
with open('logfile.txt', 'r') as fh:
    for line in fh:
        line_split = line.split()
        if len(line_split) < 7:
            raise ValueError("Not enough fields - need at least seven.")
        ip = line_split[0]
        url = line_split[6]
        # Defensive check: the first column must look like a dotted-quad IP.
        if re.match(r'\d+\.\d+\.\d+\.\d+', ip) is None:
            errmsg = "The value %s found in the first column was not an IP address." % ip
            raise ValueError(errmsg)
        # Defensive check: the 7th column must be a path starting with '/'.
        if not url.startswith("/"):
            errmsg = "The value %s found in the 7th column was not a path beginning with /" % url
            raise ValueError(errmsg)
        # Parameterized INSERT -- never build SQL by string concatenation.
        db.execute("INSERT INTO requests VALUES (?,?)", (ip, url))

db.commit()  # save data

# Print what's in the database.
print("\nData in the database\n")
for row in db.execute("SELECT * FROM requests"):
    print(row)  # original used the Python 2 'print row' statement

# Count hits from each IP.
print("\nNumber of hits from each IP\n")
results = db.execute("""
    SELECT ip, COUNT(ip) AS hits
    FROM requests
    GROUP BY ip""")
for row in results:
    print(row)

# Count hits from each IP for one particular URL.  The original printed
# the leftover loop variable 'url' (the last log line's path) in the
# header instead of the query's actual target -- fixed to target_url.
target_url = '/mysidebars/newtab.html'
print("\nNumber of hits from each IP for url %s" % target_url)
results = db.execute("""
    SELECT ip, COUNT(ip) AS hits
    FROM requests
    WHERE url=?
    GROUP BY ip
""", [target_url])
for row in results:
    print(row)
输出结果为:
Data in the database
(u'220.227.40.118', u'/mysidebars/newtab.html')
(u'220.227.40.118', u'/hrefadd.xml')
(u'59.95.13.217', u'/dbupdates2.xml')
(u'111.92.9.222', u'/mysidebars/newtab.html')
(u'120.56.236.46', u'/hrefadd.xml')
(u'49.138.106.21', u'/add.txt')
(u'117.195.185.130', u'/mysidebars/newtab.html')
(u'122.160.166.220', u'/mysidebars/newtab.html')
(u'117.214.20.28', u'/welcome.html')
(u'117.18.231.5', u'/mysidebars/newtab.html')
(u'117.18.231.5', u'/mysidebars/newtab.html')
(u'122.169.136.211', u'/mysidebars/newtab.html')
(u'203.217.145.10', u'/mysidebars/newtab.html')
(u'117.18.231.5', u'/hrefadd.xml')
(u'59.95.13.217', u'/dbupdates2.xml')
(u'203.217.145.10', u'/mysidebars/newtab.html')
(u'117.206.70.4', u'/mysidebars/newtab.html')
(u'117.214.20.28', u'/css/epic.css')
(u'117.206.70.4', u'/add.txt')
(u'117.206.70.4', u'/hrefadd.xml')
(u'118.97.38.130', u'/mysidebars/newtab.html')
(u'117.214.20.28', u'/js/flash_detect_min.js')
(u'117.214.20.28', u'/images/home-page-bottom.jpg')
(u'117.214.20.28', u'/images/Facebook_Like.png')
(u'117.214.20.28', u'/images/Twitter_Follow.png')
(u'117.214.20.28', u'/images/home-page-top.jpg')
(u'49.138.106.21', u'/dbupdates2.xml')
(u'117.18.231.5', u'/mysidebars/newtab.html')
(u'117.18.231.5', u'/hrefadd.xml')
(u'120.61.182.186', u'/mysidebars/newtab.html')
Number of hits from each IP
(u'111.92.9.222', 1)
(u'117.18.231.5', 5)
(u'117.195.185.130', 1)
(u'117.206.70.4', 3)
(u'117.214.20.28', 7)
(u'118.97.38.130', 1)
(u'120.56.236.46', 1)
(u'120.61.182.186', 1)
(u'122.160.166.220', 1)
(u'122.169.136.211', 1)
(u'203.217.145.10', 2)
(u'220.227.40.118', 2)
(u'49.138.106.21', 2)
(u'59.95.13.217', 2)
Number of hits from each IP for url /mysidebars/newtab.html
(u'111.92.9.222', 1)
(u'117.18.231.5', 3)
(u'117.195.185.130', 1)
(u'117.206.70.4', 1)
(u'118.97.38.130', 1)
(u'120.61.182.186', 1)
(u'122.160.166.220', 1)
(u'122.169.136.211', 1)
(u'203.217.145.10', 2)
(u'220.227.40.118', 1)
旁注:您现有的代码不是解决此问题的好方法(SQL 更适合处理“表格”数据)。但如果您出于其他目的需要统计重复值的出现次数,标准库中的 collections.Counter 比 increase_count() 函数更易用也更快。
答案 2 :(得分:0)
您可以使用字典词典,而不是使用数据库(从长远来看可能是更好的解决方案)。
urls = {}


def increase_path_count(dict, path, ip_addr):
    '''Count a hit for *ip_addr* under *path* in the nested dictionary,
    creating the per-path inner dict on first use.'''
    # NOTE(review): the parameter name shadows the builtin ``dict``;
    # kept unchanged for interface compatibility.
    increase_count(dict.setdefault(path, {}), ip_addr)
修改强>
您必须解析日志文件的实际内容才能获取路径。这可以使用regular expression模块完成。一个好的正则表达式可能是这样的:
'GET (?P<path>/[\w.]+)'
由于日志文件中只有路径,因此需要从命令行参数中的URL中提取路径。这可以使用urlparse模块完成。
修改2
import re
# ....
def read_ips_and_paths(infilename, url):
    '''Read the IP addresses and request paths from the log file.

    Returns a dict mapping path -> {ip_addr: count}.  Blank lines are
    skipped.  Uses ``open()`` in a ``with`` block instead of the
    Python 2-only ``file()`` builtin, so the handle is always closed.
    '''
    # Compile once instead of re-parsing the pattern on every line.
    path_pattern = re.compile(r'GET (?P<path>/[\w.]+)')
    res_dict = {}
    with open(infilename) as log_file:
        for line in log_file:
            if line.isspace():
                continue
            # Get the ip address for the log entry.
            ip_addr = extract_ip(line)
            # Get the path from the log entry.
            # NOTE(review): like the original, a line with no matching
            # "GET /..." request makes match None and raises
            # AttributeError -- confirm malformed lines cannot occur.
            match = path_pattern.search(line)
            path = match.group('path')
            increase_path_count(res_dict, path, ip_addr)
    return res_dict
现在,当您想要获取特定路径的所有IP地址和计数时,可以使用urlparse
来获取从命令行提供的URL的路径部分:
# The urlparse module was renamed to urllib.parse in Python 3; this
# import works on both versions.
try:
    from urllib.parse import urlparse  # Python 3
except ImportError:
    from urlparse import urlparse  # Python 2
# ....
url_path = urlparse(complete_url).path
不是您使用路径打印请求的数据:
# Print each (ip, count) pair for the requested path.  The original used
# the Python 2 'print' statement, which is a syntax error on Python 3;
# tuple unpacking replaces the opaque i[0]/i[1] indexing.
for ip_addr, count in url_dict[url_path].items():
    print("ip address: %r - %d" % (ip_addr, count))