在 Python 中从 txt 文件读取 URL 进行处理，并将网页内容以 txt 格式输出

时间:2017-02-02 05:09:52

标签: python web-scraping spyder

我有一个 Python 爬虫（spider）脚本，它只负责抓取 URL，但一次只能接受一个 URL 作为输入。我有一个包含大量域名的 txt 输入文件，想要逐个处理其中的域名，并将输出保存到 txt 文件。

这是我的python脚本

#include <pcap.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <netinet/ip.h>
#include <unistd.h>

/* Ethernet headers are always exactly 14 bytes; used as the offset of the
 * IP header inside a captured frame. */
#define SIZE_ETHERNET 14

/* Ethernet addresses are 6 bytes */
#define ETHER_ADDR_LEN  6

/* Size in bytes of the payload area of struct pseudo_tcp. */
#define PACKET_LEN   1500
/* Size in bytes of the scratch buffer spoofReply() builds its reply in. */
#define BUFSIZE  1500

/* Ethernet header: the first SIZE_ETHERNET bytes of every captured frame. */
struct ethheader {
    u_char  ether_dhost[ETHER_ADDR_LEN];    /* destination host address */
    u_char  ether_shost[ETHER_ADDR_LEN];    /* source host address */
    u_short ether_type;                     /* payload type (IP? ARP? RARP? etc.); gotPacket() only accepts the IP value 0x0800 */
};


/* IP Header (fixed part, without options) */
struct ipheader {
    // NOTE(review): bit-field ordering is implementation-defined; declaring
    // iph_ihl before iph_ver presumably targets little-endian hosts -- confirm.
    unsigned char      iph_ihl:4, iph_ver:4; //IP header length (callers multiply by 4 to get bytes) & version
    unsigned char      iph_tos; //Type of service
    unsigned short int iph_len; //IP packet length (both data and header); callers read it through ntohs()
    unsigned short int iph_ident; //Identification
    unsigned short int iph_flag:3, iph_offset:13; //Flags and fragmentation offset
    unsigned char      iph_ttl; //Time to live
    unsigned char      iph_protocol; //Type of the upper-level protocol (compared against IPPROTO_ICMP)
    unsigned short int iph_chksum; //IP datagram checksum
    struct  in_addr    iph_sourceip; //IP source address (in network byte order)
    struct  in_addr    iph_destip;//IP destination address (in network byte order)
};

/* ICMP Header, as laid out for echo request/reply messages */
struct icmpheader {
    unsigned char icmp_type; //ICMP message type (spoofReply() treats 8 as a request and writes 0 for the reply)
    unsigned char icmp_code; //Error code
    unsigned short int icmp_chksum; //Checksum for ICMP header and data
    unsigned short int icmp_id; //Used in echo request/reply to identify the request
    unsigned short int icmp_seq;//Identifies the sequence of echo messages,
                    //if more than one is sent.
};


/* TCP Header (fixed part, without options), with helper macros for the
 * data-offset nibble and the individual flag bits. */
struct tcpheader {
    u_short tcp_sport;               /* source port */
    u_short tcp_dport;               /* destination port */
    u_int   tcp_seq;                 /* sequence number */
    u_int   tcp_ack;                 /* acknowledgement number */
    u_char  tcp_offx2;               /* data offset, rsvd */
/* Extract the data offset, stored in the upper four bits of tcp_offx2. */
#define TH_OFF(th)      (((th)->tcp_offx2 & 0xf0) >> 4)
    u_char  tcp_flags;               /* bitwise OR of the TH_* values below */
#define TH_FIN  0x01
#define TH_SYN  0x02
#define TH_RST  0x04
#define TH_PUSH 0x08
#define TH_ACK  0x10
#define TH_URG  0x20
#define TH_ECE  0x40
#define TH_CWR  0x80
/* Mask of the flag bits listed above; note that TH_PUSH is not part of it. */
#define TH_FLAGS        (TH_FIN|TH_SYN|TH_RST|TH_ACK|TH_URG|TH_ECE|TH_CWR)
    u_short tcp_win;                 /* window */
    u_short tcp_sum;                 /* checksum */
    u_short tcp_urp;                 /* urgent pointer */
};


/* UDP Header: four 16-bit fields. */
struct udpheader
{
  u_int16_t udp_sport;           /* source port */
  u_int16_t udp_dport;           /* destination port */
  u_int16_t udp_ulen;            /* udp length */
  u_int16_t udp_sum;             /* udp checksum */
};

/* Scratch layout used by calculate_tcp_checksum(): the pseudo-header fields
 * followed by the TCP header and up to PACKET_LEN bytes of payload.
 * NOTE(review): the checksum code adds 12 for the leading fields, which
 * assumes `unsigned` is 4 bytes and no padding is inserted -- confirm. */
struct pseudo_tcp
{
        unsigned saddr, daddr;      /* source and destination IP addresses */
        unsigned char mbz;          /* always written as 0 */
        unsigned char ptcl;         /* protocol number (IPPROTO_TCP) */
        unsigned short tcpl;        /* TCP length, stored via htons() */
        struct tcpheader tcp;       /* copy of the TCP header */
        char payload[PACKET_LEN];   /* copy of the TCP payload */
};

// DNS layer header's structure: six 16-bit fields. The count names match the
// RFC 1035 header fields. The struct is not referenced by any function in
// this file.
struct dnsheader {
    unsigned short int query_id; // query identifier
    unsigned short int flags;    // flag bits
    unsigned short int QDCOUNT;  // presumably the number of question entries
    unsigned short int ANCOUNT;  // presumably the number of answer records
    unsigned short int NSCOUNT;  // presumably the number of authority records
    unsigned short int ARCOUNT;  // presumably the number of additional records
};

unsigned short in_cksum(unsigned short *buf,int length)
{
        unsigned short *w = buf;
        int nleft = length;
        int sum = 0;
        unsigned short temp=0;

        /*
        * The algorithm uses a 32 bit accumulator (sum), adds
        * sequential 16 bit words to it, and at the end, folds back all the
        * carry bits from the top 16 bits into the lower 16 bits.
        */
        while (nleft > 1)  {
                sum += *w++;
                nleft -= 2;
        }

        /* treat the odd byte at the end, if any */
        if (nleft == 1) {
                *(u_char *)(&temp) = *(u_char *)w ;
                sum += temp;
        }

        /* add back carry outs from top 16 bits to low 16 bits */
        sum = (sum >> 16) + (sum & 0xffff);     // add hi 16 to low 16 
        sum += (sum >> 16);                     // add carry 
    return (unsigned short)(~sum);
}



/****************************************************************************
  TCP checksum is calculated on the pseudo header, which includes the
  TCP header and data, plus some part of the IP header. Therefore,
  we need to construct the pseudo header first.

  ip: packet whose TCP segment starts sizeof(struct ipheader) bytes after
      the start of the IP header (IP options are not accounted for).

  Returns the value produced by in_cksum() over the pseudo header, or 0
  when the length in the IP header is smaller than the IP header itself or
  too large for the scratch buffer.
*****************************************************************************/
unsigned short calculate_tcp_checksum(struct ipheader *ip)
{
   struct tcpheader *tcp = (struct tcpheader *)((u_char *)ip +
                               sizeof(struct ipheader));

   int tcp_len = (int)ntohs(ip->iph_len) - (int)sizeof(struct ipheader);

   /* pseudo tcp header for the checksum computation */
   struct pseudo_tcp p_tcp;

   /* The length comes from the packet itself: refuse anything that would
      make memcpy() run with a negative size or past the end of p_tcp. */
   if (tcp_len < 0 ||
       (size_t)tcp_len > sizeof(p_tcp.tcp) + sizeof(p_tcp.payload)) {
      return 0;
   }

   memset(&p_tcp, 0x0, sizeof(struct pseudo_tcp));

   p_tcp.saddr  = ip->iph_sourceip.s_addr;
   p_tcp.daddr  = ip->iph_destip.s_addr;
   p_tcp.mbz    = 0;
   p_tcp.ptcl   = IPPROTO_TCP;
   p_tcp.tcpl   = htons(tcp_len);
   /* header and payload are contiguous in p_tcp, so one copy fills both */
   memcpy(&p_tcp.tcp, tcp, tcp_len);

   /* 12 = bytes occupied by saddr, daddr, mbz, ptcl and tcpl */
   return  (unsigned short)in_cksum((unsigned short *)&p_tcp, tcp_len + 12);
}




/****************************************************************************
Function to actually send the spoofed IP reply.

ip: complete IP packet (header included); ntohs(ip->iph_len) bytes are
    sent to ip->iph_destip through a raw socket with IP_HDRINCL enabled.

The packet is sent twice. Failures of socket(), setsockopt() or sendto()
are reported with perror(); the success message is printed only for sends
that actually succeeded.
*****************************************************************************/
void send_raw_packet (struct ipheader* ip)
{
    struct sockaddr_in dest_info;
    int enable = 1;
    int n;
    int sock;

    //create a raw network socket and set its options.
    sock = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (sock < 0) {
        perror("socket");
        return;
    }
    if (setsockopt(sock, IPPROTO_IP, IP_HDRINCL, &enable, sizeof(enable)) < 0) {
        perror("setsockopt");
        close(sock);
        return;
    }

    //provide needed information about destination
    memset(&dest_info, 0, sizeof(dest_info)); //no stack garbage in the unused fields
    dest_info.sin_family = AF_INET;
    dest_info.sin_addr = ip->iph_destip;

    //send out the packet
    printf("Attmpting to send a spoofed ICMP packet! \n");
    printf("......................................\n");
    /*print the source and destination IP Addresses*/
    /* inet_ntoa() returns a static buffer, hence one call per printf */
    printf("            From: %s\n", inet_ntoa(ip->iph_sourceip));
    printf("            To: %s\n", inet_ntoa(ip->iph_destip));

    printf("......................................\n");
    for (n = 0; n < 2; n++) {
        if (sendto(sock, ip, ntohs(ip->iph_len), 0,
                   (struct sockaddr *)&dest_info, sizeof(dest_info)) < 0) {
            perror("sendto");
            continue;
        }
        printf("Oooooo....a spoofed ICMP packet has successfully been sent! \n");
    }
    close(sock);
}






/****************************************************************************
Function designed to spoof the ICMP reply
*****************************************************************************/
void spoofReply(struct ipheader* ip)
{


int ip_header_len = ip->iph_ihl * 4;
const char buffer[BUFSIZE];

struct icmpheader* icmp = (struct icmpheader *) ((u_char *)ip + ip_header_len);
if (icmp->icmp_type != 8) //this is not a reply, this is a request
{
printf("Packet received was not an ICMP request. Nothing sent.\n");
return;
}


//copy the original packet to a buffer
memset((char*)buffer, 0, BUFSIZE);
memset((char*)buffer, (int) ip, ntohs(ip->iph_len));
struct ipheader   * newip = (struct ipheader *) buffer;    //check this line for an error!!! 
struct icmpheader * newicmp = (struct icmpheader *) ((u_char *)buffer + ip_header_len);

//new IP construction
newip->iph_sourceip.s_addr = ip->iph_destip.s_addr;
newip->iph_destip.s_addr = ip->iph_sourceip.s_addr;
newip->iph_ttl = 20;
newip->iph_protocol = IPPROTO_ICMP;

//fill ICMP info
newicmp->icmp_type = 0;

//checksum 
newicmp->icmp_chksum = 0;
newicmp->icmp_chksum = in_cksum((unsigned short *)newicmp, ntohs(ip->iph_len) - ip_header_len);
printf("Packet is for sure an ICMP Request. Lets send a raw packet!\n");
send_raw_packet(newip);
}







/****************************************************************************
Packet Handler- This function handles incoming packets and checks for ICMP
protocol. Calls spoofReply() if the packet is an ICMP packet; spoofReply()
itself decides whether it is an echo request that deserves a reply.

args:   user pointer passed to pcap_loop() (unused).
header: capture metadata; caplen is the number of bytes available in packet.
packet: captured frame, starting at the Ethernet header.

Frames that are not IP, or that were captured too short to hold the headers
being read, are skipped.
*****************************************************************************/
void gotPacket (u_char *args, const struct pcap_pkthdr *header, const u_char *packet)
{
    const struct ethheader *eth = (const struct ethheader *) packet;
    struct ipheader *ip;

    (void)args;

    //never read headers that were not captured
    if (header->caplen < SIZE_ETHERNET + sizeof(struct ipheader)) return;

    if (ntohs(eth->ether_type) != 0x0800) return; //this is not an IP packet

    //spoofReply() takes a non-const pointer, hence the cast
    ip = (struct ipheader *)(packet + SIZE_ETHERNET);

    printf("......................................\n");
    /*print the source and destination IP Addresses*/
    printf("            From: %s\n", inet_ntoa(ip->iph_sourceip));
    printf("            To: %s\n", inet_ntoa(ip->iph_destip));

    if (ip->iph_protocol == IPPROTO_ICMP)
    {
        printf("Whoa! An ICMP packet has been found! Lets check it out.\n");
        //spoofReply() copies ntohs(iph_len) bytes, so all of them must be present
        if (header->caplen < SIZE_ETHERNET + (size_t)ntohs(ip->iph_len))
        {
            printf("ICMP packet was truncated by the capture. Nothing sent.\n");
            return;
        }
        spoofReply(ip);
    }
}








int main() 
{
pcap_t *handle;
char errbuf[PCAP_ERRBUF_SIZE];
struct bpf_program fp;
char filter_exp[] = ""; 
bpf_u_int32 net;


//Print name of program and its filter:
printf("Sniff...Sniff...Sniff...\n");
printf("Cole Sniffer Started. Filtering %s \n", filter_exp);

//open the live session using the pcap_open function
handle = pcap_open_live("eth13", BUFSIZ, 1, 1000, errbuf);

//set the filter for whichever type of traffic you would like to receive
pcap_compile(handle, &fp, filter_exp, 0, net);

//set filter (continued) 
pcap_setfilter(handle, &fp);
pcap_loop(handle, 100, gotPacket, NULL); // captures the packets
pcap_close(handle);
return 0; 
};

如果可能的话，也请给出详细（verbose）模式下的输出方式。

1 个答案:

答案 0 :(得分:0)

您可以运行脚本并将输出重定向到文件:

scrapy crawl google_parser  > output.txt

至于如何输入这些行（URL）：您可以从标准输入中读取它们：

<strong>google_parser.py</strong>

import sys
from urllib.parse import urlparse
from scrapy import Spider, Request, spidermiddlewares


class MySpider(Spider):
    """Crawl the URLs given on standard input and follow the links they contain.

    One URL is expected per line of stdin. The hostnames of those URLs become
    ``allowed_domains``. Every followed link is printed to stdout.
    """

    name = 'google_parser'
    allowed_domains = []

    def start_requests(self):
        """Yield one request per non-blank line read from standard input."""
        # Blank lines are skipped: Request('') raises ValueError.
        urls = [line.strip() for line in sys.stdin if line.strip()]
        # URLs without a network location have no hostname; leave them out
        # instead of storing None in allowed_domains.
        hostnames = (urlparse(url).hostname for url in urls)
        self.allowed_domains = [host for host in hostnames if host]
        # Refresh the regex cache for `allowed_domains`
        # thx to - http://stackoverflow.com/questions/5161815/dynamically-add-to-allowed-domains-in-a-scrapy-spider
        # NOTE(review): this walks Scrapy internals and may depend on the
        # installed Scrapy version -- confirm before upgrading.
        for mw in self.crawler.engine.scraper.spidermw.middlewares:
            if isinstance(mw, spidermiddlewares.offsite.OffsiteMiddleware):
                mw.spider_opened(self)
        for url in urls:
            yield Request(url)

    def parse(self, response):
        """Print and follow every http(s) link found in ``response``."""
        for href in response.xpath('//a/@href').extract():
            new_url = response.urljoin(href)
            # mailto:, javascript:, tel: ... links cannot be requested and
            # would make Request() raise ValueError, aborting this callback.
            if urlparse(new_url).scheme not in ('http', 'https'):
                continue
            print(new_url)
            yield Request(new_url)

例如:

cat urls.txt | scrapy crawl google_parser

输出:

['http://www.com', 'http://www.me',]

你也可以直接输入文件:

scrapy crawl google_parser < urls.txt 

最后，可以同时重定向输入和输出：

scrapy crawl google_parser < urls.txt > output.txt

这种方式让您可以非常灵活地通过管道把多个程序串联起来，例如：获取输入列表，过滤后只保留符合某些条件的 URL，取前 N 个，再传递给您的程序：

cat urls.txt | grep '/script.php?' | head -5 | scrapy crawl google_parser > output.txt