在Python套接字中缓存HTTP GET REQUEST

时间:2014-12-07 12:00:19

标签: python sockets http server

我正在使用套接字创建代理服务器。当请求的文件不在我当前的目录(缓存)中时,我对源服务器(即www)进行了http get请求,并将其缓存以供日后使用。

我的代码存在的问题是,每当我从www获取资源时,我都会缓存它,但文件的内容始终是"永久移动"。

所以情况是这样的:用户在浏览器中输入 "localhost:8080/stackoverflow.com" 来请求 "stackoverflow.com",浏览器会正确返回页面。当用户第二次在浏览器中输入 "localhost:8080/stackoverflow.com" 时,浏览器返回的页面却提示 stackoverflow.com 已被永久移动。

以下是执行http get请求和缓存的方法的代码:

    @staticmethod
    def find_on_www(conn, requested_file):
        """Fetch ``requested_file`` from its origin server, relay it and cache it.

        :param conn: client socket; every line of the origin's response is
            sent to it and it is closed afterwards
        :param requested_file: request target without the leading "/"; it is
            used as the URL to fetch and as the cache file name

        NOTE: this excerpt stops before the ``except`` clause that belongs to
        the ``try`` below.
        """
        try:
            # Create a socket on the proxy server
            print 'Creating socket on proxy server'
            c = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

            # Removes the first "www." found anywhere in the name
            host_name = requested_file.replace("www.","",1)
            print 'Host Name: ', host_name

            # Connect to the socket to port 80
            c.connect((host_name, 80))
            print 'Socket connected to port 80 of the host'

            # Wrap the socket in an unbuffered file object and ask port 80
            # for the file requested by the client (HTTP/1.0, absolute URL,
            # no further request headers)
            file_object = c.makefile('r', 0)
            file_object.write("GET " + "http://" + requested_file + " HTTP/1.0\n\n")

            # Read the complete response (status line, headers and body)
            # into the buffer, one list entry per line
            buff = file_object.readlines()

            # Create a new file in the cache for the requested file.
            # Every response line, the status line and headers included,
            # is written both to the cache file and to the client socket
            temp_file = open("./" + requested_file, "wb")
            for i in range(0, len(buff)):
                temp_file.write(buff[i])
                conn.send(buff[i])

            # Only the client connection is closed here; temp_file and c
            # are not closed in this excerpt
            conn.close()

如果有人感兴趣的话,这是我的其余代码:

import socket       # Socket programming
import signal       # To shut down server on ctrl+c
import time         # Current time
import os           # To get the last-modified
import mimetypes    # To guess the type of requested file
import sys          # To exit the program
from threading import Thread


def generate_header_lines(code, modified, length, mimetype):
    """ Generates the header lines for the response message

    :param code: HTTP status code. 200 and 404 get their usual reason
        phrase and server name; any other code gets a status line with an
        empty reason phrase so the response is still well-formed
    :param modified: value of the Last-Modified header; only emitted for
        code 200 and only when non-empty
    :param length: size of the response body in bytes (Content-Length)
    :param mimetype: value of the Content-Type header
    :return: the status line and headers, terminated by the empty line that
        separates headers from the body
    """
    # HTTP header lines are terminated by CRLF, and HTTP dates are
    # expressed in GMT
    eol = '\r\n'
    date = time.strftime("%a, %d %b %Y %H:%M:%S GMT", time.gmtime())

    if code == 200:
        lines = [
            'HTTP/1.1 200 OK',
            'Date: ' + date,
            'Server: Proxy-Server-Thomas',
        ]
        if modified:
            # Date of the last modification to the file
            lines.append('Last-Modified: ' + modified)
    elif code == 404:
        lines = [
            'HTTP/1.1 404 Not Found',
            'Date: ' + date,
            'Server: Web-Server-Thomas',
        ]
    else:
        # Unknown code: still start with a status line (the reason phrase
        # may be empty, the space after the code may not be omitted)
        lines = [
            'HTTP/1.1 ' + str(code) + ' ',
            'Date: ' + date,
        ]

    lines.append('Content-Length: ' + str(length))
    lines.append('Content-Type: ' + mimetype)
    # Let the client know we close the connection after this response
    lines.append('Connection: close')

    # One CRLF ends the last header, the second one ends the header block
    return eol.join(lines) + eol + eol


def get_mime_type(requested_file):
    # Get the file's mimetype and encoding
    try:
        (mimetype, encoding) = mimetypes.guess_type(requested_file, True)
        if not mimetype:
            print "Mimetype found: text/html"
            return 'text/html'
        else:
            print "Mimetype found: ", mimetype
            return mimetype

    except TypeError:
        print "Mimetype found: text/html"
        return 'text/html'


class WebServer:
    def __init__(self):
        """
        Constructor
        :return:
        """
        self.host = ''      # Host for the server
        self.port = 8000    # Port for the server

        # Create socket
        self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

    def start_server(self):
        """ Starts the server
        :return:
        """
        # Bind the socket to the host and port
        self.socket.bind((self.host, self.port))

        print "Connection started on ", self.port

        # Start the main loop of the server - start handling clients
        self.main_loop()

    @staticmethod
    def shutdown():
        """ Shuts down the server """
        try:
            s.socket.close()
        except Exception as e:
            print "Something went wrong closing the socket: ", e

    def main_loop(self):
        """Main loop of the server"""
        while True:
            # Start listening
            self.socket.listen(1)

            # Wait for a client to connect
            client_socket, client_address = self.socket.accept()

            # Wait for a request from the client
            data = client_socket.recv(1024)

            t = Thread(target=self.handle_request, args=(client_socket, data))
            t.start()

            # # Handle the request from the client
            # self.handle_request(client_socket, data)

    def handle_request(self, conn, data):
        """ Handles a request from the client """
        # Decode the data
        string = bytes.decode(data)

        # Split the request
        requested_file = string.split(' ')
        # Get the method that is requested
        request_method = requested_file[0]

        if request_method == 'GET':
            # Get the part of the request that contains the name
            requested_file = requested_file[1]
            # Get the name of the file from the request
            requested_file = requested_file[1:]

            print "Searching for: ", requested_file

            try:
                # Open the file
                file_handler = open(requested_file, 'rb')
                # Get the content of the file
                response_content = file_handler.read()
                # Close the handler
                file_handler.close()

                # Get information about the file from the OS
                file_info = os.stat(requested_file)
                # Extract the last modified time from the information
                time_modified = time.ctime(file_info[8])
                # Get the time modified in seconds
                modified_seconds = os.path.getctime(requested_file)

                print "Current time: ", time.time()
                print "Modified: ", time_modified

                if (float(time.time()) - float(modified_seconds)) > 120:  # more than 2 minutes
                    print "Time outdated!"
                    #self.find_on_www(conn, requested_file)

                # Get the file's mimetype and encoding
                mimetype = get_mime_type(requested_file)

                print "Mimetype = ", mimetype

                # Create the correct header lines
                response_headers = generate_header_lines(200, time_modified, len(response_content), mimetype)

                # Create the response to the request
                server_response = response_headers.encode() + response_content

                # Send the response back to the client
                conn.send(server_response)

                # Close the connection
                conn.close()

            except IOError:  # Couldn't find the file in the cache - Go find file on www
                print "Error: " + requested_file + " not found in cache!"
                self.find_on_www(conn, requested_file)

    @staticmethod
    def find_on_www(conn, requested_file):
        try:
            # Create a socket on the proxy server
            print 'Creating socket on proxy server'
            c = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

            host_name = requested_file.replace("www.","",1)
            print 'Host Name: ', host_name

            # Connect to the socket to port 80
            c.connect((host_name, 80))
            print 'Socket connected to port 80 of the host'

            # Create a temporary file on this socket and ask port 80
            # for the file requested by the client
            file_object = c.makefile('r', 0)
            file_object.write("GET " + "http://" + requested_file + " HTTP/1.0\n\n")

            # Read the response into buffer
            buff = file_object.readlines()

            # Create a new file in the cache for the requested file.
            # Also send the response in the buffer to client socket
            # and the corresponding file in the cache
            temp_file = open("./" + requested_file, "wb")
            for i in range(0, len(buff)):
                temp_file.write(buff[i])
                conn.send(buff[i])

            conn.close()

        except Exception as e:
            # Generate a body for the file - so we don't have an empty page
            response_content = "<html><body><p>Error 404: File not found</p></body></html>"

            # Generate the correct header lines
            response_headers = generate_header_lines(404, '', len(response_content), 'text/html')

             # Create the response to the request
            server_response = response_headers.encode() + response_content

            # Send the response back to the client
            conn.send(server_response)

            # Close the connection
            conn.close()


def shutdown_server(sig, dummy):
    """Signal handler: shut the module-level server ``s`` down and leave
    the program with exit status 1.

    Both parameters are supplied by the ``signal`` module when the handler
    fires and are not used here.
    """
    s.shutdown()

    # Same effect as sys.exit(1), which raises this exception
    raise SystemExit(1)

# Shut down on ctrl+c: SIGINT is routed to shutdown_server
signal.signal(signal.SIGINT, shutdown_server)

# Create a web server. The name `s` has to stay module-level because
# shutdown_server and WebServer.shutdown refer to it directly
s = WebServer()
# Start the server: binds the socket and enters the accept loop
# (main_loop runs `while True`, so nothing after this line is reached)
s.start_server()

1 个答案:

答案 0 :(得分:1)

您的代码的问题在于:如果所请求的页面返回状态码 301(永久移动),这个状态行和响应头会被原样写入缓存文件。当页面尚未存入缓存时,代理服务器会把源服务器的响应直接转发给客户端;客户端收到重定向后会另行发出一个 GET 请求,而这个请求绕过了您的代理服务器。

第二次通过代理服务器请求该页面时,代理会从缓存中取出上一次保存的响应。这个文件里仍然带有上一次响应中的重定向状态码和标头,而您又在返回的消息最前面加上了自己的状态码 200 OK。客户端首先读到的是 200,因此意识不到您希望它重新去请求已被重定向的页面,于是只显示那个提示"页面已移动"的页面。

您需要做的是:当代理服务器必须到互联网上获取实际页面时,解析 Web 服务器返回的标头,然后根据这些标头向客户端返回正确的标头。