我有一项家庭作业,其中包括在Python中为网页实现代理缓存服务器。这是我的实现
from socket import *
import sys
def main():
#Create a server socket, bind it to a port and start listening
tcpSerSock = socket(AF_INET, SOCK_STREAM) #Initializing socket
tcpSerSock.bind(("", 8030)) #Binding socket to port
tcpSerSock.listen(5) #Listening for page requests
while True:
#Start receiving data from the client
print 'Ready to serve...'
tcpCliSock, addr = tcpSerSock.accept()
print 'Received a connection from:', addr
message = tcpCliSock.recv(1024)
print message
#Extract the filename from the given message
filename = ""
try:
filename = message.split()[1].partition("/")[2].replace("/", "")
except:
continue
fileExist = False
try: #Check whether the file exists in the cache
f = open(filename, "r")
outputdata = f.readlines()
fileExist = True
#ProxyServer finds a cache hit and generates a response message
tcpCliSock.send("HTTP/1.0 200 OK\r\n")
tcpCliSock.send("Content-Type:text/html\r\n")
for data in outputdata:
tcpCliSock.send(data)
print 'Read from cache'
except IOError: #Error handling for file not found in cache
if fileExist == False:
c = socket(AF_INET, SOCK_STREAM) #Create a socket on the proxyserver
try:
srv = getaddrinfo(filename, 80)
c.connect((filename, 80)) #https://docs.python.org/2/library/socket.html
# Create a temporary file on this socket and ask port 80 for
# the file requested by the client
fileobj = c.makefile('r', 0)
fileobj.write("GET " + "http://" + filename + " HTTP/1.0\r\n")
# Read the response into buffer
buffr = fileobj.readlines()
# Create a new file in the cache for the requested file.
# Also send the response in the buffer to client socket and the
# corresponding file in the cache
tmpFile = open(filename,"wb")
for data in buffr:
tmpFile.write(data)
tcpCliSock.send(data)
except:
print "Illegal request"
else: #File not found
print "404: File Not Found"
tcpCliSock.close() #Close the client and the server sockets
main()
我将浏览器配置为使用我的代理服务器
但我运行它的问题是,无论我尝试访问哪个网页,它都会返回初始连接的404错误,然后连接重置错误以及后续连接。我不知道为什么这样任何帮助都会非常感谢,谢谢!
答案 0 :(得分:0)
您的代码存在很多问题。
您的网址解析器非常繁琐。而不是行
filename = message.split()[1].partition("/")[2].replace("/", "")
我会用
import re
parsed_url = re.match(r'GET\s+http://(([^/]+)(.*))\sHTTP/1.*$', message)
local_path = parsed_url.group(3)
host_name = parsed_url.group(2)
filename = parsed_url.group(1)
如果你在那里遇到异常,你应该抛出一个错误,因为这是你的代理人不理解的请求(例如POST)。
将请求汇编到目标服务器时,请使用
fileobj.write("GET {object} HTTP/1.0\n".format(object=local_path))
fileobj.write("Host: {host}\n\n".format(host=host_name))
您还应该包含原始请求中的一些标题行,因为它们可以对返回的内容产生重大影响。
此外,您当前使用所有标题行缓存整个响应,因此在从缓存提供服务时不应添加自己的响应。
无论如何,您所拥有的内容并不起作用,因为无法保证您将获得200和text/html
内容。您应该检查响应代码,如果确实获得了200,则只缓存。