如何让这个python程序更快地读取大文本文件?我的代码花了将近五分钟来读取文本文件,但我需要它来更快地完成这项工作。我认为我的算法不在O(n)中。
一些示例数据(actual data是470K +行):
frameId:0
我的代码:
webNavigation.onBeforeNavigate -> arg[0]= {"frameId":0,"parentFrameId":-1,"processId":-1,"tabId":411,"timeStamp":1500401223978.314,"url":"http://www.example.com/"}
webRequest.onBeforeRequest -> arg[0]= {"frameId":0,"method":"GET","parentFrameId":-1,"requestId":"260870","tabId":411,"timeStamp":1500401223979.044,"type":"main_frame","url":"http://www.example.com/"}
webRequest.onBeforeSendHeaders -> arg[0]= {"frameId":0,"method":"GET","parentFrameId":-1,"requestHeaders":[{"name":"Upgrade-Insecure-Requests","value":"1"},{"name":"User-Agent","value":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"},{"name":"Accept","value":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"},{"name":"Accept-Encoding","value":"gzip, deflate"},{"name":"Accept-Language","value":"en-US,en;q=0.8"}],"requestId":"260870","tabId":411,"timeStamp":1500401223979.3242,"type":"main_frame","url":"http://www.example.com/"}
webRequest.onSendHeaders -> arg[0]= {"frameId":0,"method":"GET","parentFrameId":-1,"requestHeaders":[{"name":"Upgrade-Insecure-Requests","value":"1"},{"name":"User-Agent","value":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"},{"name":"Accept","value":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"},{"name":"Accept-Encoding","value":"gzip, deflate"},{"name":"Accept-Language","value":"en-US,en;q=0.8"}],"requestId":"260870","tabId":411,"timeStamp":1500401223979.538,"type":"main_frame","url":"http://www.example.com/"}
webRequest.onHeadersReceived -> arg[0]= {"frameId":0,"method":"GET","parentFrameId":-1,"requestId":"260870","responseHeaders":[{"name":"Content-Encoding","value":"gzip"},{"name":"Accept-Ranges","value":"bytes"},{"name":"Cache-Control","value":"max-age=604800"},{"name":"Content-Type","value":"text/html"},{"name":"Date","value":"Tue, 18 Jul 2017 18:07:03 GMT"},{"name":"Etag","value":"\"359670651\""},{"name":"Expires","value":"Tue, 25 Jul 2017 18:07:03 GMT"},{"name":"Last-Modified","value":"Fri, 09 Aug 2013 23:54:35 GMT"},{"name":"Server","value":"ECS (rhv/818F)"},{"name":"Vary","value":"Accept-Encoding"},{"name":"X-Cache","value":"HIT"},{"name":"Content-Length","value":"606"}],"statusCode":200,"statusLine":"HTTP/1.1 200 OK","tabId":411,"timeStamp":1500401224072.296,"type":"main_frame","url":"http://www.example.com/"}
---^^^^^^^^^^^^^^^^^^^^^^^^^-Earliest tabs.executeScript() injection is in the handler for the webRequest.onHeadersReceived event.
webRequest.onResponseStarted -> arg[0]= {"frameId":0,"fromCache":false,"ip":"93.184.216.34","method":"GET","parentFrameId":-1,"requestId":"260870","responseHeaders":[{"name":"Content-Encoding","value":"gzip"},{"name":"Accept-Ranges","value":"bytes"},{"name":"Cache-Control","value":"max-age=604800"},{"name":"Content-Type","value":"text/html"},{"name":"Date","value":"Tue, 18 Jul 2017 18:07:03 GMT"},{"name":"Etag","value":"\"359670651\""},{"name":"Expires","value":"Tue, 25 Jul 2017 18:07:03 GMT"},{"name":"Last-Modified","value":"Fri, 09 Aug 2013 23:54:35 GMT"},{"name":"Server","value":"ECS (rhv/818F)"},{"name":"Vary","value":"Accept-Encoding"},{"name":"X-Cache","value":"HIT"},{"name":"Content-Length","value":"606"}],"statusCode":200,"statusLine":"HTTP/1.1 200 OK","tabId":411,"timeStamp":1500401224072.5032,"type":"main_frame","url":"http://www.example.com/"}
webRequest.onCompleted -> arg[0]= {"frameId":0,"fromCache":false,"ip":"93.184.216.34","method":"GET","parentFrameId":-1,"requestId":"260870","responseHeaders":[{"name":"Content-Encoding","value":"gzip"},{"name":"Accept-Ranges","value":"bytes"},{"name":"Cache-Control","value":"max-age=604800"},{"name":"Content-Type","value":"text/html"},{"name":"Date","value":"Tue, 18 Jul 2017 18:07:03 GMT"},{"name":"Etag","value":"\"359670651\""},{"name":"Expires","value":"Tue, 25 Jul 2017 18:07:03 GMT"},{"name":"Last-Modified","value":"Fri, 09 Aug 2013 23:54:35 GMT"},{"name":"Server","value":"ECS (rhv/818F)"},{"name":"Vary","value":"Accept-Encoding"},{"name":"X-Cache","value":"HIT"},{"name":"Content-Length","value":"606"}],"statusCode":200,"statusLine":"HTTP/1.1 200 OK","tabId":411,"timeStamp":1500401224074.0261,"type":"main_frame","url":"http://www.example.com/"}
tabs.onUpdated -> arg[0]= 411 :: arg[1]= {"status":"loading","url":"http://www.example.com/"} :: arg[2]= {"active":true,"audible":false,"autoDiscardable":true,"discarded":false,"height":902,"highlighted":true,"id":411,"incognito":false,"index":1,"mutedInfo":{"muted":false},"pinned":false,"selected":true,"status":"loading","title":"www.example.com","url":"http://www.example.com/","width":1282,"windowId":10}
tabs.onZoomChange -> arg[0]= {"newZoomFactor":1,"oldZoomFactor":1,"tabId":411,"zoomSettings":{"mode":"automatic","scope":"per-origin"}}
webNavigation.onCommitted -> arg[0]= {"frameId":0,"processId":107,"tabId":411,"timeStamp":1500401224079.4019,"transitionQualifiers":[],"transitionType":"reload","url":"http://www.example.com/"}
--->>Here is where you can tell it's a reload --------------------------------------------------------------------------------------------------^^^^^^^^^^^^^^^^^^^^^^^^^
history.onVisited -> arg[0]= {"id":"42","lastVisitTime":1500401224077.579,"title":"Example Domain","typedCount":1,"url":"http://www.example.com/","visitCount":12}
tabs.onUpdated -> arg[0]= 411 :: arg[1]= {"title":"Example Domain"} :: arg[2]= {"active":true,"audible":false,"autoDiscardable":true,"discarded":false,"height":902,"highlighted":true,"id":411,"incognito":false,"index":1,"mutedInfo":{"muted":false},"pinned":false,"selected":true,"status":"loading","title":"Example Domain","url":"http://www.example.com/","width":1282,"windowId":10}
webNavigation.onDOMContentLoaded -> arg[0]= {"frameId":0,"processId":107,"tabId":411,"timeStamp":1500401224093.404,"url":"http://www.example.com/"}
webNavigation.onCompleted -> arg[0]= {"frameId":0,"processId":107,"tabId":411,"timeStamp":1500401224094.768,"url":"http://www.example.com/"}
tabs.onUpdated -> arg[0]= 411 :: arg[1]= {"status":"complete"} :: arg[2]= {"active":true,"audible":false,"autoDiscardable":true,"discarded":false,"height":902,"highlighted":true,"id":411,"incognito":false,"index":1,"mutedInfo":{"muted":false},"pinned":false,"selected":true,"status":"complete","title":"Example Domain","url":"http://www.example.com/","width":1282,"windowId":10}
答案 0 :(得分:0)
使用 splitlines():
将单列文件读入列表def load_words():
with open("words.txt", 'r') as f:
wordlist = f.read().splitlines()
return wordlist
您可以使用timeit
:
from timeit import timeit
timeit('load_words()', setup=setup, number=3)
# Output: 0.1708553659846075 seconds
至于如何实现看起来像模糊匹配算法的东西,你可以试试 fuzzywuzzy:
# pip install fuzzywuzzy[speedup]
from fuzzywuzzy import process
wordlist = load_words()
process.extract("eabauea", wordlist, limit=10)
输出:
[('-a', 90), ('A', 90), ('A.', 90), ('a', 90), ("a'", 90),
('a-', 90), ('a.', 90), ('AB', 90), ('Ab', 90), ('ab', 90)]
如果您过滤较长的匹配项,结果会更有趣:
results = process.extract("eabauea", wordlist, limit=100)
[x for x in results if len(x[0]) > 4]
输出:
[('abaue', 83),
('Ababua', 77),
('Abatua', 77),
('Bauera', 77),
('baulea', 77),
('abattue', 71),
('abature', 71),
('ablaqueate', 71),
('bauleah', 71),
('ebauche', 71),
('habaera', 71),
('reabuse', 71),
('Sabaean', 71),
('sabaean', 71),
('Zabaean', 71),
('-acea', 68)]
但是有470K +行需要一段时间:
timeit('process.extract("eabauea", wordlist, limit=3)', setup=setup, number=3)
# Output: 384.97334043699084 seconds