python文本文件读取很慢

时间:2017-02-10 04:56:07

标签: python file pattern-matching

如何让这个python程序更快地读取大文本文件?我的代码花了将近五分钟来读取文本文件,但我需要它来更快地完成这项工作。我认为我的算法不在O(n)中。

一些示例数据(actual data是470K +行):

frameId:0

我的代码:

webNavigation.onBeforeNavigate    ->  arg[0]= {"frameId":0,"parentFrameId":-1,"processId":-1,"tabId":411,"timeStamp":1500401223978.314,"url":"http://www.example.com/"}        
webRequest.onBeforeRequest        ->  arg[0]= {"frameId":0,"method":"GET","parentFrameId":-1,"requestId":"260870","tabId":411,"timeStamp":1500401223979.044,"type":"main_frame","url":"http://www.example.com/"}        
webRequest.onBeforeSendHeaders    ->  arg[0]= {"frameId":0,"method":"GET","parentFrameId":-1,"requestHeaders":[{"name":"Upgrade-Insecure-Requests","value":"1"},{"name":"User-Agent","value":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"},{"name":"Accept","value":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"},{"name":"Accept-Encoding","value":"gzip, deflate"},{"name":"Accept-Language","value":"en-US,en;q=0.8"}],"requestId":"260870","tabId":411,"timeStamp":1500401223979.3242,"type":"main_frame","url":"http://www.example.com/"}        
webRequest.onSendHeaders          ->  arg[0]= {"frameId":0,"method":"GET","parentFrameId":-1,"requestHeaders":[{"name":"Upgrade-Insecure-Requests","value":"1"},{"name":"User-Agent","value":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"},{"name":"Accept","value":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"},{"name":"Accept-Encoding","value":"gzip, deflate"},{"name":"Accept-Language","value":"en-US,en;q=0.8"}],"requestId":"260870","tabId":411,"timeStamp":1500401223979.538,"type":"main_frame","url":"http://www.example.com/"}        
webRequest.onHeadersReceived      ->  arg[0]= {"frameId":0,"method":"GET","parentFrameId":-1,"requestId":"260870","responseHeaders":[{"name":"Content-Encoding","value":"gzip"},{"name":"Accept-Ranges","value":"bytes"},{"name":"Cache-Control","value":"max-age=604800"},{"name":"Content-Type","value":"text/html"},{"name":"Date","value":"Tue, 18 Jul 2017 18:07:03 GMT"},{"name":"Etag","value":"\"359670651\""},{"name":"Expires","value":"Tue, 25 Jul 2017 18:07:03 GMT"},{"name":"Last-Modified","value":"Fri, 09 Aug 2013 23:54:35 GMT"},{"name":"Server","value":"ECS (rhv/818F)"},{"name":"Vary","value":"Accept-Encoding"},{"name":"X-Cache","value":"HIT"},{"name":"Content-Length","value":"606"}],"statusCode":200,"statusLine":"HTTP/1.1 200 OK","tabId":411,"timeStamp":1500401224072.296,"type":"main_frame","url":"http://www.example.com/"}        
---^^^^^^^^^^^^^^^^^^^^^^^^^-Earliest tabs.executeScript() injection is in the handler for the webRequest.onHeadersReceived event.
webRequest.onResponseStarted      ->  arg[0]= {"frameId":0,"fromCache":false,"ip":"93.184.216.34","method":"GET","parentFrameId":-1,"requestId":"260870","responseHeaders":[{"name":"Content-Encoding","value":"gzip"},{"name":"Accept-Ranges","value":"bytes"},{"name":"Cache-Control","value":"max-age=604800"},{"name":"Content-Type","value":"text/html"},{"name":"Date","value":"Tue, 18 Jul 2017 18:07:03 GMT"},{"name":"Etag","value":"\"359670651\""},{"name":"Expires","value":"Tue, 25 Jul 2017 18:07:03 GMT"},{"name":"Last-Modified","value":"Fri, 09 Aug 2013 23:54:35 GMT"},{"name":"Server","value":"ECS (rhv/818F)"},{"name":"Vary","value":"Accept-Encoding"},{"name":"X-Cache","value":"HIT"},{"name":"Content-Length","value":"606"}],"statusCode":200,"statusLine":"HTTP/1.1 200 OK","tabId":411,"timeStamp":1500401224072.5032,"type":"main_frame","url":"http://www.example.com/"}        
webRequest.onCompleted            ->  arg[0]= {"frameId":0,"fromCache":false,"ip":"93.184.216.34","method":"GET","parentFrameId":-1,"requestId":"260870","responseHeaders":[{"name":"Content-Encoding","value":"gzip"},{"name":"Accept-Ranges","value":"bytes"},{"name":"Cache-Control","value":"max-age=604800"},{"name":"Content-Type","value":"text/html"},{"name":"Date","value":"Tue, 18 Jul 2017 18:07:03 GMT"},{"name":"Etag","value":"\"359670651\""},{"name":"Expires","value":"Tue, 25 Jul 2017 18:07:03 GMT"},{"name":"Last-Modified","value":"Fri, 09 Aug 2013 23:54:35 GMT"},{"name":"Server","value":"ECS (rhv/818F)"},{"name":"Vary","value":"Accept-Encoding"},{"name":"X-Cache","value":"HIT"},{"name":"Content-Length","value":"606"}],"statusCode":200,"statusLine":"HTTP/1.1 200 OK","tabId":411,"timeStamp":1500401224074.0261,"type":"main_frame","url":"http://www.example.com/"}        
tabs.onUpdated                    ->  arg[0]= 411 :: arg[1]= {"status":"loading","url":"http://www.example.com/"} :: arg[2]= {"active":true,"audible":false,"autoDiscardable":true,"discarded":false,"height":902,"highlighted":true,"id":411,"incognito":false,"index":1,"mutedInfo":{"muted":false},"pinned":false,"selected":true,"status":"loading","title":"www.example.com","url":"http://www.example.com/","width":1282,"windowId":10}    
tabs.onZoomChange                 ->  arg[0]= {"newZoomFactor":1,"oldZoomFactor":1,"tabId":411,"zoomSettings":{"mode":"automatic","scope":"per-origin"}}        
webNavigation.onCommitted         ->  arg[0]= {"frameId":0,"processId":107,"tabId":411,"timeStamp":1500401224079.4019,"transitionQualifiers":[],"transitionType":"reload","url":"http://www.example.com/"}
--->>Here is where you can tell it's a reload --------------------------------------------------------------------------------------------------^^^^^^^^^^^^^^^^^^^^^^^^^
history.onVisited                 ->  arg[0]= {"id":"42","lastVisitTime":1500401224077.579,"title":"Example Domain","typedCount":1,"url":"http://www.example.com/","visitCount":12}        
tabs.onUpdated                    ->  arg[0]= 411 :: arg[1]= {"title":"Example Domain"} :: arg[2]= {"active":true,"audible":false,"autoDiscardable":true,"discarded":false,"height":902,"highlighted":true,"id":411,"incognito":false,"index":1,"mutedInfo":{"muted":false},"pinned":false,"selected":true,"status":"loading","title":"Example Domain","url":"http://www.example.com/","width":1282,"windowId":10}    
webNavigation.onDOMContentLoaded  ->  arg[0]= {"frameId":0,"processId":107,"tabId":411,"timeStamp":1500401224093.404,"url":"http://www.example.com/"}        
webNavigation.onCompleted         ->  arg[0]= {"frameId":0,"processId":107,"tabId":411,"timeStamp":1500401224094.768,"url":"http://www.example.com/"}        
tabs.onUpdated                    ->  arg[0]= 411 :: arg[1]= {"status":"complete"} :: arg[2]= {"active":true,"audible":false,"autoDiscardable":true,"discarded":false,"height":902,"highlighted":true,"id":411,"incognito":false,"index":1,"mutedInfo":{"muted":false},"pinned":false,"selected":true,"status":"complete","title":"Example Domain","url":"http://www.example.com/","width":1282,"windowId":10}   

1 个答案:

答案 0 :(得分:0)

使用 splitlines()

将单列文件读入列表
def load_words():
    with open("words.txt", 'r') as f:
        wordlist = f.read().splitlines()
    return wordlist

您可以使用timeit

对其进行基准测试
from timeit import timeit

timeit('load_words()', setup=setup, number=3)
# Output: 0.1708553659846075 seconds

至于如何实现看起来像模糊匹配算法的东西,你可以试试 fuzzywuzzy

# pip install fuzzywuzzy[speedup]

from fuzzywuzzy import process

wordlist = load_words()
process.extract("eabauea", wordlist, limit=10)

输出:

[('-a', 90), ('A', 90), ('A.', 90), ('a', 90), ("a'", 90),
 ('a-', 90), ('a.', 90), ('AB', 90), ('Ab', 90), ('ab', 90)]

如果您过滤较长的匹配项,结果会更有趣:

results = process.extract("eabauea", wordlist, limit=100)
[x for x in results if len(x[0]) > 4]

输出:

 [('abaue', 83),
 ('Ababua', 77),
 ('Abatua', 77),
 ('Bauera', 77),
 ('baulea', 77),
 ('abattue', 71),
 ('abature', 71),
 ('ablaqueate', 71),
 ('bauleah', 71),
 ('ebauche', 71),
 ('habaera', 71),
 ('reabuse', 71),
 ('Sabaean', 71),
 ('sabaean', 71),
 ('Zabaean', 71),
 ('-acea', 68)]

但是有470K +行需要一段时间:

timeit('process.extract("eabauea", wordlist, limit=3)', setup=setup, number=3)
# Output: 384.97334043699084 seconds