我有一个大约200MB的文本文件(rawtext.txt),另外还有一个保存停用词列表的文本文件(stopwords.txt),内容如下:
I
a
about
an
are
as
at
be
by
com
for
...
我想删除文本语料库中的停用词,但是该怎么做呢?最快、最简单的方法是什么?最好使用命令行工具,例如 sed 或 tr;我不想使用 Python 或 NLTK。
有人可以帮忙吗?我使用的是Mac OSX(不是linux)
答案 0 :(得分:0)
将您的输入转换为每行一个单词的格式,然后您可以使用:
unit ServerTcpA;
interface
{
uses
SysUtils, Variants, Classes, Generics.Collections;
}
uses
System.SysUtils, System.Types, System.UITypes, System.Classes, System.Variants,
System.Generics.Collections;
{
uses
Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,
Dialogs, ExtCtrls, StdCtrls, Contnrs, ComCtrls,
Buttons, WinSock, ScktComp;
}
type
// CLASS DECLARATIONS ----------------------------------------------------
// TClientTcp CLASS -----------------------------------------------------
// Data holder describing one TCP client. All fields are public and are
// filled in directly by the code that creates the object.
TClientTcp = class (TObject)
public
Name : String;        // client name (assigned from the "name" argument of TClientsTcpList.Add)
IP : String;          // client IP address, stored as text
Port : Integer;       // client port number
RecFrames : Integer;  // set to 0 on creation; presumably a received-frames counter - TODO confirm
end;
// TClientsTcpList CLASS -------------------------------------------------
// List of TClientTcp objects built on TObjectList<TObject>. Elements are
// stored as TObject and cast back to TClientTcp by the Items accessor.
TClientsTcpList = class (TObjectList <TObject>)
private
// Getter behind Items: returns the element at "index" cast to TClientTcp.
function FGetItem (index : Integer) : TClientTcp;
public
// Typed, read-only accessor that replaces the inherited Items within this class.
property Items [index : Integer] : TClientTcp read FGetItem;
// Creates a TClientTcp and appends it, but only when no entry with the same
// IP and port is already in the list.
function Add (name : String; ip : String; port: Integer) : TClientTcp;
// Return the first matching client object, or nil when nothing matches.
function FindClient_ByName (name : String) : TClientTcp;
function FindClient_ByIp (ip : String) : TClientTcp;
// Return the list index of the first matching client, or NONE (-1) when nothing matches.
function FindClient_ByPort (port : Integer) : Integer;
function FindClient_ByIpPort (ip : String; port : Integer): Integer;
end;
// TTcpCfg CLASS ---------------------------------------------------------
// Configuration object that owns a TClientsTcpList: the list is created in
// the constructor and freed in the destructor.
TTcpCfg = class (TObject)
// Declared before any visibility keyword, so it has the class's default visibility.
TcpClientsList : TClientsTcpList;
public
constructor Create;
destructor Destroy; override;
// Creates a TClientTcp for the given IP and port and returns it.
// NOTE(review): the implementation does not insert it into TcpClientsList - confirm intent.
function AddClient (ip : String; port: Integer) : TClientTcp;
end;
// ENUM - Defined Column Names -------------------------------------------
// Column identifiers. COL_LP is explicitly 0; the remaining members take the
// following consecutive ordinals (1, 2, 3).
// NOTE(review): not referenced in the visible part of this unit; presumably
// used to index the columns of a grid or list view - confirm against callers.
type TColNames = (
COL_LP = 0,
COL_NAME,
COL_IP,
COL_PORT
);
const
// Sentinel index returned by FindClient_ByPort / FindClient_ByIpPort when no entry matches.
NONE = -1;
var
//ServerTcpDK : TServerSocket;
//TCPCFG : TTcpCfg;
// Unit-level client list; created in the initialization section and freed in finalization.
ClientsList : TClientsTcpList;
implementation
// =================================================== CLASS: TClientsTcpList
// GET ITEM
// Property getter for Items: fetches the element stored at "index" through the
// inherited accessor and returns it as a TClientTcp (checked "as" cast).
function TClientsTcpList.FGetItem (index : Integer) : TClientTcp;
var
  stored : TObject;
begin
  stored := inherited Items [index];
  Result := stored as TClientTcp;
end;
// ADD ITEM
// Creates a client entry and appends it to the list, unless an entry with the
// same IP and port is already present.
// @Param name: client name stored in the new entry
// @Param ip:   client IP address (text)
// @Param port: client port number
// @Ret: the newly created client,
//       nil: an entry with the same (ip, port) already exists; nothing was added
function TClientsTcpList.Add (name : String; ip : String; port: Integer) : TClientTcp;
begin
  // Result must be defined on every path. Without this line the duplicate
  // case returned an uninitialized pointer to the caller.
  Result := nil;
  if (FindClient_ByIpPort (ip, port) = NONE) then begin
    Result := TClientTcp.Create;
    Result.Name := name;
    Result.IP := ip;
    Result.Port := port;
    Result.RecFrames := 0;
    // Call the base-class Add explicitly: this method hides it.
    inherited Add (Result);
  end;
end;
// FIND CLIENT: BY NAME
// Scans the list from the first element and returns the first client whose
// Name equals "name".
// @Ret: the matching client object
//       nil: Not Found
function TClientsTcpList.FindClient_ByName (name : String): TClientTcp;
var
  idx : Integer;
  candidate : TClientTcp;
begin
  for idx := 0 to Count - 1 do begin
    candidate := Items [idx];
    if candidate.Name = name then
      Exit (candidate);
  end;
  Result := nil;
end;
// FIND CLIENT: BY IP
// Walks the list in order and returns the first client whose IP equals "ip".
// @Ret: the matching client object
//       nil: Not Found
function TClientsTcpList.FindClient_ByIp (ip : String): TClientTcp;
var
  idx : Integer;
begin
  Result := nil;
  idx := 0;
  // Stop as soon as a match has been stored in Result.
  while (Result = nil) and (idx < Count) do begin
    if Items [idx].IP = ip then
      Result := Items [idx];
    Inc (idx);
  end;
end;
// FIND CLIENT: BY PORT ------------------------------------------------------
// Looks for the first client whose Port equals "port".
// @Ret: Item Index in the LIST
//       -1 (NONE): Not Found
function TClientsTcpList.FindClient_ByPort (port : Integer): Integer;
var
  idx : Integer;
begin
  for idx := 0 to Count - 1 do
    if Items [idx].Port = port then
      Exit (idx);
  // Loop finished without a match.
  Result := NONE;
end;
// FIND CLIENT: BY IP AND PORT -----------------------------------------------
// Looks for the first client whose IP and Port both equal the arguments.
// @Ret: Item Index in the LIST
//       -1 (NONE): Not Found
function TClientsTcpList.FindClient_ByIpPort (ip : String; port : Integer): Integer;
var
  idx : Integer;
  entry : TClientTcp;
begin
  for idx := 0 to Count - 1 do begin
    entry := Items [idx];
    if (entry.IP = ip) and (entry.Port = port) then
      Exit (idx);
  end;
  // No entry matched both fields.
  Result := NONE;
end;
// =========================================================== CLASS: TTcpCfg
// Builds the configuration object and allocates its (initially empty) client list.
constructor TTcpCfg.Create;
begin
  inherited Create;
  TcpClientsList := TClientsTcpList.Create;
end;
// Releases the client list created in the constructor, then runs the
// inherited destructor.
destructor TTcpCfg.Destroy;
begin
  TcpClientsList.Free;
  inherited Destroy;
end;
// Creates a TClientTcp for the given IP and port, with RecFrames set to 0,
// and returns it. Name is left at its default (empty) value.
// NOTE(review): the object is NOT inserted into TcpClientsList, so the caller
// holds the only reference to it - confirm whether registration in the list
// was intended.
function TTcpCfg.AddClient (ip : String; port: Integer) : TClientTcp;
var
  client : TClientTcp;
begin
  client := TClientTcp.Create;
  client.RecFrames := 0;
  client.Port := port;
  client.IP := ip;
  Result := client;
end;
// ============================================================ INITIALIZATION
// Unit start-up: creates the unit-level ClientsList.
initialization
//ServerTcpDK := TServerSocket.Create (Nil);
//TCPCFG := TTcpCfg.Create;
ClientsList := TClientsTcpList.Create;
// Unit shutdown: frees the ClientsList created above.
finalization
//ServerTcpDK.Free;
//TCPCFG.Free;
ClientsList.Free;
// @END OF FILE --------------------------------------------------------------
end.
对其进行过滤:
grep
这样您就不必构建任意大的正则表达式,如果您的停用词列表很大,这可能会有问题。
答案 1 :(得分:0)
一个可行的解决方案(也在 Mac OS 中):
cat rawtext.txt | grep -o -E '[a-zA-Z]{3,}' | tr '[:upper:]' '[:lower:]' | sort | uniq | grep -vwFf stopwords.txt
这将仅提取至少 3 个字母的单词(不含数字),转换为小写,排序并去重,然后使用停用词列表进行过滤。
确保以相同方式处理 stopwords.txt(例如小写)。