尝试从字符串列表中删除所有非英语字符

时间:2018-02-07 16:43:53

标签: python python-3.x for-loop nltk

我有一个词典列表

' Posts a SOAP GetListItems request to the SharePoint lists.asmx web service
' and, on HTTP 200, stores the raw response text in tudo.
'
' The request asks the "AllNames" list for at most one item whose Name field
' equals "Cloe", ordered by ID, returning only the Name field.
' On any status other than 200 the response text is shown in a message box
' and execution is halted with End.
'
' NOTE(review): UserName, Password and tudo are not declared in this procedure;
' presumably they are module-level variables or implicit Variants - confirm.
' NOTE(review): the credentials are hard-coded in source; consider prompting
' for them or reading them from protected storage.
Sub selectSharepoint()
    Dim objXMLHTTP As MSXML2.XMLHTTP
    Dim strSoapBody As String

    UserName = "john@email.com"
    Password = "123456"

    Set objXMLHTTP = New MSXML2.XMLHTTP
    ' Synchronous request (async = False), credentials passed to Open.
    objXMLHTTP.Open "POST", "https://ts.sharepoint.com/sites/2018_teste/_vti_bin/lists.asmx", False, UserName, Password
    'objXMLHTTP.Open "POST", "https://ts.sharepoint.com/sites/2018_teste/_vti_bin/lists.asmx", False 'This way worked the same, and got the same error.
    objXMLHTTP.setRequestHeader "Content-Type", "text/xml; charset=UTF-8"
    ' The original code set Content-Length to the literal text "length", which
    ' is not a valid header value. The header is intentionally not set here so
    ' that it is derived from the body handed to send.
    'objXMLHTTP.setRequestHeader "Authorization", "Basic " + Base64Encode(UserName & ":" & Password) 'makes no effect, with or without Base64Encode
    objXMLHTTP.setRequestHeader "SOAPAction", "http://schemas.microsoft.com/sharepoint/soap/GetListItems"

    ' First half of the envelope: list name and the CAML <Query>.
    strSoapBody = "<?xml version='1.0' encoding='utf-8'?>" & _
                  "  <soap:Envelope xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance' xmlns:xsd='http://www.w3.org/2001/XMLSchema' xmlns:soap='http://schemas.xmlsoap.org/soap/envelope/'>" & _
                  "     <soap:Body>" & _
                  "         <GetListItems xmlns='http://schemas.microsoft.com/sharepoint/soap/'>" & _
                  "             <listName>AllNames</listName>" & _
                  "             <query>" & _
                  "                 <Query xmlns=''>" & _
                  "                     <Where>" & _
                  "                          <Eq>" & _
                  "                              <FieldRef Name='Name' />" & _
                  "                              <Value Type='Text'>Cloe</Value>" & _
                  "                          </Eq>" & _
                  "                     </Where>" & _
                  "                     <OrderBy>" & _
                  "                         <FieldRef Name='ID' />" & _
                  "                     </OrderBy>" & _
                  "                 </Query>" & _
                  "             </query>"

    ' Second half: row limit, view fields and the closing tags. It is built in a
    ' separate statement to keep each statement's line-continuation count low.
    ' NOTE(review): the "s" prefix in xsi:type='s:string' is not bound by any
    ' xmlns declaration in this envelope - confirm the server accepts it.
    strSoapBody = strSoapBody & _
                  "             <rowLimit>1</rowLimit>" & _
                  "             <viewFields><ViewFields>" & _
                  "                 <FieldRef xsi:type='s:string' Name='Name'></FieldRef>" & _
                  "             </ViewFields></viewFields>" & _
                  "         </GetListItems>" & _
                  "     </soap:Body>" & _
                  "  </soap:Envelope>"

    objXMLHTTP.send strSoapBody

    If objXMLHTTP.Status <> 200 Then
        MsgBox ("Error! " & Chr(10) & objXMLHTTP.responseText)
        ' End stops all running VBA code, not only this procedure.
        End
    Else
        tudo = objXMLHTTP.responseText
    End If
End Sub

并且我无法去除其中的特殊字符和非英语单词。这次 Twitter 抓取的目标是创建一个简单的单词频率计数。

有没有最好的方法来创建循环并删除所有非英语单词/字符?

这段代码我用来创建wordcount词典

Amazon 120 
b 19 
maji_opai 1 
am\xcd\x9ca\xcd\x89zon\xe2\x80\xa6 1 
\xcb\x99\xea\x92\xb3\xe2\x80\x8b\xcb\x99 1 
b'RT 46 
WorkingGIrl 1 
For 1 
people 1 
love 1 
REAL 1 
paperback 1 
THE 3 
PARIS 1 
EFFECT 1 
10 1 
right 1 

1 个答案:

答案 0 :(得分:0)

如果你所说的 "english" 指的是 "ascii",那么可以使用 string.printable,它包含所有可打印的 ASCII 字符:

import string

# Every character in string.printable, held in a set for fast membership tests.
ascii_chars = set(string.printable)


def remove_non_ascii_prinatble_from_list(list_of_words):
    """Return a new list holding only the words made entirely of printable ASCII.

    A word is kept when each of its characters is a member of ``ascii_chars``;
    a word with no characters is therefore kept. The input order is preserved
    and the input list is not modified.
    """
    kept = []
    for candidate in list_of_words:
        for symbol in candidate:
            if symbol not in ascii_chars:
                # One offending character is enough to drop the word.
                break
        else:
            kept.append(candidate)
    return kept

或在字典上

def remove_non_ascii_prinatble_keys_from_dict(dict_of_words):
    """Return a new dict holding only the entries whose key is printable ASCII.

    An entry is kept when every character of its key is a member of the
    module-level ``ascii_chars`` set; values are copied unchanged. The input
    dict is not modified.
    """
    filtered = {}
    for word, count in dict_of_words.items():
        for symbol in word:
            if symbol not in ascii_chars:
                # One offending character is enough to drop the entry.
                break
        else:
            filtered[word] = count
    return filtered