UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 3131: invalid start byte in my code

Date: 2018-05-04 14:12:32

Tags: python html beautifulsoup html-parsing python-unicode

When I try to run the following code, it gives me an error. I have installed every Python module, including nltk, and I also added lxml and numpy, but it still won't work. I am using Python 3, so I have changed urllib2 to urllib.request.
Please help me find a solution. I am running this:

python index.py

My index file is as follows. This is the code:

from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import ssl
import os 
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
import codecs 


def checkChar(token):
    for char in token:
        if(0 <= ord(char) and ord(char) <= 64) or (91 <= ord(char) and ord(char) <= 96) or (123 <= ord(char)):
            return False 
        else:
            continue

    return True 

def cleanMe(html):
    soup = BeautifulSoup(html, "html.parser")
    for script in soup(["script", "style"]):
        script.extract()

    text = soup.get_text()

    lines = (line.strip() for line in text.splitlines())

    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))

    text = '\n'.join(chunk for chunk in chunks if chunk)

    return text

path = 'crawled_html_pages/'
index = {}
docNum = 0 
stop_words = set(stopwords.words('english'))

for filename in os.listdir(path):

    collection = {}

    docNum += 1

    file = codecs.open('crawled_html_pages/' + filename, 'r', 'utf-8')

    page_text = cleanMe(file)

    tokens = nltk.word_tokenize(page_text)

    filtered_sentence = [w for w in tokens if not w in stop_words]

    filtered_sentence = []

    breakWord = ''

    for w in tokens:
        if w not in stop_words:
            filtered_sentence.append(w.lower())

    for token in filtered_sentence:
        if len(token) == 1 or token == 'and':
            continue
        if checkChar(token) == False:
            continue
        if token == 'giants':
            breakWord = token
            continue
        if token == 'brady' and breakWord == 'giants':
            break
        if token not in collection:
            collection[token] = 0
        collection[token] += 1

    for token in collection:
        if token not in index:
            index[token] = ''
        index[token] = index[token] + '(' + str(docNum) + ', ' + str(collection[token]) + ")"

    if docNum == 500:
        print(index)
        break
    else:
        continue

    f = open('index.txt', 'w')
    vocab = open('uniqueWords.txt', 'w')
    for term in index:
        f.write(term + ' =>' + index[term])
        vocab.write(term + '\n')
        f.write('\n')
    f.close()
    vocab.close()

    print('Finished...')

These are the errors I get:


C:\Users\myworld> python index.py
Traceback (most recent call last):
  File "index.py", line 49, in <module>
    page_text = cleanMe(file)
  File "index.py", line 22, in cleanMe
    soup = BeautifulSoup(html, "html.parser")
  File "C:\Users\furqa\AppData\Local\Programs\Python\Python36-32\lib\site-packages\beautifulsoup4-4.6.0-py3.6.egg\bs4\__init__.py", line 191, in __init__
  File "C:\Users\furqa\AppData\Local\Programs\Python\Python36-32\lib\codecs.py", line 700, in read
    return self.reader.read(size)
  File "C:\Users\furqa\AppData\Local\Programs\Python\Python36-32\lib\codecs.py", line 503, in read
    newchars, decodedbytes = self.decode(data, self.errors)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 3131: invalid start byte

1 answer:

Answer 0: (score: 0)

You can change the encoding BeautifulSoup uses by setting the from_encoding parameter:

soup = BeautifulSoup(html, from_encoding="iso-8859-8")
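Note that from_encoding only takes effect when BeautifulSoup is handed raw bytes (e.g. a file opened in 'rb' mode); it is ignored for a str that has already been decoded, which is what codecs.open(..., 'utf-8') produces. A minimal stdlib-only sketch of the decoding step itself, using a hypothetical byte string and assuming the crawled pages are actually in a legacy 8-bit encoding such as windows-1252 (where 0x80 maps to '€'; in UTF-8 it is an invalid start byte, which is exactly the error above):

```python
# Hypothetical file content containing byte 0x80 (invalid as a UTF-8
# start byte, but valid in windows-1252).
raw = b"<p>Price: \x80 100</p>"

# Option 1: try UTF-8 first, then fall back to a legacy encoding.
try:
    html = raw.decode("utf-8")
except UnicodeDecodeError:
    html = raw.decode("windows-1252")

# Option 2: decode permissively; undecodable bytes become U+FFFD.
lossy = raw.decode("utf-8", errors="replace")

print(html)   # <p>Price: € 100</p>
```

In the asker's loop, the same idea would mean opening each crawled page with open(path, 'rb'), decoding the bytes with a fallback (or passing the bytes straight to BeautifulSoup with from_encoding) before calling cleanMe.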