Here is my script:
1) I open a file and read it;
2) I split the text into sentences;
3) I lemmatize the sentences;
4) for each lemmatized sentence, I retrieve its lemmas and put them into a new list;
5) I look each lemma up in my lexicon to see whether the words of the lexicon appear in the lemma list of each of my sentences.
# -*- coding: utf-8 -*-
import codecs
import re
import nltk
from nltk import tokenize
import os
import sys
import subprocess
import glob
from os import path
import pprint
import csv
import numpy as np
from itertools import islice

try:
    import treetaggerwrapper
    print("import TreeTagger OK")
except ImportError:
    print("Import TreeTagger pas Ok")
alphabet = "([a-z][...])"
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)[.]"
starters = r"(M|Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov)"
digits = r"([0-9])"
with codecs.open('corpus', 'r', 'utf-8') as text:
    text = text.read()
#print(text)
def split_into_sentences(text):
    text = " " + text + " "
    text = text.replace("\n", " ")
    text = re.sub(prefixes, "\\1<prd>", text)
    text = re.sub(websites, "<prd>\\1", text)
    if "Ph.D" in text: text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
    text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
    text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
    text = re.sub(digits + "[.]" + digits, "\\1<prd>\\2", text)
    if "e.g." in text: text = text.replace("e.g.", "e<prd>g<prd>")
    if "i.e." in text: text = text.replace("i.e.", "i<prd>e<prd>")
    if "..." in text: text = text.replace("...", "<prd><prd><prd>")
    if "”" in text: text = text.replace(".”", "”.")
    if "\"" in text: text = text.replace(".\"", "\".")
    if "!" in text: text = text.replace("!\"", "\"!")
    if "?" in text: text = text.replace("?\"", "\"?")
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")
    text = text.replace("...", "...<stop>")
    text = text.replace("…", "…<stop>")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return sentences
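# Quick sanity check of the splitter on a made-up snippet (hypothetical input,
# not from my corpus):
#   split_into_sentences("Bonjour. Moi je ne trouve pas très esthétique. Et vous ?")
#   -> ['Bonjour.', 'Moi je ne trouve pas très esthétique.', 'Et vous ?']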
sentences = split_into_sentences(text)

token = []
pos = []
lemme = []
sentences_all = []

# create the tagger once, then tag every sentence
tagger = treetaggerwrapper.TreeTagger(TAGLANG='fr')
for sentence in sentences:
    tags = tagger.tag_text(sentence)
    sentences_all.append(tags)
#print(sentences_all)

for tags in sentences_all:
    #print(tags)
    for mot in tags:
        # each tagged token is "token<TAB>POS<TAB>lemma"
        first, second, third = mot.split('\t')
        token.append(first)
        pos.append(second)
        lemme.append(third)
#print(token)
#print(pos)
print(lemme)
So I want to extract, for each sentence, its lemmas and put them into a new list, so that every sentence has its own lemma list. All I have managed to do so far is store the lemmas of all the sentences in one single flat list, which is not what I want. I want one lemma list per sentence, so that I can then compute a sentiment score per sentence: in the end I should be able to say that this sentence is positive, that one is negative, and so on. Roughly, I am aiming for something like the sketch below.
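A minimal sketch of what I mean (reusing the sentences_all variable from my script above; lemmes_par_phrase is just a placeholder name):

# one lemma list per sentence instead of one flat list
lemmes_par_phrase = []
for tags in sentences_all:
    lemmes = []
    for mot in tags:
        parts = mot.split('\t')   # TreeTagger output: token<TAB>POS<TAB>lemma
        if len(parts) == 3:       # skip malformed lines just in case
            lemmes.append(parts[2])
    lemmes_par_phrase.append(lemmes)
# lemmes_par_phrase[0] would then be the lemma list of the first sentence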
When I print the tags of a sentence, I get this:
['Moi\tPRO:PER\tmoi', 'je\tPRO:PER\tje', 'ne\tADV\tne', 'trouve\tVER:pres\ttrouver', 'pas\tADV\tpas', 'très\tADV\ttrès', 'esthétique\tADJ\testhétique', '.\tSENT\t.']
That is what a tagged sentence looks like. As I said above, I would like to get back only the lemmas for each sentence, and then compute a score by looking them up in my lexicon. Here is my lexicon code (I originally wrote it to compute the score of the whole text, but now I need to adapt it to compute the score of each sentence):
dico = {}
#lexique = open('lexique.txt', 'rb')
with codecs.open('lexique2.txt', 'r', 'utf-8', errors='ignore') as lexique:
    # skip the 31-line header, then read one entry per line
    for ligne in islice(lexique, 31, None):
        #print(ligne)
        ligne = ligne.split(';')
        #print(ligne)
        #print(ligne[-4:])
        # key: the lemma (4th field from the end); value: the last three fields
        # (positive, neutral, negative scores)
        dico.update({ligne[-4]: ligne[-3:]})
#print(dico)
somme_V0 = []
somme_V1 = []
somme_V2 = []
for k, v in dico.items():
    if k in lemme:
        #print(k, v)
        somme_V0.append(int(v[0]))
        somme_V1.append(int(v[1]))
        somme_V2.append(int(v[2]))
#print("pos: " + str(sum(somme_V0)))
#print("neu: " + str(sum(somme_V1)))
#print("neg: " + str(sum(somme_V2)))
# Compute the polarity of the text:
if sum(somme_V0) > sum(somme_V1) and sum(somme_V0) > sum(somme_V2):
    print("le texte a une polarité positive de " + str(sum(somme_V0)))
elif sum(somme_V1) > sum(somme_V0) and sum(somme_V1) > sum(somme_V2):
    print("le texte a une polarité neutre de " + str(sum(somme_V1)))
elif sum(somme_V2) > sum(somme_V0) and sum(somme_V2) > sum(somme_V1):
    print("le texte a une polarité négative de " + str(sum(somme_V2)))