I have written some text to .txt files with Python code. When I try to open one of the .txt files, the content looks like this:
\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00 ...
... \86\90\00\00\00\00\BE\00\00\00\00\A0\90d\00c\00b\00d\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00!\A4;\A4 (
... S30YJ9BF126309 2BA30001ST1000LM024 HN-M101MBB \00P\F2Lg\D8\00\00 ...
(the file continues like this for thousands of characters: long runs of \00 bytes interleaved with scattered binary values)
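The readable fragments (for example the disk model string above) suggest the file holds raw binary data rather than the text the script was supposed to write. A minimal, hedged way to check what one of the files really contains (the path is only an example built from the script below; substitute the file that shows the problem):

# Sketch: look at the raw bytes of one problem file.
f = open("/home/user/Final_Thesis/MOVIE/Dataset/POSITIVE/TESTING/1.txt", "rb")
raw = f.read()
f.close()
print(repr(raw[:200]))            # first 200 bytes, with escapes visible
print(raw.count(b"\x00"))         # how many NUL bytes the file contains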
The code that writes the files:
# -*- coding: utf-8 -*-
import dataPreprocessing
import re,os
##from nltk.corpus import stopwords
##from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
st = PorterStemmer()
def positive_test_unigram_with_stopword_record():
    data1=""
    path_to_BST_unigram="/home/user/Final_Thesis/MOVIE/POSITIVE_FEATURE/TESTING/unigram_with_stemming_without_stopword/BST_unigram.txt"
    Actual_path = "/home/user/Final_Thesis/MOVIE/Dataset/POSITIVE/TESTING/"
    path_to_write_processdata = "/home/user/Final_Thesis/MOVIE/POSITIVE_FEATURE/TESTING/unigram_with_stemming_without_stopword/"
    d = os.path.dirname(path_to_write_processdata)
    if not os.path.exists(d):
        os.makedirs(d)
    for i in range(1,501):
        files=Actual_path+str(i)+".txt"
        files1=path_to_write_processdata+str(i)+".txt"
        fileread=open(files,"r")
        file_content=fileread.read()
        ## Reading content of files ###
        data_After_caseconversion=dataPreprocessing.upperTolower(file_content)
        data_After_normalize=dataPreprocessing.normalize(data_After_caseconversion)
        ## print data_After_normalize
        data_After_non_ascii=dataPreprocessing.punctuation_removal(data_After_normalize)
        data_After_non_ascii1=dataPreprocessing.remove_digits(data_After_non_ascii)
        data_After_non_ascii1=dataPreprocessing.stripEscape(data_After_non_ascii1)
        data = re.sub('\s+', ' ', data_After_non_ascii1)
        actual_data=dataPreprocessing.nltk_stopwords_removal(data)
        actual_data=[x for x in actual_data if x]
        actual_data=dataPreprocessing.remove_words_less_than_2(actual_data)
        actual_data=str(actual_data).replace("[",'').replace("]",'').replace("'",'').replace(",",'')
        dataPreprocessing.File_Write(actual_data,files1)
        data1=data1+str(actual_data)+" "
    dataPreprocessing.File_Write_Append_mode(data1,path_to_BST_unigram)
    fileread.close()
def negative_test_unigram_with_stopword_record():
    data1=""
    path_to_BST_unigram="/home/user/Final_Thesis/MOVIE/NEGATIVE_FEATURE/TESTING/unigram_with_stemming_without_stopword/BST_unigram.txt"
    Actual_path = "/home/user/Final_Thesis/MOVIE/Dataset/NEGATIVE/TESTING/"
    path_to_write_processdata = "/home/user/Final_Thesis/MOVIE/NEGATIVE_FEATURE/TESTING/unigram_with_stemming_without_stopword/"
    d = os.path.dirname(path_to_write_processdata)
    if not os.path.exists(d):
        os.makedirs(d)
    for i in range(1,501):
        files=Actual_path+str(i)+".txt"
        files1=path_to_write_processdata+str(i)+".txt"
        fileread=open(files,"r")
        file_content=fileread.read()
        ## Reading content of files ###
        data_After_caseconversion=dataPreprocessing.upperTolower(file_content)
        data_After_normalize=dataPreprocessing.normalize(data_After_caseconversion)
        ## print data_After_normalize
        data_After_non_ascii=dataPreprocessing.punctuation_removal(data_After_normalize)
        data_After_non_ascii1=dataPreprocessing.remove_digits(data_After_non_ascii)
        data_After_non_ascii1=dataPreprocessing.stripEscape(data_After_non_ascii1)
        data = re.sub('\s+', ' ', data_After_non_ascii1)
        actual_data=dataPreprocessing.nltk_stopwords_removal(data)
        actual_data=[x for x in actual_data if x]
        actual_data=dataPreprocessing.remove_words_less_than_2(actual_data)
        actual_data=str(actual_data).replace("[",'').replace("]",'').replace("'",'').replace(",",'')
        dataPreprocessing.File_Write(actual_data,files1)
        data1=data1+str(actual_data)+" "
    dataPreprocessing.File_Write_Append_mode(data1,path_to_BST_unigram)
    fileread.close()
#............................................................................................................................................##
def positive_unigram_with_stopword_record():
    data1=""
    path_to_BST_unigram="/home/user/Final_Thesis/MOVIE/POSITIVE_FEATURE/TRAINING/unigram_with_stemming_without_stopword/BST_unigram.txt"
    Actual_path = "/home/user/Final_Thesis/MOVIE/Dataset/POSITIVE/TRAINING/"
    path_to_write_processdata = "/home/user/Final_Thesis/MOVIE/POSITIVE_FEATURE/TRAINING/unigram_with_stemming_without_stopword/"
    d = os.path.dirname(path_to_write_processdata)
    if not os.path.exists(d):
        os.makedirs(d)
    for i in range(1,501):
        ## for i in range(1,501):
        files=Actual_path+str(i)+".txt"
        files1=path_to_write_processdata+str(i)+".txt"
        fileread=open(files,"r")
        file_content=fileread.read()
        ## Reading content of files ###
        data_After_caseconversion=dataPreprocessing.upperTolower(file_content)
        data_After_normalize=dataPreprocessing.normalize(data_After_caseconversion)
        ## print data_After_normalize
        data_After_non_ascii=dataPreprocessing.punctuation_removal(data_After_normalize)
        data_After_non_ascii1=dataPreprocessing.remove_digits(data_After_non_ascii)
        data_After_non_ascii1=dataPreprocessing.stripEscape(data_After_non_ascii1)
        data = re.sub('\s+', ' ', data_After_non_ascii1)
        actual_data=dataPreprocessing.nltk_stopwords_removal(data)
        actual_data=[x for x in actual_data if x]
        actual_data=dataPreprocessing.remove_words_less_than_2(actual_data)
        actual_data=str(actual_data).replace("[",'').replace("]",'').replace("'",'').replace(",",'')
        dataPreprocessing.File_Write(actual_data,files1)
        data1=data1+str(actual_data)+" "
    dataPreprocessing.File_Write_Append_mode(data1,path_to_BST_unigram)
    fileread.close()
def negative_unigram_with_stopword_record():
    data1=""
    path_to_BST_unigram="/home/user/Final_Thesis/MOVIE/NEGATIVE_FEATURE/TRAINING/unigram_with_stemming_without_stopword/BST_unigram.txt"
    Actual_path = "/home/user/Final_Thesis/MOVIE/Dataset/NEGATIVE/TRAINING/"
    path_to_write_processdata = "/home/user/Final_Thesis/MOVIE/NEGATIVE_FEATURE/TRAINING/unigram_with_stemming_without_stopword/"
    d = os.path.dirname(path_to_write_processdata)
    if not os.path.exists(d):
        os.makedirs(d)
    for i in range(1,501):
        files=Actual_path+str(i)+".txt"
        files1=path_to_write_processdata+str(i)+".txt"
        fileread=open(files,"r")
        file_content=fileread.read()
        ## Reading content of files ###
        data_After_caseconversion=dataPreprocessing.upperTolower(file_content)
        data_After_normalize=dataPreprocessing.normalize(data_After_caseconversion)
        ## print data_After_normalize
        data_After_non_ascii=dataPreprocessing.punctuation_removal(data_After_normalize)
        data_After_non_ascii1=dataPreprocessing.remove_digits(data_After_non_ascii)
        data_After_non_ascii1=dataPreprocessing.stripEscape(data_After_non_ascii1)
        data = re.sub('\s+', ' ', data_After_non_ascii1)
        actual_data=dataPreprocessing.nltk_stopwords_removal(data)
        actual_data=[x for x in actual_data if x]
        actual_data=dataPreprocessing.remove_words_less_than_2(actual_data)
        actual_data=str(actual_data).replace("[",'').replace("]",'').replace("'",'').replace(",",'')
        dataPreprocessing.File_Write(actual_data,files1)
        data1=data1+str(actual_data)+" "
    dataPreprocessing.File_Write_Append_mode(data1,path_to_BST_unigram)
    fileread.close()
############## Positive data ####################
positive_unigram_with_stopword_record()
positive_test_unigram_with_stopword_record()
print " Extracted unigram_with_stemming_without_stopword of MOVIE_Postive_training and testing "
########## Negative data ####################
negative_test_unigram_with_stopword_record()
negative_unigram_with_stopword_record()
print " Extracted unigram_with_stemming_without_stopword of MOVIE_Negative_training and testing"
dataPreprocessing.py:
#-*- coding: utf-8 -*-
from __future__ import division
import nltk
import csv
import re, string, timeit
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize import TreebankWordTokenizer
from nltk.util import ngrams
import unicodedata
from string import *
from collections import OrderedDict
from nltk.corpus import stopwords
import fileinput
import sys
from itertools import groupby, product
import enchant # $ pip install pyenchant
def remove_consecutive_dups(s):
    return re.sub(r'(?i)(.)\1+', r'\1', s)
def all_consecutive_duplicates_edits(word, max_repeat=float('inf')):
    chars = [[c*i for i in range(min(len(list(dups)), max_repeat), 0, -1)]
             for c, dups in groupby(word)]
    return map(''.join, product(*chars))
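## Added illustration: all_consecutive_duplicates_edits("cooool") yields the
## candidates "cooool", "coool", "cool", "col" -- every way of shrinking each
## run of repeated letters. remove_elogatedStrings() at the bottom of this
## file keeps the first candidate that the enchant dictionary accepts.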
#############################################################################################################
#### N-grams ###
#############################################################################################################
def ngram1(tokens,Ngram):
    ngramss = ngrams(tokens,int(Ngram))
    ## print "******************"+str(Ngram)+"**************************"
    ## print ngramss
    ## print "*************************************************"
    return(ngramss)
############################ File writing function ##################
def File_Write_main(write_ist,filename):
    filewrite=open(filename,"w")
    filewrite.write(str(write_ist))
    filewrite.close()
    return
############################ File writing function ##################
def File_Write(write_ist,filename):
    filewrite=open(filename,"w")
    filewrite.write(str(write_ist))
    filewrite.close()
    return
def stopwords_removal(data):
    operators = set(('nor','none', 'no', 'not'))
    stop = set(nltk.corpus.stopwords.words('english')) - operators
    bi=[]
    a=data.lower().split(' ')
    for i in range(0,len(a)):
        if a[i] not in list(stop):
            bi.append(a[i])
    return bi
def nltk_stopwords_removal(data):
    word_list=data.split(" ")
    filtered_words = [w for w in word_list if not w in stopwords.words('english')]
    return filtered_words
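## Added note: stopwords.words('english') above is re-read for every word in
## the comprehension; building the set once, e.g.
##     stop = set(stopwords.words('english'))
## and testing "w not in stop", avoids reloading the list for each word.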
def File_Write_Append_mode(write_ist,filename):
    filewrite=open(filename,"a")
    filewrite.write(write_ist)
    filewrite.close()
    return
############################ File reading function ##################
def File_Read(filename):
    fileread=open(filename,"r")
    read_list=fileread.read()
    fileread.close()
    return(read_list)
################################################################
## :::::::Converting Data from Uppercase to Lowercase::::: ##
################################################################
def upperTolower(data):
    data=str(data)
    data= data.lower()
    return(data)
################################################################
## :::::::After Normalization of each words: ##
################################################################
def normalize(data):
    data = re.sub("n`t|n't"," not",data)
    data = re.sub("wud","would",data)
    data = re.sub("bcoz|bcause","because",data)
    data = re.sub("thanx|thankz","thanks",data)
    data = re.sub("aka","also known as",data)
    data = re.sub("i'm|i`m","i am",data)
    data = re.sub("it's|it`s","it is",data)
    data = re.sub("can't|can`t|cannot","can not",data)
    data = re.sub("couldnt|couldn't|couldn'","could not",data)
    data = re.sub("wouldnt|wouldn't|wouldn'","would not",data)
    data = re.sub("i've|i`ve","i have",data)
    data = re.sub("don't|don't|dont","do not",data)
    data = re.sub("doesn't|doesnt","does not",data)
    data = re.sub("didn't|didn't|didnt","did not",data)
    data = re.sub("won't|won't|wont","will not",data)
    data = re.sub("isn't|isn`t|isnt","is not",data)
    data = re.sub("i'll|i`ll","i will",data)
    data = re.sub("you're|u're|u'r","you are",data)
    data = re.sub("i've|i`ve","i have",data)
    data = re.sub("it's|it`s","it is",data)
    data = re.sub("i'd|i`d","i would",data)
    data = re.sub("it'd|it`d","it would",data)
    data = re.sub("we're|v`re","we are",data)
    data = re.sub("wasn't|wasnt|wasn’t","was not",data)
    data = re.sub("hadnt|hadn't","had not",data)
    data = re.sub("you'd|u`d","you had",data)
    data = re.sub("there've","there have",data)
    data = re.sub("they've|theyve","they have",data)
    data = re.sub("you've|uve|u've|youve","you have",data)
    data = re.sub("they're","they are",data)
    data = re.sub("'s"," is",data)
    data = re.sub("'ve"," have",data)
    data = re.sub("maybe|mayb","may be",data)
    data = re.sub("you'll","you will",data)
    return data
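## Added illustration: normalize("i'm glad it's here") returns
## "i am glad it is here". The substitutions run in order and match anywhere
## in the string, so short patterns such as "aka" or "wud" also fire inside
## longer words.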
def remove_words_less_than_2(data):
    a=' '.join(word for word in data if len(word)>=2)
    ## a=' '.join(word for word in data if len(word)>2)
    return a
###############################################################
##           ::::::::Remove digits::::::                      #
###############################################################
def remove_digits(s):
    data=re.sub(r"[0-9]+",'',s)
    ## data= re.sub(" \d+"," ", s)
    ## data=re.sub("[^A-Za-z ]", "", s)
    return data
##
################################################################
## :::::::Non-ASCII Removal::::::: ##
################################################################
##def strip_non_ascii(string):
## for c in string:
## if (0<ord(c)>127):
## string=string.replace(c,' ')
## return string
def strip_non_ascii(string):
    ''' Returns the string without non ASCII characters'''
    data=(c for c in string if 0 < ord(c) < 127)
    return ''.join(data)
def stripEscape(s):
    ## """ Removes all escape sequences from the input string """
    ## delete = ""
    ## i=1
    ## while (i<0x20):
    ##     delete += chr(i)
    ##     i += 1
    ## t = string.translate(None, delete)
    ## return t
    ## ansi_escape = re.compile(r'\x1b[^m]*m')
    ##
    ## a=ansi_escape.sub('', string)
    a=filter(lambda x: x in string.printable, s)
    return a
def remove_zeroes(sent):
    a="0"
    data=(c for c in sent if c!=a)
    return ''.join(data)
################################################################
## :::::::Punctuation Removal::::::::::::::: ##
################################################################
##def punctuation_removal(punc_remove):
## predicate = lambda x:x not in string.punctuation
## data=filter(predicate,punc_remove)
## return data
################################################################
## :::::::Punctuation Removal::::::::::::::: ##
################################################################
def punctuation_removal(punc_remove):
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    out = regex.sub(' ', punc_remove)
    return out
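## Added illustration: punctuation_removal("good, but not great!") returns
## "good  but not great " -- every punctuation mark becomes a space; the
## calling script then collapses repeated spaces with re.sub('\s+', ' ', ...).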
################################################################
## :::::::Removing new lines:::::::::::::::: ##
################################################################
def remove_newlines(data):
    ## data=re.sub('[\n]+', ' ', data)
    ## data=re.sub('[\s\b]+', ' ', data)
    ## data=data.rstrip()
    data = re.sub('[\s]+|[\n]+',' ',data)
    return data
################################################################
## :::::::Tokenization using space:::::::::: ##
################################################################
def tokenize(data):
    data = re.sub("0|1|2|3|4|5|6|7|8|9"," ",data)
    ## my_string=[x.replace("0|1|2|3|4|5|6|7|8|9"," ") for x in data.split(' ')]
    data= data.split(' ')
    return data
################################################################
## :::::::POS tagging using PennTree Bank Tagger ##
## :::::::Write POS tag into a file::::::::: ##
################################################################
def PennTreeBank_POSTagger(data):
    postag = nltk.pos_tag(data)
    return(postag)
################################################################
## :::::::Remove elongated string ##
################################################################
def remove_elogatedStrings(data):
    words = enchant.Dict("en")
    is_known_word = words.check
    #NOTE: unnecessary work, optimize if needed
    output = [next((e for e in all_consecutive_duplicates_edits(s)
                    if e and is_known_word(e)), remove_consecutive_dups(s))
              for s in re.split(r'(\W+)', data)]
    uniq=[]
    for i in output:
        if i not in uniq:
            uniq.append(i)
    return " ".join(uniq)
The code that reads these files:
path="/home/user/Final_Thesis/MOVIE/POSITIVE_FEATURE/TRAINING/"+foldername+"/"
for i1 in range(1,501):
    frqlist=[]
    root.reset_frequency()
    input_file=path+str(i1)+".txt"
    input_file_ptr=open(input_file,"r")
    read_content=input_file_ptr.read()
    read_content=read_content.strip()
    read_content=read_content.split(" ")
How can I read these files correctly in Python? Does anyone know what is going wrong? Please help.
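For what it is worth, here is a minimal sketch (Python 2 style, to match the code above) that reads one of these files in binary mode and keeps only printable characters before splitting. Note, though, that if the file really is mostly \00 bytes, as in the dump at the top, there is no text in it to recover, and the problem lies with whatever produced the files rather than with the reading code.

# Hedged sketch: read raw bytes and drop non-printable characters before splitting.
import string

input_file_ptr = open(input_file, "rb")   # binary mode avoids decoding surprises
raw = input_file_ptr.read()
input_file_ptr.close()

cleaned = "".join(ch for ch in raw if ch in string.printable)
read_content = cleaned.strip().split(" ")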