Converting text to ASCII in Python

Date: 2014-11-06 23:13:25

Tags: python python-2.7 character-encoding

I wrote some text to a .txt file using Python code. When I try to open that .txt file, the content looks like this:

    \00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00q\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00x\00\00\00\00\00\00\00\00\00\00\00g\00\00\00\00\00\00\86\90\00\00\00\00\BE\00\00\00\00\A0\90d\00c\00b\00d\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00!\A4;\A4 (
\90\00\00\00f\00\00(
\90\00\00\00\00X\00\00\00(
\90\00
\90 \00\00\9D\9Aa\A3h 
\90\00\00\00y\00\00\00$
\90\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00S30YJ9BF126309      2BA30001ST1000LM024 HN-M101MBB                  \00P\F2Lg\D8\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\80\FF\FF\00\00\00\00\00\00\00}\E4\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00l\90\00\00\00\00\00\00\00\00\007\90\83\00\90\00\B4\90\BCx\00u\00\00\00(\83\00\90\00\9E\90\E0x\00\00\00\008\83\00\90\00\C0\90y\00\00\00\00H\83\00\90\00\F8\90\EFE
\00\00\00X\83\00\90\00\90\D7A\00\00\00h\83\00\90\00\90c}\00\00\00\00x\83\00\90\00*\90\AF2\00\00\00\88\83\00\90\00,\90.\00\00\00\98\83\00\90\00.\90\A50\00\00\00\A8\83\00\90\002\90\91}\00\00\00\00\B8\83\00\90\002  \90Kf\00\00\00\00\00ȃ\00\90\00B \90\83z\00\00\00\00؃\00\90\00r  \90\91z\00\00\00\00\E8\83\00\90\00\92   \90\9Dz\00\00\00\00\F8\83\00\90\00\B2   \90\A5z\002\00\00\00\98\84\00\90\00
\90\92\A1\00\00\00H\84\00\90\00l\90A0
\00\00\008\84\00\90\00Z\90-\FF\00\00\00\B8\84\00\90\00@
\90ل\00\00\00\84\00\90\00\\90\FD\FF\00\00\00X\84\00\90\00\86\90g
\00\00\00h\84\00\90\00\A0\90\89
\00\00\00x\84\00\90\00\B4\90\E1\CE\B3\00\00\00\88\84\00\90\00
\90{\9B\00\00\00\A8\84\00\90\00(
\903\A5
\00\00\00\84\00\90\00X\90\BF\00\00\00(\84\00\90\00<
\90\ADo\00\00\00؄\00\90\00\
\90\F3\84\B4\00\00\00\E8\84\00\90\00\C6+\90M\B6\00\00Ȅ\00\90\00\C4\90A\B6\81\00\00\F8\84\00\90\00\F6/\9072H\00\00\00\85\00\90\00\860\90\9F\AB\00\00\00\85\00\90\00\DC1\90;\E8\00\00\00(\85\00\90\00\E41\90qc\00\00\008\85\00\90\00\E61\90{c\00\00\00H\85\00\90\00\E81\90\83c\00\00\00\00\00\00\00\00\EA1\90\8Bc\00\00\00h\85\00\90\00\EC1\90
\F42\00\00\00x\85\00\90\00\EE1\90!\00\00\00\88\85\00\90\002\90\A2@\00\00\00\98\85\00\90\00\00\00\00\00\00\00\00\00\00\00\00\A8\85\00\90\00\00\00\00\00\00\00\00\00\00\00\00\B8\85\00\90\00\00\00\00\00\00\00\00\00\00\00\00ȅ\00\90\00\00\00\00\00\00\00\00\00\00\00\00؅\00\90\00\00\00\00\00\00\00\00\00\00\00\00\E8\85\00\90\00\00\00\00\00\00\00\00\00\00\00\00\F8\85\00\90\00\00\00\00\00\00\00\00\00\00\00\00\86\00\90\00\00\00\00\00\00\00\00\00\00\00\00\86\00\90\00\00\00\00\00\00\00\00\00\00\00\00(\86\00\90\00\00\00\00\00\00\00\00\00\00\00\008\86\00\90\00\00\00\00\00\00\00\00\00\00\00\00H\86\00\90\00\00\00\00\00\00\00\00\00\00\00\00X\86\00\90\00\00\00\00\00\00\00\00\00\00\00\00h\86\00\90\00\00\00\00\00\00\00\00\00\00\00\00x\86\00\90\00\00\00\00\00\00\00\00\00\00\00\00\88\86\00\90\00\00\00\00\00\00\00\00\00\00\00\00\98\86\00\90\00\00\00\00\00\00\00\00\00\00\00\00\A8\86\00\90\00\00\00\00\00\00\00\00\00\00\00\00\B8\86\00\90\00\00\00\00\00\00\00\00\00\00\00\00Ȇ\00\90\00\00\00\00\00\00\00\00\00\00\00\00؆\00\90\00\00\00\00\00\00\00\00\00\00\00\00\E8\86\00\90\00\00\00\00\00\00\00\00\00\00\00\00\F8\86\00\90\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00X\85\00\90\83\00\90\00\B4\90}\00\00\E6f\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\0
0\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\A9\EE\00\00\00
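To see what is actually stored in such a file, the raw bytes can be dumped directly; this is only a minimal sketch (Python 2), and "output.txt" is a placeholder name rather than one of the real paths used in the code below:

    # Sketch only: read the file back in binary mode and print its raw bytes.
    # "output.txt" is a placeholder for one of the generated .txt files.
    f = open("output.txt", "rb")
    raw = f.read()
    f.close()
    print repr(raw[:200])   # NUL bytes show up as '\x00', so binary junk is easy to spot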

The code that writes the files:

# -*- coding: utf-8 -*-
import dataPreprocessing
import re,os
##from nltk.corpus import stopwords
##from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
st = PorterStemmer()
def positive_test_unigram_with_stopword_record():
    data1=""
    path_to_BST_unigram="/home/user/Final_Thesis/MOVIE/POSITIVE_FEATURE/TESTING/unigram_with_stemming_without_stopword/BST_unigram.txt"
    Actual_path = "/home/user/Final_Thesis/MOVIE/Dataset/POSITIVE/TESTING/"
    path_to_write_processdata = "/home/user/Final_Thesis/MOVIE/POSITIVE_FEATURE/TESTING/unigram_with_stemming_without_stopword/"
    d = os.path.dirname(path_to_write_processdata)
    if not os.path.exists(d) :
        os.makedirs(d)
    for i in range(1,501):
        files=Actual_path+str(i)+".txt"
        files1=path_to_write_processdata+str(i)+".txt"
        fileread=open(files,"r")
        file_content=fileread.read()

##  Reading content of files ###       
        data_After_caseconversion=dataPreprocessing.upperTolower(file_content)

        data_After_normalize=dataPreprocessing.normalize(data_After_caseconversion)
##        print data_After_normalize
        data_After_non_ascii=dataPreprocessing.punctuation_removal(data_After_normalize)        
        data_After_non_ascii1=dataPreprocessing.remove_digits(data_After_non_ascii)

        data_After_non_ascii1=dataPreprocessing.stripEscape(data_After_non_ascii1)
        data =  re.sub( '\s+', ' ',data_After_non_ascii1)

        actual_data=dataPreprocessing.nltk_stopwords_removal(data)
        actual_data=[x for x in actual_data if x]
        actual_data=dataPreprocessing.remove_words_less_than_2(actual_data)
        actual_data=str(actual_data).replace("[",'').replace("]",'').replace("'",'').replace(",",'')

        dataPreprocessing.File_Write(actual_data,files1)
        data1=data1+str(actual_data)+" "


    dataPreprocessing.File_Write_Append_mode(data1,path_to_BST_unigram)
    fileread.close()

def negative_test_unigram_with_stopword_record():
    data1=""
    path_to_BST_unigram="/home/user/Final_Thesis/MOVIE/NEGATIVE_FEATURE/TESTING/unigram_with_stemming_without_stopword/BST_unigram.txt"
    Actual_path = "/home/user/Final_Thesis/MOVIE/Dataset/NEGATIVE/TESTING/"
    path_to_write_processdata = "/home/user/Final_Thesis/MOVIE/NEGATIVE_FEATURE/TESTING/unigram_with_stemming_without_stopword/"
    d = os.path.dirname(path_to_write_processdata)
    if not os.path.exists(d) :
        os.makedirs(d)
    for i in range(1,501):
        files=Actual_path+str(i)+".txt"
        files1=path_to_write_processdata+str(i)+".txt"
        fileread=open(files,"r")
        file_content=fileread.read()        
##  Reading content of files ###       
        data_After_caseconversion=dataPreprocessing.upperTolower(file_content)

        data_After_normalize=dataPreprocessing.normalize(data_After_caseconversion)
##        print data_After_normalize
        data_After_non_ascii=dataPreprocessing.punctuation_removal(data_After_normalize)        
        data_After_non_ascii1=dataPreprocessing.remove_digits(data_After_non_ascii)

        data_After_non_ascii1=dataPreprocessing.stripEscape(data_After_non_ascii1)
        data =  re.sub( '\s+', ' ',data_After_non_ascii1)

        actual_data=dataPreprocessing.nltk_stopwords_removal(data)
        actual_data=[x for x in actual_data if x]
        actual_data=dataPreprocessing.remove_words_less_than_2(actual_data)
        actual_data=str(actual_data).replace("[",'').replace("]",'').replace("'",'').replace(",",'')

        dataPreprocessing.File_Write(actual_data,files1)
        data1=data1+str(actual_data)+" "


    dataPreprocessing.File_Write_Append_mode(data1,path_to_BST_unigram)
    fileread.close()
#............................................................................................................................................##

def positive_unigram_with_stopword_record():
    data1=""
    path_to_BST_unigram="/home/user/Final_Thesis/MOVIE/POSITIVE_FEATURE/TRAINING/unigram_with_stemming_without_stopword/BST_unigram.txt"
    Actual_path = "/home/user/Final_Thesis/MOVIE/Dataset/POSITIVE/TRAINING/"
    path_to_write_processdata = "/home/user/Final_Thesis/MOVIE/POSITIVE_FEATURE/TRAINING/unigram_with_stemming_without_stopword/"
    d = os.path.dirname(path_to_write_processdata)
    if not os.path.exists(d) :
        os.makedirs(d)
    for i in range(1,501):
##       for i in range(1,501):
        files=Actual_path+str(i)+".txt"
        files1=path_to_write_processdata+str(i)+".txt"
        fileread=open(files,"r")
        file_content=fileread.read()        
##  Reading content of files ###       
        data_After_caseconversion=dataPreprocessing.upperTolower(file_content)

        data_After_normalize=dataPreprocessing.normalize(data_After_caseconversion)
##        print data_After_normalize
        data_After_non_ascii=dataPreprocessing.punctuation_removal(data_After_normalize)        
        data_After_non_ascii1=dataPreprocessing.remove_digits(data_After_non_ascii)

        data_After_non_ascii1=dataPreprocessing.stripEscape(data_After_non_ascii1)
        data =  re.sub( '\s+', ' ',data_After_non_ascii1)

        actual_data=dataPreprocessing.nltk_stopwords_removal(data)
        actual_data=[x for x in actual_data if x]
        actual_data=dataPreprocessing.remove_words_less_than_2(actual_data)
        actual_data=str(actual_data).replace("[",'').replace("]",'').replace("'",'').replace(",",'')

        dataPreprocessing.File_Write(actual_data,files1)
        data1=data1+str(actual_data)+" "


    dataPreprocessing.File_Write_Append_mode(data1,path_to_BST_unigram)
    fileread.close()

def negative_unigram_with_stopword_record():
    data1=""
    path_to_BST_unigram="/home/user/Final_Thesis/MOVIE/NEGATIVE_FEATURE/TRAINING/unigram_with_stemming_without_stopword/BST_unigram.txt"
    Actual_path = "/home/user/Final_Thesis/MOVIE/Dataset/NEGATIVE/TRAINING/"
    path_to_write_processdata = "/home/user/Final_Thesis/MOVIE/NEGATIVE_FEATURE/TRAINING/unigram_with_stemming_without_stopword/"
    d = os.path.dirname(path_to_write_processdata)
    if not os.path.exists(d) :
        os.makedirs(d)
    for i in range(1,501):
        files=Actual_path+str(i)+".txt"
        files1=path_to_write_processdata+str(i)+".txt"
        fileread=open(files,"r")
        file_content=fileread.read()        
##  Reading content of files ###       
        data_After_caseconversion=dataPreprocessing.upperTolower(file_content)

        data_After_normalize=dataPreprocessing.normalize(data_After_caseconversion)
##        print data_After_normalize
        data_After_non_ascii=dataPreprocessing.punctuation_removal(data_After_normalize)        
        data_After_non_ascii1=dataPreprocessing.remove_digits(data_After_non_ascii)

        data_After_non_ascii1=dataPreprocessing.stripEscape(data_After_non_ascii1)
        data =  re.sub( '\s+', ' ',data_After_non_ascii1)

        actual_data=dataPreprocessing.nltk_stopwords_removal(data)
        actual_data=[x for x in actual_data if x]
        actual_data=dataPreprocessing.remove_words_less_than_2(actual_data)
        actual_data=str(actual_data).replace("[",'').replace("]",'').replace("'",'').replace(",",'')

        dataPreprocessing.File_Write(actual_data,files1)
        data1=data1+str(actual_data)+" "


    dataPreprocessing.File_Write_Append_mode(data1,path_to_BST_unigram)
    fileread.close()

##############   Positive data ####################

positive_unigram_with_stopword_record()
positive_test_unigram_with_stopword_record()
print " Extracted unigram_with_stemming_without_stopword of MOVIE_Postive_training and testing "

##########   Negative data ####################



negative_test_unigram_with_stopword_record()
negative_unigram_with_stopword_record()
print " Extracted unigram_with_stemming_without_stopword of MOVIE_Negative_training and testing"

dataPreprocessing.py

# -*- coding: utf-8 -*-
from __future__ import division
import nltk
import csv
import re, string, timeit
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize import TreebankWordTokenizer
from nltk.util import ngrams
import csv
import nltk
import unicodedata
from string import *
from collections import OrderedDict
from nltk.corpus import stopwords
import fileinput
import re
import sys
from itertools import groupby, product
import enchant # $ pip install pyenchant
def remove_consecutive_dups(s):
   return re.sub(r'(?i)(.)\1+', r'\1', s)

def all_consecutive_duplicates_edits(word, max_repeat=float('inf')):
   chars = [[c*i for i in range(min(len(list(dups)), max_repeat), 0, -1)]
   for c, dups in groupby(word)]
   return map(''.join, product(*chars))


#############################################################################################################
####                                    N-grams                                                           ###
#############################################################################################################
def ngram1(tokens,Ngram):
    ngramss = ngrams(tokens,int(Ngram))
##    print "******************"+str(Ngram)+"**************************"
##    print ngramss
##    print "*************************************************"
    return(ngramss)

############################    File writing function  ##################
def File_Write_main(write_ist,filename):
    filewrite=open(filename,"w")
    filewrite.write(str(write_ist))
    filewrite.close()
    return      

############################    File writing function  ##################
def File_Write(write_ist,filename):
    filewrite=open(filename,"w")
    filewrite.write(str(write_ist))
    filewrite.close()
    return
def stopwords_removal(data):
   operators = set(('nor','none', 'no', 'not'))
   stop = set(nltk.corpus.stopwords.words('english')) - operators
   bi=[]
   a=data.lower().split(' ')
   for i in range(0,len(a)):
    if a[i] not in list(stop):
        bi.append(a[i])
   return bi

def nltk_stopwords_removal(data):

   word_list=data.split(" ")
   filtered_words = [w for w in word_list if not w in stopwords.words('english')]
   return filtered_words


def File_Write_Append_mode(write_ist,filename):
    filewrite=open(filename,"a")
    filewrite.write(write_ist)
    filewrite.close()
    return

############################    File reading function  ##################
def File_Read(filename):
    fileread=open(filename,"r")
    read_list=fileread.read()
    fileread.close()
    return(read_list)
################################################################
## :::::::Converting Data from Uppercase to Lowercase:::::    ##
################################################################
def upperTolower(data):
    data=str(data)
    data= data.lower()
    return(data)

################################################################
## :::::::After Normalization of each words:                  ##
################################################################
def normalize(data):
    data = re.sub("n`t|n't"," not",data)
    data = re.sub("wud","would",data)
    data = re.sub("bcoz|bcause","because",data)
    data = re.sub("thanx|thankz","thanks",data)
    data = re.sub("aka","also known as",data)
    data = re.sub("i'm|i`m","i am",data)
    data = re.sub("it's|it`s","it is",data)
    data = re.sub("can't|can`t|cannot","can not",data)
    data = re.sub("couldnt|couldn't|couldn'","could not",data)
    data = re.sub("wouldnt|wouldn't|wouldn'","would not",data)
    data = re.sub("i've|i`ve","i have",data)
    data = re.sub("don't|don't|dont","do not",data)
    data = re.sub("doesn't|doesnt","does not",data)
    data = re.sub("didn't|didn't|didnt","did  not",data)
    data = re.sub("won't|won't|wont","will not",data)
    data = re.sub("isn't|isn`t|isnt","is not",data)
    data = re.sub("i'll|i`ll","i will",data)
    data = re.sub("you're|u're|u'r","you are",data)
    data = re.sub("i've|i`ve","i have",data)
    data = re.sub("it's|it`s","it is",data)
    data = re.sub("i'd|i`d","i would",data)
    data = re.sub("it'd|it`d","it would",data)
    data = re.sub("we're|v`re","we are",data)
    data = re.sub("wasn't|wasnt|wasn’t","was  not",data)
    data = re.sub("hadnt|hadn't","had not",data)
    data = re.sub("you'd|u`d","you had",data)
    data = re.sub("there've","there have",data)
    data = re.sub("they've|theyve","they have",data)
    data = re.sub("you've|uve|u've|youve","you have",data)
    data = re.sub("they're","they are",data)
    data = re.sub("'s"," is",data)
    data = re.sub("'ve"," have",data)
    data = re.sub("maybe|mayb","may be",data)
    data=re.sub("you'll","you will",data)
    return data

def remove_words_less_than_2(data):    
    a=' '.join(word for word in data if len(word)>=2)
##    a=' '.join(word for word in data if len(word)>2)
    return a


###############################################################
##         ::::::::Remove digits::::::                         #
###############################################################
def remove_digits(s):
    data=re.sub(r"[0-9]+",'',s)
##    data= re.sub(" \d+"," ", s)    
##    data=re.sub("[^A-Za-z ]", "", s)
    return data
##
################################################################
##          :::::::Non-ASCII Removal:::::::                   ##
################################################################
##def strip_non_ascii(string):
##   for c in string:
##      if (0<ord(c)>127):
##         string=string.replace(c,' ')
##   return string

def strip_non_ascii(string):
    data=""
    ''' Returns the string without non ASCII characters'''
    data=(c for c in string if 0 < ord(c) < 127)
    return ''.join(data)


def stripEscape(s):
##    """ Removes all escape sequences from the input string """
##    delete = ""
##    i=1
##    while (i<0x20):
##        delete += chr(i)
##        i += 1
##    t = string.translate(None, delete)
##    return t

##    ansi_escape = re.compile(r'\x1b[^m]*m')
##    
##    a=ansi_escape.sub('', string)
    a=filter(lambda x: x in string.printable, s)
    return a


def  remove_zeroes(sent):
    a="0"
    data=(c for c in sent if c==a)
    return ''.join(data)

################################################################
##       :::::::Punctuation Removal:::::::::::::::            ##
################################################################
##def punctuation_removal(punc_remove):
##    predicate = lambda x:x not in string.punctuation
##    data=filter(predicate,punc_remove)
##    return data

################################################################
##       :::::::Punctuation Removal:::::::::::::::            ##
################################################################
def punctuation_removal(punc_remove):
   regex = re.compile('[%s]' % re.escape(string.punctuation))
   out = regex.sub(' ', punc_remove)
   return out

################################################################
##       :::::::Removing new lines::::::::::::::::            ##
################################################################
def remove_newlines(data):
##    data=re.sub('[\n]+', ' ', data)
##    data=re.sub('[\s\b]+', ' ', data)
##    data=data.rstrip()
    data = re.sub('[\s]+|[\n]+',' ',data)
    return data

################################################################
##       :::::::Tokenization using space::::::::::            ##
################################################################
def tokenize(data):
    data = re.sub("0|1|2|3|4|5|6|7|8|9"," ",data)
##    my_string=[x.replace("0|1|2|3|4|5|6|7|8|9"," ") for x in data.split(' ')]
    data= data.split(' ')
    return data
################################################################
##       :::::::POS tagging using PennTree Bank Tagger        ##
##       :::::::Write POS tag into a file:::::::::            ##
################################################################
def PennTreeBank_POSTagger(data):
    postag = nltk.pos_tag(data)
    return(postag)

################################################################
##       :::::::Remove elongated string                   ##
################################################################
def remove_elogatedStrings(data):
   words = enchant.Dict("en")
   is_known_word = words.check
   #NOTE: unnecessary work, optimize if needed
   output = [next((e for e in all_consecutive_duplicates_edits(s)
   if e and is_known_word(e)), remove_consecutive_dups(s))
   for s in re.split(r'(\W+)', data)]
   uniq=[]
   for i in output:
       if i not in uniq:
           uniq.append(i)
   return " ".join(uniq)

The code that reads the files:

 path="/home/user/Final_Thesis/MOVIE/POSITIVE_FEATURE/TRAINING/"+foldername+"/"
                 for i1 in range(1,501):
                    frqlist=[]
                    root.reset_frequency()
                    input_file=path+str(i1)+".txt"

                    input_file_ptr=open(input_file,"r")
                    read_content=input_file_ptr.read()

                    read_content=read_content.strip()
                    read_content=read_content.split(" ")

How can I read this file correctly using Python? Does anyone know how? Please help.

0 Answers:

No answers