Question

我试图运行下面的代码来清理txt文件中的一组推文

我也在命令行中定义了参数，但似乎没有输出

知道我可能哪里做错了吗？

以下是代码：

代码：

#!/usr/bin/python
# -*- coding: utf-8 -*-

import sys
import os
import re
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import pos_tag


def clean(path, filename):
    """Clean the tweets in a hard-coded input file and write the unique ones.

    Every line of the input is treated as one tweet. A fixed set of
    punctuation characters, a leading "RT", everything from the first
    http(s) link to the end of the line, colons and non-printable characters
    are stripped; whitespace-separated words that start with '@' or '#', or
    that consist only of punctuation, are then dropped. Each distinct,
    non-empty result is written once, followed by a blank line.

    NOTE: ``path`` is never read in the body; the input file name is
    hard-coded in the ``open()`` call below.
    NOTE: relies on the module-level ``CLEANED_DATA`` being defined before
    the function is called.

    Args:
        path: Not used by the current implementation.
        filename: Name that is stripped and appended to ``CLEANED_DATA`` to
            form the output file name.

    Returns:
        The output file name that was written to.
    """

    # print("Cleaning "+path)

    # The output name is plain string concatenation of the CLEANED_DATA
    # prefix and the stripped file name (no path separator is inserted).
    filename = CLEANED_DATA + filename.strip()
    # NOTE: this handle is never closed explicitly anywhere in the function.
    WRITE_HANDLER = open(filename, 'wb')
    # Cleaned lines seen so far; used only for membership tests (de-duplication).
    tweets = dict()
    for line in open('/Users/Mustafa/Desktop/nexalogy/project3.txt',
                 'rb'):
        line = re.sub(r'[.,"!]+', '', line, flags=re.MULTILINE)  # removes the characters specified
        line = re.sub(r'^RT[\s]+', '', line, flags=re.MULTILINE)  # removes RT
        # `.*` is greedy, so everything from the link to the end of the line
        # is removed, not just the link itself.
        line = re.sub(r'https?:\/\/.*[\r\n]*', '', line,
                  flags=re.MULTILINE)  # remove link
        line = re.sub(r'[:]+', '', line, flags=re.MULTILINE)
        # NOTE: calling .split() on this result (below) relies on Python 2,
        # where filter() on a str returns a str.
        line = filter(lambda x: x in string.printable, line)  # filter non-ascii characters

        new_line = ''
        for i in line.split():  # remove @ and #words, punctuation
            # `i not in string.punctuation` is a substring test against the
            # punctuation string, so it drops words made up of punctuation.
            if not i.startswith('@') and not i.startswith('#') and i \
                not in string.punctuation:
                new_line += i + ' '
        line = new_line

        # # Do sentence correction

        # Skip cleaned lines that have already been seen.
        if new_line in tweets:
            continue
        else:
            tweets[new_line] = 1
        # Only non-blank results are written; the literal appends two newlines.
        if len(new_line.strip()) > 0:
           WRITE_HANDLER.write(new_line + '''

''')
    return filename


# Command-line arguments: the folder to scan and the prefix that clean()
# prepends to every output file name.
DATA_FOLDER = sys.argv[1]
CLEANED_DATA = sys.argv[2]
# os.walk yields nothing when DATA_FOLDER is not a directory, in which case
# clean() is never called and no output is produced.
for (root, dirs, files) in os.walk(DATA_FOLDER):  # gets all the files from subfolders recursively
    for name in files:
        absolute_path = os.path.join(root, name)
        # Skip the macOS Finder metadata file.
        if os.path.isfile(absolute_path) and name != '.DS_Store':
            filename = clean(absolute_path, name)

文件：Project3.txt

{＆＃34; created_at＆＃34;：＆＃34; Tue Oct 04 17:16:30 +0000 2016＆＃34;，＆＃34; id＆＃34;：783355126945722368，＆＃34; id_str＆＃34;：＆＃34; 783355126945722368＆＃34;，＆＃34; text＆＃34;：＆＃34; RT @Jacquiecharles：USAID为人道主义合作伙伴（不是GOH）提供400,000美元的初步援助，以迅速提供重要的救济。 \ U2026＆＃34;＆＃34;截断＆＃34;：假，＆＃34;实体＆＃34; {＆＃34;＃标签＆＃34;：[]，＆＃34;符号＆＃34;：[] ，＆＃34; user_mentions＆＃34;：[{＆＃34; screen_name＆＃34;：＆＃34; Jacquiecharles＆＃34;，＆＃34; name＆＃34;：＆＃34; Jacqueline Charles＆＃34;，＆＃34; ID＆＃34;：15360434，＆＃34; ID_STR＆＃34;：＆＃34; 15360434＆＃34;＆＃34;指数＆＃34;：[3,18]}]，＆＃34;网址＆＃34;：[]}，＆＃34;元数据＆＃34; {＆＃34; iso_language_code＆＃34;：＆＃34;恩＆＃34;＆＃34; result_type的＆＃34;：＆＃34;最近＆＃34;}，＆＃34;来源＆＃34;：＆＃34; Twitter for iPhone＆lt; \ / a＆gt;＆＃34;，＆＃34; in_reply_to_status_id＆＃34;：null，＆＃34; in_reply_to_status_id_str＆＃34; ：空，＆＃34; in_reply_to_user_id＆＃34;：空，＆＃34; in_reply_to_user_id_str＆＃34;：空，＆＃34; in_reply_to_screen_name＆＃34;：空，＆＃34;使用者＆＃34; {＆＃3 4; id＆＃34;：635031678，＆＃34; id_str＆＃34;：＆＃34; 635031678＆＃34;，＆＃34; name＆＃34;：＆＃34; Tracie Hamilton＆＃34;，＆＃34; screen_name＆＃34;：＆＃34; TracieHamilton8＆＃34;，＆＃34; location＆＃34;：＆＃34;＆＃34;，＆＃34; description＆＃34;：＆＃34; Leaning＆amp;每天取决于他＆＃34;，＆＃34; url＆＃34;：null，＆＃34; entity＆＃34;：{＆＃34; description＆＃34;：{＆＃34; urls＆＃34;：[] }}，＆＃34;保护＆＃34;：假，＆＃34; FOLLOWERS_COUNT＆＃34;：1929，＆＃34; FRIENDS_COUNT＆＃34;：715，＆＃34; listed_count＆＃34;：63，＃34＆ ; created_at＆＃34;：＆＃34; Fri Jul 13 23:39:46 +0000 2012＆＃34;，＆＃34; favourites_count＆＃34;：27603，＆＃34; utc_offset＆＃34;：null，＆＃34 ;的time_zone＆＃34;：空，＆＃34; geo_enabled＆＃34;：真，＆＃34;验证＆＃34;：假，＆＃34; statuses_count＆＃34;：17433＆＃34;朗＆＃34 ;: ＆＃34;恩＆＃34;＆＃34; contributors_enabled＆＃34;：假，＆＃34; is_translator＆＃34;：假，＆＃34; is_translation_enabled＆＃34;：假，＆＃34; profile_background_color＆＃34; ：＆＃34; C0DEED＆＃34;＆＃34; profile_background_image_url＆＃34;：＆＃34; HTTP：\ / \ / abs.twimg.com \ /图像\ /主题\ / THEME1 \ /bg.png" ;，＆＃34; profile_background_image_url_https＆＃34;：＆＃34; HTTPS：\ / \ / abs.twimg.com \ /图像\ /主题\ / THEME1 \ /bg.png"，＆＃34; profile_background_tile＆＃ 34;：假，＆＃34; 
profile_image_url＆＃34;：＆＃34; HTTP：\ / \ / pbs.twimg.com \ / profile_images \ / 575645183288610817 \ / 5vJNgPld_norm al.jpeg＆＃34;＆＃34; profile_image_url_https＆＃34;：＆＃34; HTTPS：\ / \ / pbs.twimg.com \ / profile_images \ / 575645183288610817 \ /5vJNgPld_normal.jpeg"，＆＃34; profile_link_color＆＃34;：＆＃34; 0084B4＆＃34;＆＃34; profile_sidebar_border_color＆＃34;：＆＃34; C0DEED＆＃34;＆＃34; profile_sidebar_fill_color＆＃34;：＆＃34; DDEEF6＆＃34 ;, ＆＃34; profile_text_color＆＃34;：＆＃34; 333333＆＃34;＆＃34; profile_use_background_image＆＃34;：真，＆＃34; has_extended_profile＆＃34;：假，＆＃34; DEFAULT_PROFILE＆＃34;：真＆＃34; default_profile_image＆＃34;：假，＆＃34;以下＆＃34;：假，＆＃34; follow_request_sent＆＃34;：假，＆＃34;通知＆＃34;：假}＆＃34;地理＆＃34;：空，＆＃34;坐标＆＃34;：空，＆＃34;地方＆＃34;：空，＆＃34;贡献者＆＃34;：空，＆＃34; retweeted_status＆＃34;：{ ＆＃34; created_at＆＃34;：＆＃34; Tue Oct 04 01:27:02 +0000 2016＆＃34;，＆＃34; id＆＃34;：783116185726357504，＆＃34; id_str＆＃34;：＆＃ 34; 783116185726357504＆＃34;

=============================================== =========================

Answer 1

这是一个可以正常运行的完整版本：

#!/usr/bin/python
# -*- coding: utf-8 -*-

import sys
import os
import re
import string

def clean(inputDir, outputFile):
    """Clean the tweets in ``project3.json`` and write the unique ones out.

    Every line of the input file is treated as one tweet and is processed
    in this order:

    1. remove the characters ``. , " !``;
    2. remove a leading ``RT`` and the whitespace after it;
    3. remove everything from the first http(s) link to the end of the line;
    4. remove colons;
    5. remove characters that are not in ``string.printable``;
    6. drop words that start with ``@`` or ``#`` and words that consist
       only of punctuation.

    Each distinct, non-empty result is written once, followed by a blank
    line. Works on both Python 2.7 and Python 3.

    Args:
        inputDir: Directory that contains ``project3.json``. A trailing
            path separator is optional.
        outputFile: Path of the file to create (or overwrite) with the
            cleaned tweets.

    Returns:
        ``outputFile``, unchanged.

    Raises:
        IOError / OSError: If the output file cannot be created or the
            input file cannot be opened.
    """
    # Local import: io.open gives text-mode files with an explicit encoding
    # on Python 2 as well as Python 3.
    import io

    # os.path.join works whether or not inputDir ends with a separator;
    # plain concatenation produced e.g. "data./project3.json".
    input_path = os.path.join(inputDir, 'project3.json')

    # Compiled once, applied in this order to every line.
    patterns = (
        re.compile(r'[.,"!]+', flags=re.MULTILINE),  # removes the characters specified
        re.compile(r'^RT[\s]+', flags=re.MULTILINE),  # removes RT
        # NOTE: `.*` is greedy, so the rest of the line after a link is
        # removed together with the link.
        re.compile(r'https?:\/\/.*[\r\n]*', flags=re.MULTILINE),  # remove link
        re.compile(r'[:]+', flags=re.MULTILINE),
    )

    seen = set()  # cleaned lines already handled, for de-duplication

    # The context managers guarantee both files are flushed and closed.
    # Undecodable bytes are ignored on input: they could never survive the
    # printable-character filter anyway. newline='\n' keeps the output bytes
    # identical on every platform.
    with io.open(outputFile, 'w', encoding='utf-8', newline='\n') as writer, \
            io.open(input_path, 'r', encoding='utf-8', errors='ignore') as reader:
        for line in reader:
            for pattern in patterns:
                line = pattern.sub('', line)

            # filter non-ascii characters; join() keeps the result a string
            # on Python 3, where filter() returns an iterator.
            line = ''.join(ch for ch in line if ch in string.printable)

            # remove @ and #words, punctuation
            words = [
                word for word in line.split()
                if not word.startswith(('@', '#'))
                and word not in string.punctuation
            ]
            # Every kept word is followed by a single space.
            new_line = ''.join(word + ' ' for word in words)

            if new_line in seen:
                continue
            seen.add(new_line)

            if new_line.strip():
                # Separate tweets with a blank line so that they do not run
                # together in the output.
                writer.write(new_line + u'\n\n')
    return outputFile


# Fail with a usage message instead of a bare IndexError when an argument
# is missing.
if len(sys.argv) < 3:
    sys.exit('usage: python clean.py inputDirectory outputFileName')

DATA_FOLDER = sys.argv[1]   # directory that contains the input file
CLEANED_DATA = sys.argv[2]  # path of the output file
clean(DATA_FOLDER, CLEANED_DATA)

您可以通过以下方式调用它：

python clean.py inputDirectory outputFileName

Answer 2

您的代码存在一些问题：

您在 'for line in open' 语句中硬编码了输入文件
您的输出文件名不会是clean.txt，而是clean.txt后面直接拼接上各个输入文件的名字（例如clean.txtfile1、clean.txtfile2……）——目录中的每个文件都会生成一个输出文件
有一些奇怪的缩进
您发布的JSON全部在一行，因此被删除标点符号的语句删除了
您传入的是一个文件名，但代码却试图以它为根目录用os.walk遍历文件系统。您应该这样传参：

python clean.py DIR_NAME CLEAN_FILE

修复缩进并美化JSON后，我得到了正确的输出：

def clean(path, filename):
    """Clean the tweets in ``./project3.json`` and write the unique ones out.

    Every line of the input file is treated as one tweet. A fixed set of
    punctuation characters, a leading ``RT``, everything from the first
    http(s) link to the end of the line, colons and non-printable characters
    are removed; words that start with ``@`` or ``#`` and words that consist
    only of punctuation are dropped. Each distinct, non-empty result is
    written once, followed by a blank line. Works on both Python 2.7 and
    Python 3.

    NOTE: ``path`` is not read; the input is always ``./project3.json``
    relative to the current working directory.
    NOTE: relies on the module-level ``CLEANED_DATA`` being defined before
    the function is called.

    Args:
        path: Not used by the current implementation.
        filename: Name that is stripped and appended to ``CLEANED_DATA`` to
            form the output file name.

    Returns:
        The output file name that was written to.

    Raises:
        IOError / OSError: If the output file cannot be created or the
            input file cannot be opened.
    """
    # Local import: io.open gives text-mode files with an explicit encoding
    # on Python 2 as well as Python 3.
    import io

    # print("Cleaning "+path)

    # Plain concatenation: no path separator is inserted between the two.
    filename = CLEANED_DATA + filename.strip()
    print(filename)  # parenthesised form prints the same on Python 2 and 3

    # Compiled once, applied in this order to every line.
    patterns = (
        re.compile(r'[.,"!]+', flags=re.MULTILINE),  # removes the characters specified
        re.compile(r'^RT[\s]+', flags=re.MULTILINE),  # removes RT
        # NOTE: `.*` is greedy, so the rest of the line after a link is
        # removed together with the link.
        re.compile(r'https?:\/\/.*[\r\n]*', flags=re.MULTILINE),  # remove link
        re.compile(r'[:]+', flags=re.MULTILINE),
    )

    seen = set()  # cleaned lines already handled, for de-duplication

    # The context managers guarantee both files are flushed and closed.
    # Undecodable bytes are ignored on input: they could never survive the
    # printable-character filter anyway. newline='\n' keeps the output bytes
    # identical on every platform.
    with io.open(filename, 'w', encoding='utf-8', newline='\n') as writer, \
            io.open('./project3.json', 'r', encoding='utf-8', errors='ignore') as reader:
        for line in reader:
            for pattern in patterns:
                line = pattern.sub('', line)

            # filter non-ascii characters; join() keeps the result a string
            # on Python 3, where filter() returns an iterator.
            line = ''.join(ch for ch in line if ch in string.printable)

            # remove @ and #words, punctuation
            words = [
                word for word in line.split()
                if not word.startswith(('@', '#'))
                and word not in string.punctuation
            ]
            # Every kept word is followed by a single space.
            new_line = ''.join(word + ' ' for word in words)

            # # Do sentence correction

            if new_line in seen:
                continue
            seen.add(new_line)

            if new_line.strip():
                # Separate tweets with a blank line so that they do not run
                # together in the output.
                writer.write(new_line + u'\n\n')
    return filename

清除推文，不显示任何内容

2 个答案: