清除推文,不显示任何内容

时间:2017-05-15 14:48:11

标签: python regex twitter tweets

我试图运行下面的代码来清理txt文件中的一组推文

我也在命令行中定义了参数,但似乎没有输出

知道我可能哪里做错了吗?

以下是代码:

代码:

#!/usr/bin/python
# -*- coding: utf-8 -*-

import sys
import os
import re
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import pos_tag


def clean(path, filename):
    """Clean a file of tweets line by line and write the unique results out.

    For every input line the function removes a fixed set of punctuation
    characters, a leading ``RT`` marker, everything from the first
    ``http(s)://`` onward, colons, non-printable/non-ASCII characters, and
    every whitespace-separated token that starts with ``@`` or ``#`` or that
    is found inside ``string.punctuation``.  Lines whose cleaned text was
    already seen are skipped; non-empty results are written followed by a
    blank line.

    NOTE(review): ``path`` is never read -- the input file is hard-coded in
    the ``open`` call below.  Neither file handle is explicitly closed.

    :param path: not used by the body.
    :param filename: stripped and appended to the module-level
        ``CLEANED_DATA`` value to form the output path.
    :returns: the output path that was opened for writing.
    """

    # print("Cleaning "+path)

    filename = CLEANED_DATA + filename.strip()
    WRITE_HANDLER = open(filename, 'wb')
    # Used as a set: keys are cleaned lines that have already been handled.
    tweets = dict()
    # NOTE(review): hard-coded input; the ``path`` argument is ignored.
    for line in open('/Users/Mustafa/Desktop/nexalogy/project3.txt',
                 'rb'):
        line = re.sub(r'[.,"!]+', '', line, flags=re.MULTILINE)  # drop periods, commas, double quotes and exclamation marks
        line = re.sub(r'^RT[\s]+', '', line, flags=re.MULTILINE)  # drop a leading "RT" retweet marker
        line = re.sub(r'https?:\/\/.*[\r\n]*', '', line,
                  flags=re.MULTILINE)  # drop everything from the first link to the end of the line
        line = re.sub(r'[:]+', '', line, flags=re.MULTILINE)  # drop colons
        # Keep only characters listed in string.printable (removes non-ASCII).
        # NOTE(review): .split() is called on the result below, which relies on
        # filter() returning a string -- presumably Python 2; confirm.
        line = filter(lambda x: x in string.printable, line)

        new_line = ''
        for i in line.split():  # drop @mentions, #hashtags and punctuation tokens
            # ``i not in string.punctuation`` is a substring test against the
            # punctuation string, not a per-character check.
            if not i.startswith('@') and not i.startswith('#') and i \
                not in string.punctuation:
                new_line += i + ' '
        line = new_line

        # # Do sentence correction

        # Skip cleaned lines that were already seen; remember new ones.
        if new_line in tweets:
            continue
        else:
            tweets[new_line] = 1
        # Write non-empty results; the literal below is two newline characters.
        if len(new_line.strip()) > 0:
           WRITE_HANDLER.write(new_line + '''

''')
    return filename


DATA_FOLDER = sys.argv[1]   # root directory that is scanned for input files
CLEANED_DATA = sys.argv[2]  # read by clean() as the prefix of every output path
# Walk DATA_FOLDER recursively and run clean() on every regular file found.
for (root, dirs, files) in os.walk(DATA_FOLDER):
    for name in files:
        absolute_path = os.path.join(root, name)
        # '.DS_Store' entries are skipped; they are not tweet files.
        if os.path.isfile(absolute_path) and name != '.DS_Store':
            filename = clean(absolute_path, name)

文件:Project3.txt

{" created_at":" Tue Oct 04 17:16:30 +0000 2016"," id":783355126945722368," id_str":" 783355126945722368"," text":" RT @Jacquiecharles:USAID为人道主义合作伙伴(不是GOH)提供400,000美元的初步援助,以迅速提供重要的救济。 \ U2026""截断":假,"实体" {"#标签":[],"符号":[] ," user_mentions":[{" screen_name":" Jacquiecharles"," name":" Jacqueline Charles",& #34; ID":15360434," ID_STR":" 15360434""指数":[3,18]}],"网址& #34;:[]},"元数据" {" iso_language_code":"恩"" result_type的":"最近& #34;},"来源":" Twitter for iPhone< \ / a>"," in_reply_to_status_id":null," in_reply_to_status_id_str" :空," in_reply_to_user_id":空," in_reply_to_user_id_str":空," in_reply_to_screen_name":空,"使用者" {&#3 4; id":635031678," id_str":" 635031678"," name":" Tracie Hamilton"," screen_name":" TracieHamilton8"," location":""," description":" Leaning&每天取决于他"," url":null," entity":{" description":{" urls":[] }},"保护":假," FOLLOWERS_COUNT":1929," FRIENDS_COUNT":715," listed_count":63,#34& ; created_at":" Fri Jul 13 23:39:46 +0000 2012"," favourites_count":27603," utc_offset":null,&#34 ;的time_zone":空," geo_enabled":真,"验证":假," statuses_count":17433"朗&#34 ;: "恩"" contributors_enabled":假," is_translator":假," is_translation_enabled":假," profile_background_color" :" C0DEED"" profile_background_image_url":" HTTP:\ / \ / abs.twimg.com \ /图像\ /主题\ / THEME1 \ /bg.png" ;," profile_background_image_url_https":" HTTPS:\ / \ / abs.twimg.com \ /图像\ /主题\ / THEME1 \ /bg.png"," profile_background_tile&# 34;:假," profile_image_url":" HTTP:\ / \ / pbs.twimg.com \ / profile_images \ / 575645183288610817 \ / 5vJNgPld_norm al.jpeg"" profile_image_url_https":" HTTPS:\ / \ / pbs.twimg.com \ / profile_images \ / 575645183288610817 \ /5vJNgPld_normal.jpeg"," profile_link_color":" 0084B4"" profile_sidebar_border_color":" C0DEED"" profile_sidebar_fill_color":" DDEEF6&#34 ;, " profile_text_color":" 333333"" profile_use_background_image":真," has_extended_profile":假," DEFAULT_PROFILE":真" default_profile_image":假,"以下":假," follow_request_sent":假,"通知":假}"地理":空,"坐标":空,"地方":空,"贡献者":空," 
retweeted_status":{ " created_at":" Tue Oct 04 01:27:02 +0000 2016"," id":783116185726357504," id_str":&# 34; 783116185726357504"

=============================================== =========================

2 个答案:

答案 0 :(得分:3)

这是一个完全正常的版本:

#!/usr/bin/python
# -*- coding: utf-8 -*-

import sys
import os
import re
import string

def _clean_tweet_line(line):
    """Return *line* with punctuation, links, mentions, hashtags and non-ASCII removed.

    The result is the surviving words, each followed by a single space
    (so it is ``''`` when nothing survives).
    """
    line = re.sub(r'[.,"!]+', '', line, flags=re.MULTILINE)  # periods, commas, quotes, bangs
    line = re.sub(r'^RT[\s]+', '', line, flags=re.MULTILINE)  # leading retweet marker
    # Everything from the first link to the end of the line is discarded.
    line = re.sub(r'https?:\/\/.*[\r\n]*', '', line, flags=re.MULTILINE)
    line = re.sub(r'[:]+', '', line, flags=re.MULTILINE)
    # str.join instead of filter(): filter() only returns a string on Python 2.
    line = ''.join(ch for ch in line if ch in string.printable)
    # ``w not in string.punctuation`` is deliberately the same substring test
    # the original used: it drops tokens made of adjacent punctuation marks.
    words = [w for w in line.split()
             if not w.startswith(('@', '#')) and w not in string.punctuation]
    return ''.join(w + ' ' for w in words)


def clean(inputDir, outputFile):
    """Clean the tweets in ``project3.json`` under *inputDir* into *outputFile*.

    Every input line is treated as one tweet.  Each line is cleaned,
    duplicates (after cleaning) are written only once, and empty results are
    skipped.  Every written tweet is followed by a blank line.

    Args:
        inputDir: Directory that contains ``project3.json``.  A trailing path
            separator is accepted but no longer required.
        outputFile: Path of the file to create or overwrite.

    Returns:
        *outputFile*, unchanged.

    Raises:
        IOError/OSError: if the input cannot be read or the output cannot be
            written.
    """
    import io  # local import: a text-mode open() that behaves the same on Python 2 and 3

    source = os.path.join(inputDir, 'project3.json')
    seen = set()
    # The input is decoded as UTF-8; undecodable bytes are dropped, which is
    # harmless because non-ASCII characters are filtered out anyway.
    # newline='\n' keeps a bare '\r' inside a tweet from acting as a line break.
    with io.open(source, 'r', encoding='utf-8', errors='ignore', newline='\n') as reader, \
            io.open(outputFile, 'w', encoding='utf-8', newline='\n') as writer:
        for raw in reader:
            cleaned = _clean_tweet_line(raw)
            if cleaned in seen:
                continue
            seen.add(cleaned)
            if cleaned.strip():
                # Blank-line separator so consecutive tweets do not run together.
                writer.write(cleaned + '\n\n')
    return outputFile


# Both arguments are required; fail with a usage line instead of an IndexError.
if len(sys.argv) < 3:
    sys.exit('usage: python clean.py inputDirectory outputFileName')
DATA_FOLDER = sys.argv[1]   # directory that contains project3.json
CLEANED_DATA = sys.argv[2]  # file the cleaned tweets are written to
clean(DATA_FOLDER, CLEANED_DATA)

您可以通过以下方式调用它:

python clean.py inputDirectory outputFileName

答案 1 :(得分:1)

您的代码存在一些问题:

  1. 您在 'for line in open' 语句中硬编码了输入文件的路径
  2. 您的输出文件名不会是 clean.txt,而是 CLEANED_DATA 与每个输入文件名拼接后的结果(例如 clean.txt文件名1、clean.txt文件名2……),目录中的每个文件都会生成一个输出文件
  3. 有一些奇怪的缩进
  4. 您发布的JSON全部在一行,因此被删除标点符号的语句删除了
  5. 您传入的是文件名,但代码却试图以它为起点遍历文件系统(os.walk)。您应该传入目录名:

    python clean.py DIR_NAME CLEAN_FILE

  6. 修复缩进并美化JSON后,我得到了正确的输出:

    def clean(path, filename):
        """Clean the tweets stored in the file *path* and write them to a new file.

        Every input line is treated as one tweet.  Punctuation, a leading
        ``RT`` marker, links, colons, non-ASCII characters, @mentions,
        #hashtags and punctuation-only tokens are removed.  Duplicates (after
        cleaning) are written only once and empty results are skipped.  Every
        written tweet is followed by a blank line.

        Args:
            path: File to read.  (The previous revision ignored this argument
                and always read ``./project3.json``.)
            filename: Stripped and appended to ``CLEANED_DATA`` to form the
                output path.  ``CLEANED_DATA`` is read from module scope and
                is not defined in this snippet.

        Returns:
            The output path that was written.
        """
        import io  # local import: a text-mode open() that behaves the same on Python 2 and 3

        filename = CLEANED_DATA + filename.strip()
        print(filename)  # parenthesised so the statement is valid on Python 2 and 3
        seen = set()
        # The input is decoded as UTF-8; undecodable bytes are dropped, which is
        # harmless because non-ASCII characters are filtered out anyway.
        # newline='\n' keeps a bare '\r' inside a tweet from acting as a line break.
        with io.open(path, 'r', encoding='utf-8', errors='ignore', newline='\n') as reader, \
                io.open(filename, 'w', encoding='utf-8', newline='\n') as writer:
            for line in reader:
                line = re.sub(r'[.,"!]+', '', line, flags=re.MULTILINE)  # periods, commas, quotes, bangs
                line = re.sub(r'^RT[\s]+', '', line, flags=re.MULTILINE)  # leading retweet marker
                # Everything from the first link to the end of the line is discarded.
                line = re.sub(r'https?:\/\/.*[\r\n]*', '', line, flags=re.MULTILINE)
                line = re.sub(r'[:]+', '', line, flags=re.MULTILINE)
                # str.join instead of filter(): filter() only returns a string on Python 2.
                line = ''.join(ch for ch in line if ch in string.printable)

                # Same substring test against string.punctuation as before.
                kept = [word for word in line.split()
                        if not word.startswith(('@', '#'))
                        and word not in string.punctuation]
                new_line = ''.join(word + ' ' for word in kept)

                if new_line in seen:
                    continue
                seen.add(new_line)
                if new_line.strip():
                    # Blank-line separator so consecutive tweets do not run together.
                    writer.write(new_line + '\n\n')
        return filename