
时间:2012-11-11 15:09:38

标签: perl file stop-words


use Lingua::StopWords qw(getStopWords);
my $stopwords = getStopWords('en');

# Every entry matched by <*> in the current directory is treated as an input file.
@files = <*>;

foreach $file (@files) {
    open (input, $file);

    while (<input>) {
        # NOTE(review): the output file is re-opened (append mode) for every
        # input line; opening it once per input file would be enough.
        open (output,">>c:/perl/normalized/".$file);
        #####What should I write here to remove the stop words#####
        $_ =~s/<[^>]*>//g;            # drop anything of the form <...>
        $_ =~ s/\s\.//g;              # drop a whitespace character followed by a dot
        $_ =~ s/[[:punct:]]\.//g;     # drop a punctuation character followed by a dot
        # Only when a word of four or more characters is directly followed by
        # a dot are all dots removed from the line.
        if($_ =~ m/(\w{4,})\./) {
            $_ =~ s/\.//g;
        }
        $_ =~ s/^\.//g;               # drop a dot at the very start of the line
        # NOTE(review): the replacement is the literal three characters ' '
        # (quotes included); a plain space was probably intended -- confirm.
        $_ =~ s/,/' '/g;
        # Intended to strip ( ) \ / - and ' ; the doubled "||" adds empty
        # alternatives to the pattern.
        $_ =~ s/\(||\)||\\||\/||-||\'//g;

        # NOTE(review): $_ still carries its own line ending here, so the
        # extra "\n" adds a blank line after each output line.
        print output "$_\n";
    }
}

close (input);
close (output);

2 个答案:

答案 0 :(得分:2)


# The stop words are the keys of %$stopwords whose associated value is true.
my @stopwords = grep { $stopwords->{$_} } keys %$stopwords;

# remove all occurrences of @stopwords from $_
for my $w (@stopwords) {
    # \Q...\E quotes any regex metacharacters contained in the stop word;
    # the surrounding \b anchors restrict the match to whole words.
    s/\b\Q$w\E\b//ig;
}
我们还使用 \b 来匹配单词边界，这样可以确保不会误删出现在另一个单词中间的停用词。希望这对你有用——效果在很大程度上取决于你的输入文本是什么样的，例如其中是否含有标点符号等。

答案 1 :(得分:0)

# Always use these in your Perl programs.
use strict;
use warnings;

use File::Basename qw(basename);
use Lingua::StopWords qw(getStopWords);

# It's often better to build scripts that take their input
# and output locations as command-line arguments rather than
# being hard-coded in the program.
my $input_dir   = shift @ARGV;
my $output_dir  = shift @ARGV;

# Without both arguments the glob below would expand "/*" and the output
# path would point at the filesystem root, so stop early instead.
die "Usage: $0 INPUT_DIR OUTPUT_DIR\n"
    unless defined $input_dir && defined $output_dir;

my @input_files = glob "$input_dir/*";

# Convert the hash ref of stop words to a regular array.
# Also quote any regex characters in the stop words.
my @stop_words  = map quotemeta, keys %{getStopWords('en')};

for my $infile (@input_files){
    # Open both input and output files at the outset.
    # Your posted code reopened the output file for each line of input.
    my $fname   = basename $infile;
    my $outfile = "$output_dir/$fname";
    open(my $fh_in,  '<', $infile)  or die "$!: $infile";
    open(my $fh_out, '>', $outfile) or die "$!: $outfile";

    # Process the data: you need to iterate over all stop words
    # for each line of input.
    while (my $line = <$fh_in>){
        $line =~ s/\b$_\b//ig for @stop_words;
        print $fh_out $line;
    }

    # Close the files within the per-file loop, not outside of it.
    # Buffered write errors only surface when the output handle is closed,
    # so that close is checked.
    close $fh_in;
    close $fh_out or die "$!: $outfile";
}