Question

我如何创建一个AppleScript来统计PDF中重复出现的单词，并按出现次数从高到低分层显示结果：出现次数最多的单词（附带计数）排在最上面，其次是出现次数第二多的单词，依此类推？我想在学校使用它：把PPT转换为PDF之后，运行这个脚本就能看出演示文稿中最重要的内容。

理想情况下，它应该过滤掉 the、so、it 等常见单词。

Answer 1

你问题的最后一部分（过滤掉某些单词）很简单。

只需设置一个列表并检查单词是否在其中。

    -- Count how often each word occurs in a plain-text file, skipping the
    -- words in ignoreList, then write a one-line-per-word report to
    -- reportFile and open it.

    -- Words that must not be counted.
    set ignoreList to {"to", "is"}
    -- Where the report is written (POSIX path).
    set reportFile to "/Users/USERNAME/Desktop/Word Frequencies.txt"
    -- The text to analyse (POSIX path). It must be absolute: the original
    -- line lacked the leading "/" and ended in a stray ")", which made the
    -- script fail to compile.
    set theTextFile to "/Users/USERNAME/Desktop/foo.txt"

    -- Read the file through the shell and split the result into words.
    set word_list to every word of (do shell script "cat " & quoted form of theTextFile)

    -- List of records of the form {the_word:<text>, the_count:<integer>}.
    set word_frequency_list to {}

    repeat with the_word_ref in word_list
        set the_current_word to contents of the_word_ref
        if the_current_word is not in ignoreList then

            -- Look for a record that already tracks this word.
            set word_info to missing value

            repeat with record_ref in word_frequency_list
                if the_word of record_ref = the_current_word then
                    set word_info to contents of record_ref
                    exit repeat
                end if
            end repeat

            if word_info = missing value then
                -- First occurrence: start a new record at count 1.
                set word_info to {the_word:the_current_word, the_count:1}
                set end of word_frequency_list to word_info
            else
                -- Seen before: bump the count on the record found above.
                set the_count of word_info to (the_count of word_info) + 1
            end if

        end if
    end repeat
    -- Debug aid: uncomment to inspect the raw records.
    --return word_frequency_list

    -- Turn every record into one human-readable report line.
    set the_report_list to {}
    repeat with word_info in word_frequency_list
        set end of the_report_list to quote & the_word of word_info & ¬
            quote & "  - appears " & the_count of word_info & " times."
    end repeat

    -- Join the lines with returns, write the report via the shell, then
    -- reset the text item delimiters.
    set AppleScript's text item delimiters to return
    set the_report to the_report_list as text
    do shell script "echo  " & quoted form of the_report & " >  " & quoted form of reportFile
    set AppleScript's text item delimiters to ""
    -- Short pause before opening the freshly written report.
    delay 1
    do shell script " open   " & quoted form of reportFile

我还修改了部分代码，改用shell脚本来读写文件——只是因为比起TextEdit，我更喜欢这种方式。

Answer 2

虽然如markhunte所示，用AppleScript是可行的，但它非常慢：在处理较大的文本或大量文件时，AppleScript的速度很成问题，我在测试中就放弃了它。所以，这里给出一个简短的shell脚本，它运行得非常快，需要时也可以从AppleScript中调用。

#!/bin/sh
#
# Print the ten most frequent words (three or more word characters) of a
# text file, ignoring every word listed in an exclusion file.
#
# Usage: <script> [wordsfile] [textfile]
#   wordsfile  words to exclude, one per line (matched case-insensitively)
#   textfile   plain-text file to analyse
#
# Exits 1 when an argument is missing or a file cannot be read.

if [ -z "${1:-}" ] || [ -z "${2:-}" ]; then
  echo "$0 [wordsfile] [textfile]" >&2
  exit 1
fi

INFILE="$2"
EXWORDS="$1"

for f in "$EXWORDS" "$INFILE"; do
  if [ ! -r "$f" ]; then
    echo "$0: cannot read '$f'" >&2
    exit 1
  fi
done

# Keep the intermediate word list in a private temp file rather than in
# "<textfile>.words": that name could clobber (and then delete) an existing
# file, and it was left behind whenever the script was interrupted.
WORDS=$(mktemp) || exit 1
trap 'rm -f -- "$WORDS"' EXIT
trap 'exit 1' HUP INT TERM

echo "File $INFILE has $(wc -w < "$INFILE") words."
echo "Excluding the $(wc -w < "$EXWORDS") words."

echo "Extracting words from file and removing common words..."
# -o prints each match on its own line; the second grep drops every line
# that equals (-x), ignoring case (-i), a line of the exclusion file.
grep -o -E '\w{3,}' -- "$INFILE" | grep -x -i -v -f "$EXWORDS" > "$WORDS"

echo "Top 10 most frequent words in $INFILE are..."
# Fold case before counting so "The" and "the" are one word. The character
# classes are quoted so the shell cannot glob-expand them.
tr '[:upper:]' '[:lower:]' < "$WORDS" | sort | uniq -c | sort -rn | head -n 10

# The temp file is removed by the EXIT trap.

Applescript重复字数

2 个答案: