Question

我正在尝试编写一个脚本来读取文件夹中的html文件的内容，擦除它们，然后将内容写入csv文件以便在另一个程序中进行分析。该脚本工作正常，只要html文件数量很少（HTML文件少于1000个），但是当html文件计数超过（在我的情况下我有大约9,000个文件要擦除）时，程序将崩溃。我认为问题是有关写csv的部分是错误的。该脚本在html文件夹中运行两次。这是第一次获得所有独特的单词。在此部分期间，内存使用量保持较低且相对恒定。在第二个循环通过html文件夹期间，内存使用情况升级为崩溃，我不知道为什么。我很感激任何建议。

    import Foundation

    let filemanager:FileManager = FileManager();
    let path2folder:String = "My Documents/web";
    let directory:String = NSHomeDirectory() + "/" + path2folder;
    let files1 = filemanager.enumerator(atPath: directory);
    let files2 = filemanager.enumerator(atPath: directory); //Because I loop twice through the folder
    var count = 0;
    var mime_count = 0;

    var bag_of_words:[String] = [""]
    var words_in_file:[String] = [""]
    var csv_array:[String] = [""]
    var file_array:[[String:Int]] = [[:]]

    func trimWhiteSpace(array: [String]) -> [String]{
        var trim: [String] = []
        for word in array{
            trim.append(word.trimmingCharacters(in: CharacterSet.whitespacesAndNewlines))
        }
        return trim;
    }
    func deleteDuplicateWords(array: [String]) -> [String] {
        var found_words_set = Set<String>()
        var unique: [String] = []
        for word in array {
            if found_words_set.contains(word) {
                // Do not add a duplicate word.
            }
            else {
                // Add word to the unique array.
                found_words_set.insert(word)
                // ... Append the word.
                unique.append(word)
            }
        }
        return unique
    }
    func deleteFlaggedWords(array: [String]) -> [String] {
        var found_words_set = Set<String>()
        found_words_set.insert("")
        found_words_set.insert("|")
        //...

        var clean: [String] = []
        for word in array {
            if found_words_set.contains(word) {
                // Do not add a duplicate word.
            }
            else {
                // ... Append the word.
                clean.append(word)
            }
        }
        return clean
    }
    func deletePunctuation(array: [String]) -> [String]{
        var scrubbed: [String] = []

        for word in array {
            var scrubbed_word = word

            let first = word.characters.first!
            if(first == "." ||
                first == "," ||
                first == "!" ||
                first == "?" ||
                first == ":" ||
                first == ";" ||
                first == ")" ||
                first == "("){
                scrubbed_word.remove(at: scrubbed_word.startIndex)
            }
            if(first == "\"" ||
                first == "'"){
                 scrubbed_word.remove(at: scrubbed_word.startIndex)
                //scrubbed_word = scrubbed_word.remove(at: scrubbed_word.startIndex)
            }

            let last = word.characters.last!
            if(last == "." ||
                last == "," ||
                last == "!" ||
                last == "?" ||
                last == ":" ||
                last == ";" ||
                last == ")" ||
                last == "("){
                scrubbed_word = word.substring(to: word.index(before: word.endIndex))
            }
            if(last == "\"" ||
                last == "'"){
                scrubbed_word = word.substring(to: word.index(before: word.endIndex))
                //scrubbed_word = scrubbed_word.substring(to: scrubbed_word.index(before: scrubbed_word.endIndex))
            }
            scrubbed.append(scrubbed_word)

        }
        return scrubbed
    }
    func sortWordArray(array: [String]) -> [String]{
        var sorted_array:[String]

        sorted_array = array.sorted { $0.localizedCaseInsensitiveCompare($1) == ComparisonResult.orderedAscending     }
        return sorted_array
    }

    print("Processing...")

    /**
     * Loop through all of the files in the folder
     * To find the all of the words used in all of the file
     **/
    while let file = files1?.nextObject(){
        /**
         * if you find a file, pass the location of the file to the dir variable
         **/
        if var dir = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask).first {
            dir = dir.deletingLastPathComponent()
            var path_st = path2folder + "/" + String(describing: file)
            var path = dir.appendingPathComponent(path_st)

            //reading html file
            do {
                var html_file_contents = try String(contentsOf: path, encoding: String.Encoding.utf8)
                count = count + 1;

                words_in_file = html_file_contents.components(separatedBy: CharacterSet.whitespacesAndNewlines)

                words_in_file = trimWhiteSpace(array: words_in_file)

                words_in_file = deleteFlaggedWords(array: words_in_file)

                words_in_file = deletePunctuation(array: words_in_file)

                bag_of_words = bag_of_words + words_in_file
            }
            catch {
                /* error handling here */
            }
        }
    }

    //OK, HERE'S ALL THE UNIQUE 'WORDS' IN ALL HTML FILES
    bag_of_words = trimWhiteSpace(array: bag_of_words)
    bag_of_words = deleteDuplicateWords(array: bag_of_words)
    bag_of_words = deleteFlaggedWords(array: bag_of_words)
    bag_of_words = deletePunctuation(array: bag_of_words)
    bag_of_words = sortWordArray(array: bag_of_words)
    bag_of_words = deleteDuplicateWords(array: bag_of_words) //Just do once more

    /**
     * Now that you have the word count array
     * Write word count array to a csv file
     **/
    let csv_file_name = "data.csv"
    let csv_path = URL(fileURLWithPath:     NSHomeDirectory()).appendingPathComponent(csv_file_name)
    //Create the First line in the CSV
    var csv_text = ""
    for w in bag_of_words{
        csv_text = csv_text + "\(w),"
    }    
    csv_text = csv_text + "\n"

    do{
        try csv_text.write(to: csv_path, atomically: true, encoding: String.Encoding.utf8)
    }
    catch{
        print("\(error)")
    }

    /**
     * Now create the 2-D Array that will track the word count
     * Of each file and populate it with the word counts
     **/
    count = 0
    mime_count = 0
    while let file = files2?.nextObject(){

        /**
         * if you find a file, pass the location of the file to the dir variable
         **/
        if var dir = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask).first {

            dir = dir.deletingLastPathComponent()
            var path_st = path2folder + "/" + String(describing: file)
            var path = dir.appendingPathComponent(path_st)

            //reading html file
            do {
                var html_file_contents = try String(contentsOf: path, encoding: String.Encoding.utf8)
                count = count + 1;

                words_in_file = html_file_contents.components(separatedBy: CharacterSet.whitespacesAndNewlines)

                words_in_file = trimWhiteSpace(array: words_in_file)
                words_in_file = deleteFlaggedWords(array: words_in_file)
                words_in_file = deletePunctuation(array: words_in_file)

                var counts:[String:Int] = [:]

                //Zero Out Counts
                for (index, element) in bag_of_words.enumerated(){
                    counts[element] = 0
                }

                //COUNT WORDS FOR THIS FILE
                let bag_of_words_set = Set(bag_of_words)
                for (index, element) in words_in_file.enumerated(){
                    if(counts[element] != nil){
                        counts[element] = counts[element]! + 1
                    }
                }
                csv_text = ""
                do {
                    for c in counts{
                        csv_text = csv_text + "\(c.value),"
                    }
                    csv_text.remove(at: csv_text.index(before: csv_text.endIndex))
                    csv_text = csv_text + "\n"
                    let csv_data = csv_text.data(using: String.Encoding.utf8)!
                    //I THINK THIS IS WHERE THE MEMORY IS GOING CRAZY
                    if(FileManager.default.fileExists(atPath: csv_path.path)){
                        do{
                            let file_handle = try FileHandle(forUpdating: csv_path)
                            file_handle.seekToEndOfFile()
                            file_handle.write(csv_data)
                            file_handle.closeFile()
                        }
                        catch{
                            print(error)
                        }
                    }
                }
                catch {/* error handling here */
                    print("\(error)")
                }
            }
            catch {
            /* error handling here */
        }
    }
}

Swift命令行写入文件会导致内存崩溃

0 个答案: