我正在尝试编写一个脚本来读取文件夹中的html文件的内容,擦除它们,然后将内容写入csv文件以便在另一个程序中进行分析。该脚本工作正常,只要html文件数量很少(HTML文件少于1000个),但是当html文件计数超过(在我的情况下我有大约9,000个文件要擦除)时,程序将崩溃。我认为问题是有关写csv的部分是错误的。该脚本在html文件夹中运行两次。这是第一次获得所有独特的单词。在此部分期间,内存使用量保持较低且相对恒定。在第二个循环通过html文件夹期间,内存使用情况升级为崩溃,我不知道为什么。我很感激任何建议。
import Foundation
let filemanager:FileManager = FileManager();
let path2folder:String = "My Documents/web";
let directory:String = NSHomeDirectory() + "/" + path2folder;
let files1 = filemanager.enumerator(atPath: directory);
let files2 = filemanager.enumerator(atPath: directory); //Because I loop twice through the folder
var count = 0;
var mime_count = 0;
var bag_of_words:[String] = [""]
var words_in_file:[String] = [""]
var csv_array:[String] = [""]
var file_array:[[String:Int]] = [[:]]
func trimWhiteSpace(array: [String]) -> [String]{
var trim: [String] = []
for word in array{
trim.append(word.trimmingCharacters(in: CharacterSet.whitespacesAndNewlines))
}
return trim;
}
func deleteDuplicateWords(array: [String]) -> [String] {
var found_words_set = Set<String>()
var unique: [String] = []
for word in array {
if found_words_set.contains(word) {
// Do not add a duplicate word.
}
else {
// Add word to the unique array.
found_words_set.insert(word)
// ... Append the word.
unique.append(word)
}
}
return unique
}
func deleteFlaggedWords(array: [String]) -> [String] {
var found_words_set = Set<String>()
found_words_set.insert("")
found_words_set.insert("|")
//...
var clean: [String] = []
for word in array {
if found_words_set.contains(word) {
// Do not add a duplicate word.
}
else {
// ... Append the word.
clean.append(word)
}
}
return clean
}
func deletePunctuation(array: [String]) -> [String]{
var scrubbed: [String] = []
for word in array {
var scrubbed_word = word
let first = word.characters.first!
if(first == "." ||
first == "," ||
first == "!" ||
first == "?" ||
first == ":" ||
first == ";" ||
first == ")" ||
first == "("){
scrubbed_word.remove(at: scrubbed_word.startIndex)
}
if(first == "\"" ||
first == "'"){
scrubbed_word.remove(at: scrubbed_word.startIndex)
//scrubbed_word = scrubbed_word.remove(at: scrubbed_word.startIndex)
}
let last = word.characters.last!
if(last == "." ||
last == "," ||
last == "!" ||
last == "?" ||
last == ":" ||
last == ";" ||
last == ")" ||
last == "("){
scrubbed_word = word.substring(to: word.index(before: word.endIndex))
}
if(last == "\"" ||
last == "'"){
scrubbed_word = word.substring(to: word.index(before: word.endIndex))
//scrubbed_word = scrubbed_word.substring(to: scrubbed_word.index(before: scrubbed_word.endIndex))
}
scrubbed.append(scrubbed_word)
}
return scrubbed
}
func sortWordArray(array: [String]) -> [String]{
var sorted_array:[String]
sorted_array = array.sorted { $0.localizedCaseInsensitiveCompare($1) == ComparisonResult.orderedAscending }
return sorted_array
}
print("Processing...")
/**
* Loop through all of the files in the folder
* To find the all of the words used in all of the file
**/
while let file = files1?.nextObject(){
/**
* if you find a file, pass the location of the file to the dir variable
**/
if var dir = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask).first {
dir = dir.deletingLastPathComponent()
var path_st = path2folder + "/" + String(describing: file)
var path = dir.appendingPathComponent(path_st)
//reading html file
do {
var html_file_contents = try String(contentsOf: path, encoding: String.Encoding.utf8)
count = count + 1;
words_in_file = html_file_contents.components(separatedBy: CharacterSet.whitespacesAndNewlines)
words_in_file = trimWhiteSpace(array: words_in_file)
words_in_file = deleteFlaggedWords(array: words_in_file)
words_in_file = deletePunctuation(array: words_in_file)
bag_of_words = bag_of_words + words_in_file
}
catch {
/* error handling here */
}
}
}
//OK, HERE'S ALL THE UNIQUE 'WORDS' IN ALL HTML FILES
bag_of_words = trimWhiteSpace(array: bag_of_words)
bag_of_words = deleteDuplicateWords(array: bag_of_words)
bag_of_words = deleteFlaggedWords(array: bag_of_words)
bag_of_words = deletePunctuation(array: bag_of_words)
bag_of_words = sortWordArray(array: bag_of_words)
bag_of_words = deleteDuplicateWords(array: bag_of_words) //Just do once more
/**
* Now that you have the word count array
* Write word count array to a csv file
**/
let csv_file_name = "data.csv"
let csv_path = URL(fileURLWithPath: NSHomeDirectory()).appendingPathComponent(csv_file_name)
//Create the First line in the CSV
var csv_text = ""
for w in bag_of_words{
csv_text = csv_text + "\(w),"
}
csv_text = csv_text + "\n"
do{
try csv_text.write(to: csv_path, atomically: true, encoding: String.Encoding.utf8)
}
catch{
print("\(error)")
}
/**
* Now create the 2-D Array that will track the word count
* Of each file and populate it with the word counts
**/
count = 0
mime_count = 0
while let file = files2?.nextObject(){
/**
* if you find a file, pass the location of the file to the dir variable
**/
if var dir = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask).first {
dir = dir.deletingLastPathComponent()
var path_st = path2folder + "/" + String(describing: file)
var path = dir.appendingPathComponent(path_st)
//reading html file
do {
var html_file_contents = try String(contentsOf: path, encoding: String.Encoding.utf8)
count = count + 1;
words_in_file = html_file_contents.components(separatedBy: CharacterSet.whitespacesAndNewlines)
words_in_file = trimWhiteSpace(array: words_in_file)
words_in_file = deleteFlaggedWords(array: words_in_file)
words_in_file = deletePunctuation(array: words_in_file)
var counts:[String:Int] = [:]
//Zero Out Counts
for (index, element) in bag_of_words.enumerated(){
counts[element] = 0
}
//COUNT WORDS FOR THIS FILE
let bag_of_words_set = Set(bag_of_words)
for (index, element) in words_in_file.enumerated(){
if(counts[element] != nil){
counts[element] = counts[element]! + 1
}
}
csv_text = ""
do {
for c in counts{
csv_text = csv_text + "\(c.value),"
}
csv_text.remove(at: csv_text.index(before: csv_text.endIndex))
csv_text = csv_text + "\n"
let csv_data = csv_text.data(using: String.Encoding.utf8)!
//I THINK THIS IS WHERE THE MEMORY IS GOING CRAZY
if(FileManager.default.fileExists(atPath: csv_path.path)){
do{
let file_handle = try FileHandle(forUpdating: csv_path)
file_handle.seekToEndOfFile()
file_handle.write(csv_data)
file_handle.closeFile()
}
catch{
print(error)
}
}
}
catch {/* error handling here */
print("\(error)")
}
}
catch {
/* error handling here */
}
}
}