我已经实现了(外部)自然平衡双向合并排序,用于对文本文件进行排序(文本文件包含用换行符分隔的字符串)。该算法可以按预期工作,但是我要改进其中的一部分。首先,该算法分为两部分:第一部分分配输入数据,第二部分合并它。该算法的第一部分工作良好,但第二部分的实现方式不正确。这里的问题是,借助getNextStringRunLength
函数,可以从两个临时文件中预先读取下一次运行的时间。
例如:
int n = getNextStringRunLength(temp_file_1)
int m = getNextStringRunLength(temp_file_2)
该算法合并数据,直到将(n + m)个值合并到长度为(n + m)的新行程中。这里的想法是在运行中合并数据,而不读取之前的游程长度。这里的主要问题是如何即时执行合并过程?
input.txt的示例
汽车
电机
自行车
书
约翰
卡尔
output.txt的样本
自行车
书
汽车
卡尔
约翰
电机
请参见下面的代码,感谢您的回答!
/**
* Sorts input text file of strings using natural merge procedure with a help of two auxiliary files.
* In the first part it uses these files to distribute input data and then merges them in the second part.
* Input data is distributed using <i>writeNextStringRun</i> method which writes next string run
* to auxiliary file. When it merges runs to a length of (m+n) it uses function<i>getNextStringRunLength</i>
* which returns length of a next string run found in auxiliary files and thus helping to merge run of length m
* and run of length n into a single run of length (m+n).
* <p>
* Distribute and merge procedure repeat until all the data is sorted in ascending order. The algorithm
* produces a brand new output file which contains sorted data and thus retains input file unchanged.
* Since the algorithm is balanced it changes the role of input/output files after merge part ends. At the
* end of algorithm all empty auxiliary files are deleted from the local file system, the auxiliary file which
* contains sorted data becomes output file of the algorithm.
* <p>
* @param temp_files number of auxiliary files used for data distribution and merging.
* @param working_dir path to local directory where all the sorting takes place.
* @param main_string_file local text file which contains all input data separated by new a line character.
* @throws IOException if an input or output exception occurred during file operations.
*/
public static void naturalBalancedTwoWayMergeSortStrings(int temp_files, String working_dir, File main_string_file) throws IOException
{
long data_read = 0;
String last_string_runs = new String[temp_files];
int i,j;
int n,m;
int rw_switch = 0;
int run_counter = 0;
String line_1 = null;
String line_2 = null;
int file_write_pointer;
long output_file_length = 0;
String file_extension = ".txt";
File input_files[] = new File[temp_files];
File output_files[] = new File[temp_files];
String input_file_name = "input_string_file_";
String output_file_name = "output_string_file_";
File sorted_file = new File(working_dir + "/main_string_sorted" + file_extension);
BufferedWriter input_file_writers[] = new BufferedWriter[temp_files];
BufferedReader input_file_readers[] = new BufferedReader[temp_files];
BufferedReader run_length_readers[] = new BufferedReader[temp_files];
BufferedWriter output_file_writers[] = new BufferedWriter[temp_files];
BufferedReader main_file_reader = new BufferedReader(new FileReader(main_string_file));
for(int p=0; p<temp_files; p++)
{
input_files[p] = new File(working_dir + input_file_name + (p+1) + file_extension);
output_files[p] = new File(working_dir + output_file_name + (p+1) + file_extension);
input_file_writers[p] = new BufferedWriter(new FileWriter(input_files[p],true));
run_length_readers[p] = new BufferedReader(new FileReader(input_files[p]));
}
/* START - initial run distribution */
long main_file_length = main_string_file.length();
while(data_read < main_file_length)
{
writeNextStringRun(main_file_reader,input_file_writers[run_counter%2]);
run_counter++;
}
main_file_reader.close();
input_file_writers[0].close();
input_file_writers[1].close();
/* END - initial run distribution. */
/* START - merge all runs. */
do
{
for(int r=0; r<temp_files; r++)
{
input_file_readers[r] = new BufferedReader(new FileReader(input_files[r]));
run_length_readers[r] = new BufferedReader(new FileReader(input_files[r]));
output_file_writers[r] = new BufferedWriter(new FileWriter(output_files[r],true));
}
/* >>> This is not the righ approach <<< */
n = getNextStringRunLength(run_length_readers[0],0);
m = getNextStringRunLength(run_length_readers[1],1);
try
{
line_1 = input_file_readers[0].readLine();
}
catch(Exception e){}
try
{
line_2 = input_file_readers[1].readLine();
}
catch(Exception e){}
file_write_pointer = 0;
while(n > 0 || m > 0)
{
i = 1;
j = 1;
while((line_1 != null && i <= n) && (line_2 != null && j <= m))
{
if(line_1.compareTo(line_2) < 0)
{
output_file_writers[file_write_pointer%2].write(line_1 + "\n");
try
{
line_1 = input_file_readers[0].readLine();
}
catch(Exception e){}
i++;
}
else
{
output_file_writers[file_write_pointer%2].write(line_2 + "\n");
try
{
line_2 = input_file_readers[1].readLine();
}
catch(Exception e){}
j++;
}
}
while(line_1 != null && i <= n)
{
output_file_writers[file_write_pointer%2].write(line_1 + "\n");
try
{
line_1 = input_file_readers[0].readLine();
}
catch(Exception e){}
i++;
}
while(line_2 != null && j <= m)
{
output_file_writers[file_write_pointer%2].write(line_2 + "\n");
try
{
line_2 = input_file_readers[1].readLine();
}
catch(Exception e){}
j++;
}
file_write_pointer++;
/* >>> This is not the righ approach <<< */
n = getNextStringRunLength(run_length_readers[0],0);
m = getNextStringRunLength(run_length_readers[1],1);
}
/* END - merge all runs. */
for(int k=0; k<temp_files; k++)
{
input_file_readers[k].close();
run_length_readers[k].close();
output_file_writers[k].close();
}
/* START - swap io files. */
File temp_store[] = new File[temp_files];
switch(rw_switch % 2)
{
case 0:
output_file_length = output_files[0].length();
for(int k=0; k<temp_files; k++)
{
input_file_writers[k] = new BufferedWriter(new FileWriter(input_files[k]));
temp_store[k] = input_files[k];
input_files[k] = output_files[k];
output_files[k] = temp_store[k];
}
break;
case 1:
output_file_length = output_files[0].length();
for(int k=0; k<temp_files; k++)
{
temp_store[k] = output_files[k];
output_files[k] = input_files[k];
input_files[k] = temp_store[k];
output_file_writers[k] = new BufferedWriter(new FileWriter(output_files[k]));
}
break;
}
/* END - swap io files. */
rw_switch++;
}
while(main_file_length > output_file_length);
/* END - merge all runs. */
}
/**
* Writes next string run to auxiliary file.
*
* @param input_file_reader a reader which reads strings from input text file.
* @param input_writer a writer which writes next string run to a auxiliary file.
* @throws IOException if an input or output exception occurred during file operations.
*/
private static void writeNextStringRun(BufferedReader input_file_reader, BufferedWriter input_writer) throws IOException
{
try
{
if(last_string_run != null)
{
input_writer.write(last_string_run + "\n");
last_string_run = null;
}
String min_value = "";
String current_line = input_file_reader.readLine();
while(current_line != null)
{
if(current_line.compareTo(min_value) >= 0)
{
input_writer.write(current_line + "\n");
data_read += current_line.length() + 1;
min_value = current_line;
current_line = input_file_reader.readLine();
}
else
{
last_string_run = current_line;
return;
}
}
}
catch(Exception e){}
}
/**
* Returns length of next string run in auxiliary string file.
*
* @param input_file_reader a reader which reads strings from auxiliary input file.
* @param input_file_index an index of auxiliary file from which it reads next string run length.
* This parameter is used in <i>last_string_runs</i> array which contains last runs read from
* a file with this parameter (index).
* @return next string run length found in auxiliary string file.
* @throws IOException if an input or output exception occurred during file operations.
*/
private static int getNextStringRunLength(BufferedReader input_file_reader, int input_file_index) throws IOException
{
int run_length = 0;
try
{
if(last_string_runs[input_file_index] != null)
{
run_length++;
}
String min_value = "";
String current_line = input_file_reader.readLine();
if(run_length > 0)
{
min_value = last_string_runs[input_file_index];
}
while(current_line != null)
{
if(current_line.compareTo(min_value) >= 0)
{
run_length++;
min_value = current_line;
current_line = input_file_reader.readLine();
}
else
{
last_string_runs[input_file_index] = current_line;
return run_length;
}
}
}
catch(Exception e){}
last_string_runs[input_file_index] = null;
return run_length;
}