如何执行文本文件的二进制搜索

时间:2012-04-04 11:27:13

标签: java android binary-search

我有一个大文本文件(5Mb),我在Android应用程序中使用。我将文件创建为预先排序的字符串列表,文件创建后不会更改。如何对该文件的内容执行二进制搜索,而无需逐行读取以查找匹配的字符串?

6 个答案:

答案 0 :(得分:6)

由于文件内容不会变化,您可以将文件拆分成多个部分,比如A-G、H-N、O-T和U-Z。这样只需检查第一个字符,就能立即把候选集合缩小到原始大小的四分之一左右。此时线性搜索不会花费很长时间,或者也可以把对应的那一部分整个读入内存。如果n/4仍然太大,可以继续细分,思路是一样的:把搜索的划分构建到文件结构中,而不是试图在内存中完成所有操作。

答案 1 :(得分:1)

5MB的文件不是那么大 - 您应该能够将每一行读入String[]数组,然后您可以使用java.util.Arrays.binarySearch()来查找所需的行。这是我推荐的方法。

如果您不想将整个文件读入您的应用,情况会变得复杂一些。如果文件的每一行长度相同,并且文件已经排序,那么您可以用RandomAccessFile打开该文件,并像下面这样使用seek()自己执行二分查找(binary search)......

// Binary search over a sorted file in which every record (line) occupies
// exactly lineSize bytes, so record i starts at byte offset i * lineSize.
String searchValue = "myline";
int lineSize = 50; // bytes per record, including any padding and the line terminator
boolean found = false;

// open the file for reading
RandomAccessFile raf = new RandomAccessFile("myfile.txt", "r");
try {
  // raf.length() returns a long, so keep the record count and indices as long
  long numberOfLines = raf.length() / lineSize;

  // perform the binary search over the inclusive index range [bottom, top]
  byte[] lineBuffer = new byte[lineSize];
  long bottom = 0;
  long top = numberOfLines - 1; // index of the last record, not one past it
  while (bottom <= top) {
    long middle = bottom + (top - bottom) / 2; // overflow-safe midpoint
    raf.seek(middle * lineSize); // jump to this record in the file
    raf.readFully(lineBuffer);   // read() may return fewer bytes than requested

    // Convert the record to a String. trim() drops the padding/terminator so the
    // record can compare equal to the unpadded search value.
    // NOTE(review): assumes records are padded with whitespace - confirm against the file format.
    String line = new String(lineBuffer, "UTF-8").trim();

    int comparison = line.compareTo(searchValue);
    if (comparison == 0) {
      // found it
      found = true;
      break;
    } else if (comparison < 0) {
      // line comes before searchValue
      bottom = middle + 1;
    } else {
      // line comes after searchValue
      top = middle - 1;
    }
  }
} finally {
  raf.close(); // close the file even if seeking or reading failed
}

但是,如果文件的行不是固定宽度的,那么不先将其加载到内存中就很难执行二分查找,因为您无法像处理固定宽度的行那样快速跳转到文件中的某一特定行。

答案 2 :(得分:1)

在每个字符长度统一的文本文件中,您可以跳转到当前搜索区间的中间位置(以字符计),从那里开始读取字符,直到遇到分隔符,然后把随后的字符串作为中间元素的近似值。但是,在Android中这样做的问题是,您显然不能get random access to a resource(不过我想您可以每次都重新打开它)。此外,这种技术无法推广到映射(map)和其他类型的集合。

另一种选择是(使用RandomAccessFile)在文件的开头写一个整数的“数组” - 每个字符串一个 - 然后返回并用它们相应字符串的位置更新它们。再次搜索将需要跳转。

我会做的(并且已经在我自己的应用程序中做过的)是在文件中实现一个hash set。下面这个实现使用树来做分离链接(separate chaining)。

import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.Set;

/**
 * A read-only set of strings stored in a file and queried without loading it into memory.
 *
 * <p>File layout, as produced by {@link #makeFile}:
 * <ol>
 *   <li>a comment written with {@code writeUTF} (2-byte length prefix plus the encoded bytes);</li>
 *   <li>an {@code int} bucket count;</li>
 *   <li>one {@code int} pointer slot per bucket;</li>
 *   <li>tree nodes, each a {@code writeUTF} key followed by a left and a right {@code int}
 *       pointer slot.</li>
 * </ol>
 * A pointer slot holds {@code -1} when nothing is attached; otherwise it holds the number of
 * bytes between the end of the slot and the start of the node it refers to. Because pointers are
 * forward distances, a lookup can be done with a plain sequential stream.
 */
class StringFileSet {

    private static final double loadFactor = 0.75;

    /** Value stored in a pointer slot that has no node attached. */
    private static final int NO_NODE = -1;

    /** Size in bytes of one pointer slot. */
    private static final int POINTER_SIZE = 4;

    /** A subtree that still has to be written: a key range plus the slot that must point to it. */
    private static final class NodeInfo {

        final int lower;
        final int upper;
        final long callingOffset;

        NodeInfo(int lower, int upper, long callingOffset) {
            this.lower = lower;
            this.upper = upper;
            this.callingOffset = callingOffset;
        }

    }

    /**
     * Writes {@code set} to {@code fileName}, replacing any existing file.
     *
     * @param fileName path of the file to create
     * @param comment  free text stored at the start of the file (limited by {@code writeUTF})
     * @param set      the strings to store
     * @throws IOException if the file cannot be written
     */
    public static void makeFile(String fileName, String comment, Set<String> set) throws IOException {
        new File(fileName).delete();
        RandomAccessFile fout = new RandomAccessFile(fileName, "rw");
        try {
            // Guard against stale content in case the delete above did not succeed.
            fout.setLength(0);

            //Write comment
            fout.writeUTF(comment);

            //Make bucket array; at least one bucket so that lookups never compute "hash % 0"
            int numBuckets = Math.max(1, (int) (set.size() / loadFactor));

            ArrayList<ArrayList<String>> bucketArray = new ArrayList<ArrayList<String>>(numBuckets);
            for (int ii = 0; ii < numBuckets; ii++) {
                bucketArray.add(new ArrayList<String>());
            }
            for (String key : set) {
                bucketArray.get(bucketIndex(key, numBuckets)).add(key);
            }

            //Sort key lists in preparation for creating balanced trees
            for (ArrayList<String> keyList : bucketArray) {
                Collections.sort(keyList);
            }

            //Write bucket array: every slot starts out empty and is patched when its root is written
            fout.writeInt(numBuckets);
            long bucketArrayStart = fout.getFilePointer();
            for (int index = 0; index < numBuckets; index++) {
                fout.writeInt(NO_NODE);
            }

            //Write trees, one bucket after another, each in breadth-first order
            for (int index = 0; index < numBuckets; index++) {
                ArrayList<String> keys = bucketArray.get(index);
                LinkedList<NodeInfo> pending = new LinkedList<NodeInfo>();
                pending.add(new NodeInfo(0, keys.size() - 1, bucketArrayStart + (long) POINTER_SIZE * index));

                while (!pending.isEmpty()) {
                    NodeInfo nodeInfo = pending.poll();
                    if (nodeInfo.lower > nodeInfo.upper) {
                        continue; // empty subtree: its slot keeps NO_NODE
                    }

                    //Set respective pointer in parent node.
                    //Distance instead of absolute position so that contains() can use a DataInputStream
                    long nodeStart = fout.length();
                    fout.seek(nodeInfo.callingOffset);
                    fout.writeInt((int) (nodeStart - (nodeInfo.callingOffset + POINTER_SIZE)));
                    fout.seek(nodeStart);

                    int middle = (nodeInfo.lower + nodeInfo.upper) >>> 1;

                    //Key
                    fout.writeUTF(keys.get(middle));

                    //Left child
                    pending.add(new NodeInfo(nodeInfo.lower, middle - 1, fout.getFilePointer()));
                    fout.writeInt(NO_NODE);

                    //Right child
                    pending.add(new NodeInfo(middle + 1, nodeInfo.upper, fout.getFilePointer()));
                    fout.writeInt(NO_NODE);
                }
            }
        } finally {
            fout.close();
        }
    }

    private final String fileName;
    private final int numBuckets;
    private final int bucketArrayOffset;

    /**
     * Opens a set file written by {@link #makeFile} and reads its header.
     *
     * @param fileName path of the set file
     * @throws IOException if the file cannot be read
     */
    public StringFileSet(String fileName) throws IOException {
        this.fileName = fileName;

        int commentBytes;
        int buckets;
        DataInputStream fin = open(fileName);
        try {
            // writeUTF stores an unsigned 16-bit length; readShort() would turn long comments negative
            commentBytes = fin.readUnsignedShort();
            skipFully(fin, commentBytes);
            buckets = fin.readInt();
        } finally {
            fin.close();
        }

        this.numBuckets = buckets;
        // 2 bytes of length prefix + the comment bytes + 4 bytes of bucket count
        this.bucketArrayOffset = commentBytes + 6;
    }

    /**
     * Tests whether {@code key} is stored in the file.
     *
     * @param key the string to look up
     * @return {@code true} if the file contains {@code key}
     * @throws IOException if the file cannot be read or is truncated or corrupt
     */
    public boolean contains(String key) throws IOException {
        if (this.numBuckets <= 0) {
            return false; // a file with no buckets cannot contain anything
        }

        DataInputStream fin = open(this.fileName);
        try {
            skipFully(fin, POINTER_SIZE * bucketIndex(key, this.numBuckets) + this.bucketArrayOffset);

            int distance = fin.readInt();
            while (distance != NO_NODE) {
                skipFully(fin, distance);

                String candidate = fin.readUTF();
                int order = key.compareTo(candidate);
                if (order == 0) {
                    return true;
                }
                if (order > 0) {
                    skipFully(fin, POINTER_SIZE); // step over the left pointer to reach the right one
                }
                distance = fin.readInt();
            }
            return false;
        } finally {
            fin.close();
        }
    }

    /** Maps a key to its bucket; must stay identical for writing and reading. */
    private static int bucketIndex(String key, int numBuckets) {
        return Math.abs(key.hashCode() % numBuckets);
    }

    /** Opens the set file as a buffered sequential stream. */
    private static DataInputStream open(String fileName) throws IOException {
        return new DataInputStream(new BufferedInputStream(new FileInputStream(fileName)));
    }

    /** Skips exactly {@code count} bytes, failing instead of silently skipping fewer. */
    private static void skipFully(DataInputStream in, int count) throws IOException {
        if (count < 0) {
            throw new IOException("Corrupt set file: negative skip distance " + count);
        }
        int remaining = count;
        while (remaining > 0) {
            int skipped = in.skipBytes(remaining);
            if (skipped > 0) {
                remaining -= skipped;
            } else {
                in.readByte(); // throws EOFException at end of file, otherwise advances one byte
                remaining--;
            }
        }
    }

}

测试程序

import java.io.File;
import java.io.IOException;
import java.util.HashSet;

/** Small demo: writes a three-word set to disk, probes it, then removes the file. */
class Test {
    public static void main(String[] args) throws IOException {
        final String path = "stringSet";
        final String[] members = {"red", "yellow", "blue"};
        final String[] probes = {"orange", "red", "yellow", "blue"};

        // Build the in-memory set that gets serialised to the file.
        HashSet<String> inMemory = new HashSet<String>();
        for (String member : members) {
            inMemory.add(member);
        }

        StringFileSet.makeFile(path, "Provided under ... included in all copies and derivatives ...", inMemory);
        StringFileSet onDisk = new StringFileSet(path);

        // Print one "<word> -> <true|false>" line per probe, in the listed order.
        for (String probe : probes) {
            System.out.println(probe + " -> " + onDisk.contains(probe));
        }

        // Clean up the generated file and finish with a blank line.
        new File(path).delete();
        System.out.println();
    }
}

如果你为android修改它,你还需要pass a Context,所以它可以访问getResources()方法。

如果您正在使用GUI,那么您可能还需要stop the android build tools from compressing the file,这显然只能通过将文件的扩展名改为jpg之类的名称来实现。这使我的应用程序中的处理速度提高了大约100到300倍。

您也可以研究一下通过使用NDK来giving yourself more memory。

答案 3 :(得分:0)

这是我快速整理的东西。它使用两个文件,一个带有单词,另一个带有偏移量。偏移文件的格式是这样的:前10位包含字大小,后22位包含偏移(字位置,例如,aaah将为0,abasementable将为4等)。它以big endian(java标准)编码。希望它对某人有所帮助。

word.dat:

aaahabasementableabnormalabnormalityabortionistabortion-rightsabracadabra

wordx.dat:

00 80 00 00 01 20 00 04 00 80 00 0D 01 00 00 11   _____ __________
01 60 00 19 01 60 00 24 01 E0 00 2F 01 60 00 3E   _`___`_$___/_`_>

我在C#中创建了这些文件,但这里是它的代码(它使用带有由crlfs分隔的单词的txt文件)

// Builds the two data files used by the binary search:
//   word.dat  - all words concatenated with no separators;
//   wordx.dat - one big-endian 32-bit entry per word: the top 10 bits hold the word's
//               length and the low 22 bits hold its offset inside word.dat.
// Length and offset are measured in BYTES of the encoded text, because the reader
// seeks and reads word.dat by byte position.
static void Main(string[] args)
{
    const string fIn = @"C:\projects\droid\WriteFiles\input\allwords.txt";
    const string fwordxOut = @"C:\projects\droid\WriteFiles\output\wordx.dat";
    const string fWordOut = @"C:\projects\droid\WriteFiles\output\word.dat";

    const int maxWordBytes = (1 << 10) - 1; // largest length that fits in 10 bits
    const int maxOffset = (1 << 22) - 1;    // largest offset that fits in 22 bits

    // UTF-8 without a byte order mark, so that offset 0 really is the first word.
    var encoding = new System.Text.UTF8Encoding(false);

    int offset = 0;
    var lines = File.ReadLines(fIn);

    using (FileStream stream = new FileStream(fwordxOut, FileMode.Create, FileAccess.ReadWrite))
    using (EndianBinaryWriter wwordxOut = new EndianBinaryWriter(EndianBitConverter.Big, stream))
    using (StreamWriter wWordOut = new StreamWriter(File.Open(fWordOut, FileMode.Create), encoding))
    {
        foreach (var line in lines)
        {
            int byteCount = encoding.GetByteCount(line);

            // Refuse to write an index entry that would silently corrupt its neighbouring bits.
            if (byteCount > maxWordBytes)
            {
                throw new System.InvalidOperationException(
                    "Word is " + byteCount + " bytes long; the index format allows at most " + maxWordBytes + ".");
            }
            if (offset > maxOffset)
            {
                throw new System.InvalidOperationException(
                    "Word offset " + offset + " does not fit in 22 bits; word.dat is too large for this index format.");
            }

            wWordOut.Write(line);

            // first 10 bits to the left is the word size, the remaining 22 bits the offset
            int entry = offset | (byteCount << 22);
            wwordxOut.Write(entry);

            offset += byteCount;
        }
    }
}

这是二进制文件搜索的Java代码:

/**
 * Looks up a hard-coded target word using the index file (wordx.dat) and the word file
 * (word.dat) on external storage, and logs whether it was found and how many probes it took.
 *
 * Each index entry is a big-endian int: the top 10 bits hold the word length and the low
 * 22 bits hold the word's offset inside word.dat.
 */
public static void binarySearch() {
    String TAG = "TEST";
    String wordFilePath = Environment.getExternalStorageDirectory().getAbsolutePath() + "/word.dat";
    String wordxFilePath = Environment.getExternalStorageDirectory().getAbsolutePath() + "/wordx.dat";

    String target = "abracadabra";
    boolean targetFound = false;
    int searchCount = 0;

    RandomAccessFile raf = null;
    RandomAccessFile rafWord = null;
    try {
        raf = new RandomAccessFile(wordxFilePath, "r");
        rafWord = new RandomAccessFile(wordFilePath, "r");

        // Inclusive range of index entries still under consideration (4 bytes per entry).
        long low = 0;
        long high = (raf.length() / 4) - 1;

        while (high >= low) {
            long mid = low + (high - low) / 2;
            raf.seek(mid * 4);
            int cur = raf.readInt();
            Log.v(TAG + "-cur", String.valueOf(cur));

            // Unsigned shift: a length of 512 or more sets the sign bit, and ">>" would make it negative.
            int len = cur >>> 22; //word length
            int wordPosition = cur & 0x3FFFFF; //low 22 bits: offset into word.dat

            rafWord.seek(wordPosition);
            byte[] bytes = new byte[len];
            rafWord.readFully(bytes); // read() alone may deliver fewer than len bytes
            Log.v(TAG + "-wordOffset", String.valueOf(len));

            searchCount++;

            // Decode with an explicit charset rather than the platform default.
            String str = new String(bytes, "UTF-8");
            Log.v(TAG, str);

            int order = target.compareTo(str);
            if (order < 0) {
                high = mid - 1;
            } else if (order == 0) {
                targetFound = true;
                break;
            } else {
                low = mid + 1;
            }
        }
    } catch (FileNotFoundException e) {
        Log.e(TAG, "Index or word file not found", e);
    } catch (IOException e) {
        Log.e(TAG, "I/O error during binary search", e);
    } finally {
        // Close both files even when the search failed part-way through.
        if (raf != null) {
            try {
                raf.close();
            } catch (IOException ignored) {
                // nothing useful can be done if closing a read-only file fails
            }
        }
        if (rafWord != null) {
            try {
                rafWord.close();
            } catch (IOException ignored) {
                // nothing useful can be done if closing a read-only file fails
            }
        }
    }

    if (targetFound) {
        Log.v(TAG + "-found " , String.valueOf(searchCount));
    } else {
        Log.v(TAG + "-not found " , String.valueOf(searchCount));
    }

}

答案 4 :(得分:0)

虽然听起来有点矫枉过正,但不要将数据作为平面文件存储起来。创建数据库并查询数据库中的数据。这应该既有效又快速。

答案 5 :(得分:0)

这是我认为有效的功能(在实践中使用此功能)。行可以有任何长度。您必须提供一个称为“ nav”的lambda来进行实际的行检查,因此您可以灵活地处理文件的顺序(区分大小写,不区分大小写,由特定字段排序等)。

import java.io.File;
import java.io.RandomAccessFile;

/**
 * Binary search for a line in a sorted text file whose lines may have any length.
 * Only the bytes around each probe position are read; the file is never loaded as a whole.
 */
class main {
    /**
     * Searches {@code file} for the line selected by {@code nav}.
     *
     * <p>{@code nav} takes a line and returns a negative value (move up), 0 (found) or a
     * positive value (move down). The line supplied to {@code nav} is stripped of the trailing
     * {@code \n}, but not of a {@code \r}. UTF-8 encoding is assumed.
     *
     * @param file the sorted text file to search
     * @param nav  decides, for a probed line, in which direction the search continues
     * @return pair(byte range of the line in the file, line); if no exact match is found, the
     *         line directly above the position where the match would be; {@code null} if there
     *         is no such line (empty file, or the target sorts before the first line)
     */
    static Pair<LongRange, String> binarySearchForLineInTextFile(File file, IF1<String, Integer> nav) {
        long length = l(file);
        int bufSize = 1024;
        RandomAccessFile raf = randomAccessFileForReading(file);
        try {
            long min = 0, max = length;
            int direction = 0;
            Pair<LongRange, String> possibleResult = null;
            while (min < max) {
                ping();
                long middle = min + (max - min) / 2;
                long lineStart = raf_findBeginningOfLine(raf, middle, bufSize);
                long lineEnd = raf_findEndOfLine(raf, middle, bufSize);

                String line = readLineWithoutTerminator(raf, lineStart, lineEnd);
                direction = nav.get(line);
                possibleResult = new Pair<LongRange, String>(new LongRange(lineStart, lineEnd), line);

                if (direction == 0) return possibleResult;
                // asserts are to assure that loop terminates
                if (direction < 0) max = assertLessThan(max, lineStart);
                else min = assertBiggerThan(min, lineEnd);
            }

            // Last probe said "move down" (or nothing was probed): that probe is the line above.
            if (direction >= 0) return possibleResult;

            // Last probe said "move up" and it was the first line: nothing lies above it.
            if (min <= 0) return null;

            // Otherwise the line above is the one that ends at min.
            long lineStart = raf_findBeginningOfLine(raf, min - 1, bufSize);
            String line = readLineWithoutTerminator(raf, lineStart, min);

            return new Pair<LongRange, String>(new LongRange(lineStart, min), line);
        } finally {
            _close(raf);
        }
    }

    /**
     * Reads the bytes in [lineStart, lineEnd) and decodes them as UTF-8, dropping one trailing
     * {@code \n} if present. The last line of a file may have no terminator, in which case
     * nothing is dropped.
     */
    private static String readLineWithoutTerminator(RandomAccessFile raf, long lineStart, long lineEnd) {
        byte[] raw = raf_readFilePart(raf, lineStart, (int) (lineEnd - lineStart));
        int contentLength = raw.length;
        if (contentLength > 0 && raw[contentLength - 1] == (byte) '\n') contentLength--;
        try {
            return new String(raw, 0, contentLength, "UTF-8");
        } catch (Exception __e) {
            throw rethrow(__e);
        }
    }

    /** Returns the length of {@code a}, or 0 for {@code null}. */
    static int l(byte[] a) {
        return a == null ? 0 : a.length;
    }

    /** Returns the length of file {@code f} in bytes, or 0 for {@code null}. */
    static long l(File f) {
        return f == null ? 0 : f.length();
    }

    /** Opens {@code path} read-only, converting checked exceptions to unchecked ones. */
    static RandomAccessFile randomAccessFileForReading(File path) {
        try {
            return new RandomAccessFile(path, "r");
        } catch (Exception __e) {
            throw rethrow(__e);
        }
    }

    // you can change this function to allow interrupting long calculations from the outside. just throw a RuntimeException.
    static boolean ping() {
        return true;
    }

    /**
     * Returns the offset of the first byte of the line containing position {@code pos}: the
     * offset just after the last {@code \n} located before {@code pos}, or 0 if there is none.
     * The file is scanned backwards in chunks of at most {@code bufSize} bytes.
     */
    static long raf_findBeginningOfLine(RandomAccessFile raf, long pos, int bufSize) {
        try {
            byte[] buf = new byte[bufSize];
            while (pos > 0) {
                long start = Math.max(pos - bufSize, 0);
                int count = (int) (pos - start);
                raf.seek(start);
                raf.readFully(buf, 0, count);
                // Only look at the bytes that were just read.
                for (int i = count - 1; i >= 0; i--)
                    if (buf[i] == (byte) '\n')
                        return start + i + 1;
                pos = start;
            }
            return 0;
        } catch (Exception __e) {
            throw rethrow(__e);
        }
    }

    /**
     * Returns the offset just after the first {@code \n} at or after {@code pos}, or the file
     * length if the rest of the file contains no {@code \n}. The file is scanned forwards in
     * chunks of at most {@code bufSize} bytes.
     */
    static long raf_findEndOfLine(RandomAccessFile raf, long pos, int bufSize) {
        try {
            byte[] buf = new byte[bufSize];
            long length = raf.length();
            while (pos < length) {
                int count = (int) Math.min(length - pos, bufSize);
                raf.seek(pos);
                raf.readFully(buf, 0, count);
                // Only look at the bytes that were just read.
                for (int i = 0; i < count; i++)
                    if (buf[i] == (byte) '\n')
                        return pos + i + 1;
                pos += bufSize;
            }
            return length;
        } catch (Exception __e) {
            throw rethrow(__e);
        }
    }

    /** Decodes {@code bytes} as UTF-8; {@code null} stays {@code null}. */
    static String fromUtf8(byte[] bytes) {
        try {
            return bytes == null ? null : new String(bytes, "UTF-8");
        } catch (Exception __e) {
            throw rethrow(__e);
        }
    }

    /** Reads exactly {@code l} bytes starting at offset {@code start}. */
    static byte[] raf_readFilePart(RandomAccessFile raf, long start, int l) {
        try {
            byte[] buf = new byte[l];
            raf.seek(start);
            raf.readFully(buf);
            return buf;
        } catch (Exception __e) {
            throw rethrow(__e);
        }
    }

    /** Returns {@code b} after checking that it is less than {@code a}. */
    static <A> A assertLessThan(A a, A b) {
        assertTrue(cmp(b, a) < 0);
        return b;
    }

    /** Returns {@code b} after checking that it is bigger than {@code a}. */
    static <A> A assertBiggerThan(A a, A b) {
        assertTrue(cmp(b, a) > 0);
        return b;
    }

    /** Closes {@code c} if it is not {@code null}, converting any failure to an unchecked exception. */
    static void _close(AutoCloseable c) {
        try {
            if (c != null)
                c.close();
        } catch (Throwable e) {
            throw rethrow(e);
        }
    }

    /**
     * Always throws: {@code t} itself if it is a RuntimeException, otherwise {@code t} wrapped in
     * one. The return type only exists so that callers can write {@code throw rethrow(e);}.
     */
    static RuntimeException rethrow(Throwable t) {
        throw t instanceof RuntimeException ? (RuntimeException) t : new RuntimeException(t);
    }

    /** Returns the last index of {@code b} in {@code a}, or -1 if absent. */
    static int lastIndexOf_byteArray(byte[] a, byte b) {
        for (int i = l(a) - 1; i >= 0; i--)
            if (a[i] == b)
                return i;
        return -1;
    }

    /** Returns the first index of {@code b} in {@code a}, or -1 if absent. */
    static int indexOf_byteArray(byte[] a, byte b) {
        int n = l(a);
        for (int i = 0; i < n; i++)
            if (a[i] == b)
                return i;
        return -1;
    }

    /** Throws a RuntimeException with message "oops" unless {@code b} is true. */
    static boolean assertTrue(boolean b) {
        if (!b)
            throw fail("oops");
        return b;
    }

    /** Null-safe comparison; {@code null} sorts before everything else. */
    @SuppressWarnings("unchecked")
    static int cmp(Object a, Object b) {
        if (a == null) return b == null ? 0 : -1;
        if (b == null) return 1;
        // The cast fails with ClassCastException if a is not Comparable.
        return ((Comparable<Object>) a).compareTo(b);
    }

    /** Always throws a RuntimeException carrying {@code msg} (empty if {@code null}). */
    static RuntimeException fail(String msg) {
        throw new RuntimeException(msg == null ? "" : msg);
    }

    /** A pair of long offsets, printed as {@code [start;end]}. */
    final static class LongRange {
        long start, end;

        LongRange(long start, long end) {
            this.end = end;
            this.start = start;
        }

        @Override
        public String toString() {
            return "[" + start + ";" + end + "]";
        }
    }

    /** A function of one argument. */
    interface IF1<A, B> {
        B get(A a);
    }

    /** A simple mutable pair, printed as {@code <a, b>}. */
    static class Pair<A, B> {
        A a;
        B b;

        Pair(A a, B b) {
            this.b = b;
            this.a = a;
        }

        @Override
        public String toString() {
            return "<" + a + ", " + b + ">";
        }
    }
}