Question

我编写了以下方法来确定有问题的文件是用DOS / MAC格式化还是用UNIX行结尾格式化。

我至少看到一个明显的问题： 1.我希望我会在第一次运行时获得EOL，比如说在前1000字节内。这可能会也可能不会发生。

我要求您对此进行审核并提出改进建议，以便加强代码并使其更具通用性。

谢谢。

new FileFormat().discover(fileName, 0, 1000);

然后

public void discover(String fileName, int offset, int depth) throws IOException {

    BufferedInputStream in = new BufferedInputStream(new FileInputStream(fileName));
    FileReader a = new FileReader(new File(fileName));

    byte[] bytes = new byte[(int) depth];
    in.read(bytes, offset, depth);

    a.close();
    in.close();
    int thisByte;
    int nextByte;

    boolean isDos = false;
    boolean isUnix = false;
    boolean isMac = false;

    for (int i = 0; i < (bytes.length - 1); i++) {
        thisByte = bytes[i];
        nextByte = bytes[i + 1];
    if (thisByte == 10 && nextByte != 13) {
            isDos = true;
            break;
        } else if (thisByte == 13) {
            isUnix = true;
            break;
        } else if (thisByte == 10) {
            isMac = true;
            break;
        }
    }
    if (!(isDos || isMac || isUnix)) {
            discover(fileName, offset + depth, depth + 1000);
    } else {
        // do something clever
    }
}

Answer 1

您的方法似乎不必要地复杂化。为什么不：

public class FileFormat {
    public enum FileType { WINDOWS, UNIX, MAC, UNKNOWN }

    private static final char CR = '\r';
    private static final char LF = '\n';

    public static FileType discover(String fileName) throws IOException {    

        Reader reader = new BufferedReader(new FileReader(fileName));
        FileType result = discover(reader);
        reader.close();
        return result;
    }

    private static FileType discover(Reader reader) throws IOException {
        int c;
        while ((c = reader.read()) != -1) {
            switch(c) {        
            case LF: return FileType.UNIX;
            case CR: {
                if (reader.read() == LF) return FileType.WINDOWS;
                return FileType.MAC;
            }
            default: continue;
            }
        }
        return FileType.UNKNOWN;
    }
}

将其置于静态方法中，然后可以调用并使用：

switch(FileFormat.discover(fileName) {
case WINDOWS: ...
case MAC: ...
case UNKNOWN: ...
}

Answer 2

这是一个粗略的实现，它基于一个简单的多数来猜测行结束类型，并在最坏的情况下回退到未知：

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.EnumMap;
import java.util.Map;
import java.util.Scanner;

class LineEndings
{
    private enum ExitState
    {
        SUCCESS, FAILURE;
    }

    public enum LineEndingType
    {
        DOS("Windows"), MAC("Mac OS Classic"), UNIX("Unix/Linux/Mac OS X"), UNKNOWN("Unknown");

        private final String name;
        private LineEndingType(String name)
        {
            this.name = name;
        }

        public String toString()
        {
            if (null == this.name) {
                return super.toString();
            }
            else {
                return this.name;
            }
        }
    }

    public static void main(String[] arguments)
    {
        ExitState exitState = ExitState.SUCCESS;

        File inputFile = getInputFile();

        if (null == inputFile) {
            exitState = ExitState.FAILURE;

            System.out.println("Error: No input file specified.");
        }
        else {
            System.out.println("Determining line endings for: " + inputFile.getName());

            try {
                LineEndingType lineEndingType = getLineEndingType(inputFile);

                System.out.println("Determined line endings: " + lineEndingType);
            }
            catch (java.io.IOException exception) {
                exitState = ExitState.FAILURE;

                System.out.println("Error: " + exception.getMessage());
            }
        }

        switch (exitState) {
        case SUCCESS:
            System.exit(0);
            break;
        case FAILURE:
            System.exit(1);
            break;
        }
    }

    private static File getInputFile()
    {
        File inputFile = null;
        Scanner stdinScanner = new Scanner(System.in);

        while (true) {
            System.out.println("Enter the input file name:");
            System.out.print(">> ");

            if (stdinScanner.hasNext()) {
                String inputFileName = stdinScanner.next();

                inputFile = new File(inputFileName);

                if (!inputFile.exists()) {
                    System.out.println("File not found.\n");
                }
                else if (!inputFile.canRead()) {
                    System.out.println("Could not read file.\n");
                }
                else {
                    break;
                }
            }
            else {
                inputFile = null;
                break;
            }
        }

        System.out.println();

        return inputFile;
    }

    private static LineEndingType getLineEndingType(File inputFile)
        throws java.io.IOException, java.io.FileNotFoundException
    {
        EnumMap<LineEndingType, Integer> lineEndingTypeCount =
            new EnumMap<LineEndingType, Integer>(LineEndingType.class);

        BufferedReader inputReader = new BufferedReader(new FileReader(inputFile));

        LineEndingType currentLineEndingType = null;

        while (inputReader.ready()) {
            int token = inputReader.read();

            if ('\n' == token) {
                currentLineEndingType = LineEndingType.UNIX;
            }
            else if ('\r' == token) {
                if (inputReader.ready()) {
                    int nextToken = inputReader.read();

                    if ('\n' == nextToken) {
                        currentLineEndingType = LineEndingType.DOS;
                    }
                    else {
                        currentLineEndingType = LineEndingType.MAC;
                    }
                }
            }

            if (null != currentLineEndingType) {
                incrementLineEndingType(lineEndingTypeCount, currentLineEndingType);

                currentLineEndingType = null;
            }
        }

        return getMostFrequentLineEndingType(lineEndingTypeCount);
    }

    private static void incrementLineEndingType(Map<LineEndingType, Integer> lineEndingTypeCount, LineEndingType targetLineEndingType)
    {
        Integer targetLineEndingCount = lineEndingTypeCount.get(targetLineEndingType);

        if (null == targetLineEndingCount) {
            targetLineEndingCount = 0;
        }
        else {
            targetLineEndingCount++;
        }

        lineEndingTypeCount.put(targetLineEndingType, targetLineEndingCount);
    }

    private static LineEndingType getMostFrequentLineEndingType(Map<LineEndingType, Integer> lineEndingTypeCount)
    {
        Integer maximumEntryCount = Integer.MIN_VALUE;

        Map.Entry<LineEndingType, Integer> mostFrequentEntry = null;

        for (Map.Entry<LineEndingType, Integer> entry : lineEndingTypeCount.entrySet()) {
            int entryCount = entry.getValue();

            if (entryCount > maximumEntryCount) {
                mostFrequentEntry = entry;
                maximumEntryCount = entryCount;
            }
        }

        if (null != mostFrequentEntry) {
            return mostFrequentEntry.getKey();
        }
        else {
            return LineEndingType.UNKNOWN;
        }
    }
}

Answer 3

这有很多错误。您需要更好地理解FileInputStream类。请注意，read无法保证读取您请求的所有字节。 offset是数组的偏移量，而不是文件。等等。

如何确定文件格式？ DOS / Unix的/ MAC

3 个答案: