创建标记化程序

时间:2016-10-16 10:21:30

标签: java tokenize stringtokenizer

我需要帮助。我必须编写一个分词(tokenize)程序。我加载一个文本文件并将其拆分为标记,但我还需要显示每个单词在文本文件中的起始位置、结束位置以及单词长度。非常感谢你的帮助。在过去的3天里,我一直在尝试实现它,但没有成功,下面是我目前写的代码:

import java.util.StringTokenizer;
import java.io.*;

/**
 * Splits a text file into whitespace-separated tokens and reports, for every
 * token, its start index, end index and length within the line it appears on.
 *
 * <p>The report is printed to standard output and also written to the result
 * file, one token per line.
 */
public class Tokenizer1 {

    /** File that is tokenized. */
    private static final String INPUT_PATH = "C://text.txt";

    /** File that receives the same report that is printed to standard output. */
    private static final String OUTPUT_PATH = "C://result.txt";

    /** Characters that separate tokens: space, tab, line breaks and form feed. */
    private static final String DELIMITERS = " \t\n\r\f";

    /**
     * Reads the input file line by line and reports every token of every line.
     *
     * @param args ignored
     * @throws FileNotFoundException if the input file does not exist
     * @throws IOException if reading the input or writing the result fails
     */
    public static void main(String[] args) throws FileNotFoundException, IOException {
        // try-with-resources closes (and thereby flushes) both files even when
        // an exception is thrown half-way through.
        try (BufferedReader br = new BufferedReader(new FileReader(INPUT_PATH));
             PrintWriter pw = new PrintWriter(new FileWriter(OUTPUT_PATH))) {
            String line;
            int lineNumber = 0;
            // readLine() returns null at end of file, so an empty file simply
            // produces an empty report.
            while ((line = br.readLine()) != null) {
                lineNumber++;
                for (String report : describeTokens(line)) {
                    String output = "line: " + lineNumber + ", " + report;
                    System.out.println(output);
                    pw.println(output);
                }
            }
        }
    }

    /**
     * Describes every token of {@code line}.
     *
     * <p>Package-private so that it can be exercised without touching the file system.
     *
     * @param line the text to tokenize; must not be {@code null}
     * @return one entry per token, in order of appearance, of the form
     *         {@code token: W, tokenStartIndex: S, tokenEndIndex: E, tokenLength: L}
     *         where {@code S} and {@code E} are the zero-based indices of the
     *         token's first and last character in {@code line}
     */
    static String[] describeTokens(String line) {
        StringTokenizer st = new StringTokenizer(line, DELIMITERS);
        String[] reports = new String[st.countTokens()];
        // Index just past the previous token. Only delimiters lie between this
        // index and the next token, so the first match found from here on is
        // exactly that token, even if the same word occurred earlier.
        int searchFrom = 0;
        for (int i = 0; i < reports.length; i++) {
            String token = st.nextToken();
            int start = line.indexOf(token, searchFrom);
            int end = start + token.length() - 1;
            reports[i] = "token: " + token
                    + ", tokenStartIndex: " + start
                    + ", tokenEndIndex: " + end
                    + ", tokenLength: " + token.length();
            searchFrom = start + token.length();
        }
        return reports;
    }
}

1 个答案:

答案 0 :(得分:0)

如果您不需要逐行处理文件,请尝试使用以下代码:

/**
 * Reads the whole of {@code C:/text.txt} into memory, echoes it, and then lists
 * every token together with its start index (zero-based character offset in the
 * file's text) and its length.
 *
 * <p>The token list is produced twice to demonstrate two equivalent approaches:
 * {@link String#split(String)} and {@link StringTokenizer}. Both treat commas
 * and whitespace as delimiters.
 *
 * @param args ignored
 * @throws FileNotFoundException if the input file does not exist
 * @throws IOException if reading the file fails
 */
public static void main(String[] args) throws FileNotFoundException, IOException {
    StringBuilder sb = new StringBuilder();

    // Decode the bytes as UTF-8 text instead of casting each byte to a char,
    // which would corrupt every non-ASCII character. try-with-resources closes
    // the file even if reading fails.
    try (BufferedReader in = new BufferedReader(
            new InputStreamReader(new FileInputStream("C:/text.txt"), "UTF-8"))) {
        int c;
        while ((c = in.read()) != -1) {
            sb.append((char) c);
        }
    }
    String text = sb.toString();

    System.out.println(text);
    System.out.println("---------------------");

    // Index just past the previously reported token. Only delimiters lie between
    // it and the next token, so indexOf from here finds exactly that token and
    // the reported START accounts for the delimiters that were skipped.
    int cursor = 0;

    // OPTION 1: using String.split method
    for (String t : text.split("[\\s,]+")) {
        if (t.isEmpty()) {
            // split() yields a leading "" when the text starts with a delimiter
            // (or is empty); that is not a word.
            continue;
        }
        int start = text.indexOf(t, cursor);
        System.out.println("START: " + start + "\tLENGTH: " + t.length() + "\tWORD: " + t);
        cursor = start + t.length();
    }

    cursor = 0;

    // OPTION 2: using StringTokenizer class (space added to the delimiters so
    // that both options split on spaces as well as commas, tabs and line breaks)
    StringTokenizer st = new StringTokenizer(text, " ,\t\n\f\r");
    while (st.hasMoreTokens()) {
        String next = st.nextToken();
        int start = text.indexOf(next, cursor);
        System.out.println("START: " + start + "\tLENGTH: " + next.length() + "\tWORD: " + next);
        cursor = start + next.length();
    }
}

如果你需要逐行处理文件,你可能想尝试这个:

/**
 * Reads {@code C:/text.txt} line by line and, for each line, lists every token
 * together with its start index (zero-based character offset within that line)
 * and its length. Commas and whitespace are treated as delimiters.
 *
 * <p>Line numbers in the output start at 0.
 *
 * @param args ignored
 * @throws FileNotFoundException if the input file does not exist
 * @throws IOException if reading the file fails
 */
public static void main(String[] args) throws FileNotFoundException, IOException {
    // try-with-resources closes the reader even if reading fails.
    try (BufferedReader br = new BufferedReader(new FileReader("C:/text.txt"))) {
        String line;
        int lineNumber = -1;
        while ((line = br.readLine()) != null) {
            ++lineNumber;
            System.out.println("\nLINE: " + lineNumber);

            // Index just past the previously reported token of this line. Only
            // delimiters lie between it and the next token, so indexOf from here
            // finds exactly that token and START includes the skipped delimiters.
            int cursor = 0;

            // Tokenize the current line only; tokenizing an accumulated buffer
            // would repeat earlier lines and fuse words across line boundaries.
            // (line.split("[\\s,]+") is an alternative, but it can yield a
            // leading empty string that has to be skipped.)
            StringTokenizer st = new StringTokenizer(line, " ,\t\n\f\r");
            while (st.hasMoreTokens()) {
                String next = st.nextToken();
                int start = line.indexOf(next, cursor);
                System.out.println("\tSTART: " + start + "\tLENGTH: " + next.length() + "\tWORD: " + next);
                cursor = start + next.length();
            }
        }
    }
}

我希望这会有所帮助。