我已经统计出每个词元(token)的POS标签频率,例如(Np、JJ ……)。我怎样才能得到POS标签的bigram和trigram分布?(我使用的是Stanford POS Tagger的Java版本,http://nlp.stanford.edu/software/tagger.shtml)unigram的代码如下:
// Unigram POS-tag counting: tag a sample sentence, then count how many tokens
// fall into each coarse part-of-speech category.
MaxentTagger tagger = new MaxentTagger("taggers/english-left3words-distsim.tagger");
// The sample string
String sample = "This is a sample text";
// The tagged string. The loop below expects space-separated tokens whose tag
// follows the last underscore (word_TAG).
// Fix: tag `sample` (the original passed an undefined `s`) and declare `tagged` only once.
final String tagged = tagger.tagString(sample);

// One counter per coarse category; several Penn Treebank tags may map to the same counter.
int nouns = 0;
int adjectives = 0;
int adverbs = 0;
int verbs = 0;
int cd = 0; // Cardinal number
int preposition = 0;
int fw = 0; // Foreign word
int particle = 0;
int symbol = 0;
int conjuction = 0;
int Determiner = 0;
int interrogative = 0;
int prp$ = 0; // Possessive pronoun

final String[] tokens = tagged.split(" ");
for (final String token : tokens) {
    // Take everything after the LAST underscore so that words which themselves
    // contain '_' still yield the tag. If there is no underscore, lastIndexOf
    // returns -1 and the whole token is used.
    final int lastUnderscoreIndex = token.lastIndexOf("_");
    final String realToken = token.substring(lastUnderscoreIndex + 1);

    if ("NN".equals(realToken) || "NNS".equals(realToken) || "NNP".equals(realToken) || "NNPS".equals(realToken)) {
        nouns++;
    }
    // Fix: the original tested "JJR" twice and never counted the superlative tag "JJS".
    if ("JJ".equals(realToken) || "JJR".equals(realToken) || "JJS".equals(realToken)) {
        adjectives++;
    }
    if ("RB".equals(realToken) || "RBS".equals(realToken) || "RBR".equals(realToken)) {
        adverbs++;
    }
    if ("VB".equals(realToken) || "VBD".equals(realToken) || "VBG".equals(realToken)
            || "VBN".equals(realToken) || "VBP".equals(realToken) || "VBZ".equals(realToken)) {
        verbs++;
    }
    if ("CD".equals(realToken)) {
        cd++;
    }
    if ("IN".equals(realToken) || "TO".equals(realToken)) {
        preposition++;
    }
    if ("RP".equals(realToken)) {
        particle++;
    }
    if ("SYM".equals(realToken)) {
        symbol++;
    }
    if ("CC".equals(realToken)) {
        conjuction++;
    }
    if ("DT".equals(realToken)) {
        Determiner++;
    }
    if ("WDT".equals(realToken) || "WP".equals(realToken) || "WRB".equals(realToken)) {
        interrogative++;
    }
    if ("FW".equals(realToken)) {
        fw++;
    }
    if ("PRP$".equals(realToken)) {
        prp$++;
    }
}