我使用 tm 包在 R 中创建了一个语料库,并指定了语言和编码,如下所示:
# Build a tm corpus from the character vector de_DE.sample, passing language and
# encoding hints through readerControl.
# NOTE(review): the encoding is spelled "UTF_8" (underscore) here but 'UTF-8'
# (hyphen) in the DocumentTermMatrix call below -- confirm which one is intended.
de_DE.corpus <- Corpus(VectorSource(de_DE.sample), readerControl
= list(language="de_DE",encoding = "UTF_8"))
# Print the content of document 36.
de_DE.corpus[36]$content
# Build the document-term matrix from the corpus, passing an encoding in control.
de_DE.dtm <- DocumentTermMatrix(de_DE.corpus,control = list
(encoding = 'UTF-8'))
# Inspect only the columns whose term name matches the pattern below.
inspect(de_DE.dtm[, grepl("grÃ", de_DE.dtm$dimnames$Terms)])
# Inspect the row of the matrix that belongs to document 36.
inspect(de_DE.dtm[36, ])
如果我查看文档36的内容(de_DE.corpus[36]$content),
可以看到包含“ü”的文本显示正确。例如“......单身就这样死 Begründung der Behörde Eine......”
但是当我创建 DocumentTermMatrix 时(我尝试了多种编码和语言选项),像“Begründung”这样的词会变成“begrÔ之类的乱码。
下面是执行 inspect(de_DE.dtm[36, ]) 后的结果:
<<DocumentTermMatrix (documents: 1, terms: 21744)>>
Non-/sparse entries: 102/21642
Sparsity : 100%
Maximal term length: 43
Weighting : term frequency (tf)
Sample :
Terms
Docs begrà das dem der die eine einen jobcenter und zum
36 3 4 2 4 8 2 2 4 3 3
如果有人知道如何解决问题,我将不胜感激。在此先感谢:)
答案 0 :(得分:0)
您能检查一下输入数据吗?您的代码在我这里运行正常,所以我认为问题出在把数据加载到 de_DE.sample 的环节。
import java.net.URI;
import javafx.application.Application;
import javafx.concurrent.Worker;
import javafx.scene.Scene;
import javafx.scene.control.Label;
import javafx.scene.control.TextField;
import javafx.scene.layout.AnchorPane;
import javafx.scene.layout.BorderPane;
import javafx.scene.web.WebEngine;
import javafx.scene.web.WebView;
import javafx.stage.Stage;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.events.Event;
import org.w3c.dom.events.EventListener;
import org.w3c.dom.events.EventTarget;
import org.w3c.dom.html.HTMLIFrameElement;
/**
 * Minimal JavaFX browser that mimics a desktop browser's status bar: while the mouse hovers
 * over a link inside the {@link WebView}, the link's target is shown in a label anchored to
 * the bottom-left corner; the label is hidden again when the mouse leaves the element.
 *
 * <p>The current location of the web engine is mirrored in a text field above the page.
 */
public class getWeblinksInCorner extends Application {

    /** Handles DOM {@code mouseover} events by showing the hovered link's target. */
    EventListener mouseOverEventListener;

    /** Handles DOM {@code mouseout} events by hiding the link label again. */
    EventListener mouseOutEventListener;

    /**
     * Builds the scene (location field on top, web view with an overlaid link label below),
     * starts loading the start page and installs the hover listeners after every successful
     * page load.
     *
     * @param stage the primary stage supplied by the JavaFX runtime
     */
    @Override
    public void start(final Stage stage) {
        final WebView browser = new WebView();
        final WebEngine webEngine = browser.getEngine();
        final Label textField = new Label();
        textField.setVisible(false);

        // The listeners only depend on the label, so they are created once and the same
        // instances are registered on every document that gets loaded.
        mouseOverEventListener = ev -> showLinkTarget(ev, textField);
        mouseOutEventListener = ev -> textField.setVisible(false);

        webEngine.getLoadWorker().stateProperty().addListener((observable, oldValue, newValue) -> {
            if (newValue == Worker.State.SUCCEEDED) {
                attachLinkListeners(webEngine.getDocument());
            }
        });

        String content = "http://java2s.com";
        webEngine.load(content);

        // Read-only "address bar" that follows the engine's current location.
        TextField tf = new TextField();
        tf.textProperty().bind(webEngine.locationProperty());

        // The web view fills the pane; the label floats over it in the bottom-left corner.
        AnchorPane ap = new AnchorPane(browser, textField);
        AnchorPane.setBottomAnchor(browser, 0.0);
        AnchorPane.setLeftAnchor(browser, 0.0);
        AnchorPane.setTopAnchor(browser, 0.0);
        AnchorPane.setRightAnchor(browser, 0.0);
        AnchorPane.setBottomAnchor(textField, 0.0);
        AnchorPane.setLeftAnchor(textField, 0.0);

        BorderPane pane = new BorderPane();
        pane.setCenter(ap);
        pane.setTop(tf);

        stage.setScene(new Scene(pane));
        stage.show();
    }

    /**
     * Registers the hover listeners on the root element of {@code document} and, recursively,
     * on the documents of all iframes it contains.
     *
     * <p>{@code mouseover} and {@code mouseout} bubble, so a single listener pair on the root
     * element receives the events of every descendant exactly once. This also covers links
     * that contain child elements, where the event target may be either the link itself or
     * one of its children.
     *
     * @param document the document to instrument; {@code null} (for example an iframe whose
     *     content document is not available) is ignored
     */
    private void attachLinkListeners(Document document) {
        if (document == null) {
            return;
        }

        Element root = document.getDocumentElement();
        if (root instanceof EventTarget) {
            EventTarget rootTarget = (EventTarget) root;
            rootTarget.addEventListener("mouseover", mouseOverEventListener, false);
            rootTarget.addEventListener("mouseout", mouseOutEventListener, false);
        }

        // Events inside an iframe do not reach the embedding document, so every frame
        // document needs its own listener pair.
        NodeList frames = document.getElementsByTagName("iframe");
        for (int i = 0; i < frames.getLength(); i++) {
            Node frame = frames.item(i);
            if (frame instanceof HTMLIFrameElement) {
                attachLinkListeners(((HTMLIFrameElement) frame).getContentDocument());
            }
        }
    }

    /**
     * Shows the target of the link under the mouse in {@code label}. Does nothing when the
     * event target is not an element or is not inside an element with a non-empty
     * {@code href}.
     *
     * @param ev the DOM {@code mouseover} event
     * @param label the label that displays the link target
     */
    private static void showLinkTarget(Event ev, Label label) {
        if (!(ev.getTarget() instanceof Element)) {
            return;
        }
        Element target = (Element) ev.getTarget();
        String href = getNextHref(target);
        if (href == null || href.isEmpty()) {
            return;
        }
        if (href.startsWith("/")) {
            href = resolveHref(target.getBaseURI(), href);
        }
        label.setText(href + " ");
        label.setPrefHeight(22);
        label.setMaxWidth(650);
        label.setStyle("-fx-border-color: #C6C6C7; -fx-background-color: #F2F2F2;");
        label.setVisible(true);
    }

    /**
     * Returns the {@code href} of {@code target} or of its nearest ancestor element that has
     * one.
     *
     * @param target the element the search starts at
     * @return the attribute value, or an empty string when no element on the path to the
     *     document root carries an {@code href} attribute
     */
    private static String getNextHref(Element target) {
        Element current = target;
        while (current != null) {
            // hasAttribute works whether a missing attribute is reported as null or as "".
            if (current.hasAttribute("href")) {
                return current.getAttribute("href");
            }
            // The parent of the root element is the document, which is not an Element.
            Node parent = current.getParentNode();
            current = parent instanceof Element ? (Element) parent : null;
        }
        return "";
    }

    /**
     * Resolves {@code href} against {@code baseUri} following the URI resolution rules, so
     * that a root-relative reference such as {@code /foo} replaces the path of the base
     * instead of being appended to it.
     *
     * @param baseUri the base URI of the hovered element; may be {@code null}
     * @param href the reference found in the {@code href} attribute
     * @return the resolved reference, or {@code href} unchanged when there is no base URI or
     *     either string cannot be parsed as a URI
     */
    private static String resolveHref(String baseUri, String href) {
        if (baseUri == null || baseUri.isEmpty()) {
            return href;
        }
        try {
            return URI.create(baseUri).resolve(href).toString();
        } catch (IllegalArgumentException ex) {
            // Unparsable input (e.g. unescaped spaces): showing the raw href is still useful.
            return href;
        }
    }

    /**
     * Launches the JavaFX application.
     *
     * @param args command-line arguments, passed through to {@link Application#launch}
     */
    public static void main(String[] args) {
        launch(args);
    }
}