DocumentTermMatrix中的问题与德语语料库

时间:2017-08-07 20:35:16

标签: r encoding utf-8 tm

我使用包tm指定语言和编码在R中创建了一个语料库,如下所示:

# Build a tm corpus from de_DE.sample (defined elsewhere) via VectorSource,
# passing a language and an encoding through readerControl.
# NOTE(review): the encoding is spelled "UTF_8" (underscore) here but 'UTF-8'
# (hyphen) in the DocumentTermMatrix call below - confirm which is intended.
de_DE.corpus <- Corpus(VectorSource(de_DE.sample), readerControl
    = list(language="de_DE",encoding = "UTF_8"))
# Show the content of document 36 to check how the raw text is displayed.
de_DE.corpus[36]$content
# Build the document-term matrix from the corpus.
de_DE.dtm <- DocumentTermMatrix(de_DE.corpus,control = list
    (encoding = 'UTF-8'))
# Inspect only the columns whose term matches the garbled sequence "grÃ".
inspect(de_DE.dtm[, grepl("grÃ", de_DE.dtm$dimnames$Terms)])
# Inspect the row that belongs to document 36.
inspect(de_DE.dtm[36, ])

如果我查看文档36的内容 de_DE.corpus[36]$content,带有“ü”的文本显示正确。例如“......单身就这样死 Begründung der Behörde Eine......”

但是当我创建 DocumentTermMatrix 时(我尝试了多种编码和语言选项),“Begründung”这样的词会变成类似“begrÃ”的乱码。执行 inspect(de_DE.dtm[36, ]) 后的结果如下。

<<DocumentTermMatrix (documents: 1, terms: 21744)>>

Non-/sparse entries: 102/21642

Sparsity : 100%

Maximal term length: 43

Weighting : term frequency (tf)

Sample :

Terms

Docs begrà das dem der die eine einen jobcenter und zum

36     3    4   2  4   8     2    2       4       3  3

如果有人知道如何解决问题,我将不胜感激。在此先感谢:)

1 个答案:

答案 0 :(得分:0)

您可以检查一下输入数据吗?您的代码在我这里运行正常。因此,我认为问题出在您把数据加载到 de_DE.sample 的时候。

import java.net.URI;
import javafx.application.Application;
import javafx.concurrent.Worker;
import javafx.scene.Scene;
import javafx.scene.control.Label;
import javafx.scene.control.TextField;
import javafx.scene.layout.AnchorPane;
import javafx.scene.layout.BorderPane;
import javafx.scene.web.WebEngine;
import javafx.scene.web.WebView;
import javafx.stage.Stage;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.events.Event;
import org.w3c.dom.events.EventListener;
import org.w3c.dom.events.EventTarget;
import org.w3c.dom.html.HTMLIFrameElement;

/**
 * Small JavaFX browser window that previews link targets.
 *
 * <p>A {@link WebView} loads {@code http://java2s.com}; a text field at the top mirrors the
 * engine's current location, and a label anchored to the bottom-left corner of the page shows
 * the {@code href} of the element under the mouse (or of its nearest ancestor that has one).
 */
public class getWeblinksInCorner extends Application {

    /** Listener that shows the link preview; attached to page elements by {@link #addListener}. */
    EventListener mouseOverEventListener;
    /** Listener that hides the link preview again. */
    EventListener mouseOutEventListener;

    /**
     * Builds the scene, creates the two hover listeners and starts loading the start page.
     *
     * @param stage the primary stage supplied by JavaFX
     */
    @Override
    public void start(final Stage stage) {
        final WebView browser = new WebView();
        final WebEngine webEngine = browser.getEngine();

        // The preview label never changes its look, so it is styled once here
        // instead of on every mouseover event.
        final Label linkLabel = new Label();
        linkLabel.setPrefHeight(22);
        linkLabel.setMaxWidth(650);
        linkLabel.setStyle("-fx-border-color: #C6C6C7; -fx-background-color: #F2F2F2;");
        linkLabel.setVisible(false);

        // One listener pair is shared by every document, so removeEventListener
        // in addListener always refers to the same instances that were added.
        mouseOverEventListener = ev -> showLinkPreview(ev, linkLabel);
        mouseOutEventListener = ev -> linkLabel.setVisible(false);

        webEngine.getLoadWorker().stateProperty().addListener((observable, oldValue, newValue) -> {
            if (newValue == Worker.State.SUCCEEDED) {
                Document document = webEngine.getDocument();
                if (document != null) {
                    addListener(document.getElementsByTagName("*"));
                }
            }
        });

        String content = "http://java2s.com";
        webEngine.load(content);

        TextField locationField = new TextField();
        locationField.textProperty().bind(webEngine.locationProperty());

        // The browser fills the whole pane; the label is pinned to the bottom-left corner.
        AnchorPane ap = new AnchorPane(browser, linkLabel);
        AnchorPane.setBottomAnchor(browser, 0.0);
        AnchorPane.setLeftAnchor(browser, 0.0);
        AnchorPane.setTopAnchor(browser, 0.0);
        AnchorPane.setRightAnchor(browser, 0.0);
        AnchorPane.setBottomAnchor(linkLabel, 0.0);
        AnchorPane.setLeftAnchor(linkLabel, 0.0);

        BorderPane pane = new BorderPane();
        pane.setCenter(ap);
        pane.setTop(locationField);

        Scene scene = new Scene(pane);
        stage.setScene(scene);
        stage.show();
    }

    /**
     * Shows the link target for the element that triggered {@code ev}. Does nothing when the
     * event target is not an element or no {@code href} is found on it or its ancestors.
     *
     * @param ev the DOM mouseover event
     * @param linkLabel the label that displays the link target
     */
    private void showLinkPreview(Event ev, Label linkLabel) {
        EventTarget target = ev.getTarget();
        if (!(target instanceof Element)) {
            return;
        }
        Element element = (Element) target;
        String href = getNextHref(element);
        if (href.isEmpty()) {
            return;
        }
        linkLabel.setText(resolveHref(element.getBaseURI(), href) + "   ");
        linkLabel.setVisible(true);
    }

    /**
     * Finds the closest {@code href} attribute, starting at {@code target} and walking up
     * through its element ancestors.
     *
     * <p>{@code hasAttribute} is used rather than comparing {@code getAttribute} with
     * {@code null}: the {@code org.w3c.dom.Element} contract returns an empty string for a
     * missing attribute, so a null check alone would end the walk on the first element.
     *
     * @param target the element to start from
     * @return the attribute value, or an empty string when no element on the path has one
     */
    private String getNextHref(Element target) {
        Node current = target;
        while (current instanceof Element) {
            Element element = (Element) current;
            if (element.hasAttribute("href")) {
                String value = element.getAttribute("href");
                return value == null ? "" : value;
            }
            // The parent of the root element is the document (not an Element), which ends the loop.
            current = element.getParentNode();
        }
        return "";
    }

    /**
     * Resolves {@code href} against {@code baseUri} following URI reference rules, so that
     * root-relative, path-relative and fragment-only links are all shown as absolute URLs.
     *
     * @param baseUri base URI of the element that carries the link; may be {@code null}
     * @param href the raw attribute value
     * @return the resolved URI, or {@code href} unchanged when either value is not a valid URI
     */
    private static String resolveHref(String baseUri, String href) {
        if (baseUri == null || baseUri.isEmpty()) {
            return href;
        }
        try {
            return URI.create(baseUri).resolve(href).toString();
        } catch (IllegalArgumentException invalidUri) {
            // Real-world hrefs are often not valid URIs (spaces, templates); show them as written.
            return href;
        }
    }

    /**
     * Attaches the hover listeners to every node in {@code nodeList}. For each node the
     * listeners are first removed from all of its element ancestors, then added to the node
     * itself. An iframe with an accessible content document is not given listeners; its
     * document's elements are processed recursively instead.
     *
     * @param nodeList the nodes to process
     */
    private void addListener(NodeList nodeList) {
        for (int i = 0; i < nodeList.getLength(); i++) {
            Node node = nodeList.item(i);
            if (node == null) {
                continue;
            }

            Document frameDocument = frameDocumentOf(node);
            if (frameDocument != null) {
                addListener(frameDocument.getElementsByTagName("*"));
                continue;
            }

            for (Node ancestor = node.getParentNode();
                    ancestor instanceof Element;
                    ancestor = ancestor.getParentNode()) {
                detachHoverListeners(ancestor);
            }
            attachHoverListeners(node);
        }
    }

    /**
     * Returns the content document of {@code node} when it is an iframe that exposes one.
     *
     * @param node the node to examine
     * @return the iframe's document, or {@code null} when the node is not an iframe or its
     *     document is unavailable
     */
    private Document frameDocumentOf(Node node) {
        if (!(node instanceof HTMLIFrameElement)) {
            return null;
        }
        try {
            return ((HTMLIFrameElement) node).getContentDocument();
        } catch (RuntimeException unavailable) {
            // Best effort: an iframe whose document cannot be read is handled like any other element.
            return null;
        }
    }

    /** Adds the mouseover/mouseout listeners to {@code node} if it can receive DOM events. */
    private void attachHoverListeners(Node node) {
        if (node instanceof EventTarget) {
            EventTarget eventTarget = (EventTarget) node;
            eventTarget.addEventListener("mouseover", mouseOverEventListener, false);
            eventTarget.addEventListener("mouseout", mouseOutEventListener, false);
        }
    }

    /** Removes the mouseover/mouseout listeners from {@code node} if it can receive DOM events. */
    private void detachHoverListeners(Node node) {
        if (node instanceof EventTarget) {
            EventTarget eventTarget = (EventTarget) node;
            eventTarget.removeEventListener("mouseover", mouseOverEventListener, false);
            eventTarget.removeEventListener("mouseout", mouseOutEventListener, false);
        }
    }

    /**
     * Launches the JavaFX application.
     *
     * @param args command-line arguments passed on to {@code launch}
     */
    public static void main(String[] args) {
        launch(args);
    }
}