使用JavaFX生成的HTML中的img标记为空

时间:2016-10-21 16:01:35

标签: javascript html ajax javafx webengine

我正在尝试使用 JavaFX 框架获取 flipkart 网站搜索结果页的 HTML。下面的代码基本可以正常获取 HTML 输出,只是有一个小问题。顺便说一句,我用这份生成的 HTML 来抓取 flipkart 网站的数据。问题是:在生成的 HTML 页面中,大多数产品的 img 标签里都没有对应的 jpeg 图片地址。我观察到,无论搜索什么产品,只有列表中前 3 到 6 个产品的 img 标签带有 src,其余的都为空。

带有图片地址(src 属性)的 img 标签:

<DIV class="_3BTv9X" style="height: 240px; width:200px;">
    <IMG class="_1Nyybr _30XEf0" alt="" src="https://rukminim1.flixcart.com/image/312/312/mobile/d/f/w/motorola-moto-e3-power-pa4c0009in-original-imaemj7xpcfhnu8r.jpeg?q=70"/>
</DIV>

没有图片地址(src 属性)的 img 标签:

<DIV class="_3BTv9X" style="height: 240px; width: 200px;">
    <IMG class="_1Nyybr" alt=""/>
</DIV>

以下是生成 HTML 的 Java 程序。希望有人能帮我弄清楚为什么这些 img 标签是空的,以及如何获取带有完整 img 标签(包含 src)的 HTML 页面。

/**
 * JavaFX application that loads a Flipkart search page in a {@link WebEngine},
 * serializes the resulting DOM and prints it to standard output, then exits.
 *
 * <p>The stage is configured but never shown; the {@link WebView} is used only
 * as a host for the web engine.
 */
public class Main extends Application {

    /**
     * Starts loading the search page and dumps its DOM once loading finishes.
     *
     * <p>The application terminates after the dump, and also when the load
     * fails or is cancelled, so that the process cannot hang without a window.
     *
     * @param stage primary stage supplied by the JavaFX runtime
     * @throws Exception declared by {@link Application#start(Stage)}
     */
    @Override
    public void start(Stage stage) throws Exception {
        stage.setTitle("HTML");
        stage.setWidth(500);
        stage.setHeight(500);

        final WebView browser = new WebView();
        final WebEngine webEngine = browser.getEngine();

        // Register the listener before starting the load so no state change can be missed.
        webEngine.getLoadWorker().stateProperty().addListener((observable, oldState, newState) -> {
            if (newState == Worker.State.SUCCEEDED) {
                // NOTE(review): the DOM is captured as soon as the load worker succeeds.
                // Any <img src> that the page's scripts fill in afterwards will presumably
                // still be empty in this snapshot - confirm against the page's behavior.
                try {
                    ByteArrayOutputStream b = new ByteArrayOutputStream();
                    printDocument(webEngine.getDocument(), b);
                    // Decode with the same charset printDocument used to encode.
                    // The resulting HTML string can be handed to FlipkartScrape(String).
                    System.out.println(b.toString("UTF-8"));
                } catch (Exception e) {
                    System.out.println("Caught Exception");
                    // Report the cause instead of silently discarding it.
                    System.err.println("Failed to serialize document: " + e);
                } finally {
                    Platform.exit();
                }
            } else if (newState == Worker.State.FAILED || newState == Worker.State.CANCELLED) {
                // Without this branch a failed load would leave the application running forever.
                System.err.println("Page load did not complete (" + newState + "): "
                        + webEngine.getLoadWorker().getException());
                Platform.exit();
            }
        });

        webEngine.load("https://www.flipkart.com/search?q=Motorola&otracker=start&as-show=on&as=off");
    }

    /**
     * Serializes a DOM document to the given stream as indented UTF-8 XML
     * without an XML declaration.
     *
     * @param doc document to serialize
     * @param out destination stream; it is flushed but not closed
     * @throws IOException if the character encoding is unsupported or flushing fails
     * @throws TransformerException if the transformer cannot be created or the transformation fails
     */
    public static void printDocument(Document doc, OutputStream out) throws IOException, TransformerException {
        TransformerFactory tf = TransformerFactory.newInstance();
        Transformer transformer = tf.newTransformer();
        transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
        transformer.setOutputProperty(OutputKeys.METHOD, "xml");
        transformer.setOutputProperty(OutputKeys.INDENT, "yes");
        transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
        // Implementation-specific (Apache XSLT namespace) property controlling the indent width.
        transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4");

        OutputStreamWriter writer = new OutputStreamWriter(out, "UTF-8");
        transformer.transform(new DOMSource(doc), new StreamResult(writer));
        // Make sure everything buffered in the encoder reaches the underlying stream.
        writer.flush();
    }

    /**
     * Launches the JavaFX application.
     *
     * @param args command-line arguments forwarded to {@link Application#launch(String...)}
     */
    public static void main(String[] args) {
        launch(args);
    }

    /**
     * Parses the given HTML with jsoup and prints, for every {@code a[title]} element
     * that has the expected following siblings, its title, the text of a related
     * sibling element and the {@code src} of nearby {@code img[class]} elements.
     *
     * <p>Anchors whose surrounding structure does not match (missing siblings or
     * ancestors) are skipped.
     *
     * @param html HTML markup to analyze
     */
    public void FlipkartScrape(String html) {
        org.jsoup.nodes.Document doc = Jsoup.parse(html);
        Elements anchors = doc.select("a[title]");
        for (Element anchor : anchors) {
            String title = anchor.attr("title");
            String href = anchor.attr("href");

            // The element of interest is the third sibling following the anchor.
            Element related = nthNextSibling(anchor, 3);
            if (related == null) {
                continue;
            }

            if (href.equalsIgnoreCase(related.attr("href"))) {
                System.out.println(title);
                System.out.println(related.text());
            } else {
                // Otherwise fall through to the sibling after it.
                related = related.nextElementSibling();
                if (related == null) {
                    continue;
                }
                System.out.println(title);
                System.out.println("TEXT" + related.text());
            }

            // Images are looked up in the sibling that follows the anchor's grandparent.
            Element parent = anchor.parent();
            if (parent == null) {
                continue;
            }
            Element grandParent = parent.parent();
            if (grandParent == null) {
                continue;
            }

            Element imageContainer = grandParent.nextElementSibling();
            if (imageContainer != null) {
                for (Element img : imageContainer.select("img[class]")) {
                    System.out.println("IMAGEHREF" + img.attr("src"));
                }
            }
        }
    }

    /**
     * Walks {@code steps} element siblings forward from {@code start}.
     *
     * @return the sibling reached, or {@code null} if the chain ends earlier
     */
    private static Element nthNextSibling(Element start, int steps) {
        Element current = start;
        for (int i = 0; i < steps && current != null; i++) {
            current = current.nextElementSibling();
        }
        return current;
    }
}

0 个答案:

没有答案