如何逐个下载所有URL并保存在不同的文件夹中

时间:2014-05-21 08:38:59

标签: java html csv selenium download

我有一个html文件,我保存了所有的URL(下载CSV文件的链接)。我想要一个工具/程序,必须逐个浏览每个URL并下载文件,然后将文件保存在指定的文件夹,它将写在同一个html文件中。

html文件是一个包含3列的表 文件名,文件位置和下载URL

在打开新窗口(target=_blank)后,Url会下载CSV文件。下载后,如果没有错误,它会自动关闭子窗口。

我尝试过自动化(Selenium使用java)

但是有一些挑战如下。

  1. 应该等到下载完成
  2. 有时网址可能会显示错误,在这种情况下,它应关闭子窗口并返回父窗口
  3. 我已经通过保持一个观察者来解决第一种情况,该观察者将每秒检查文件是否被下载(通过计算文件夹中的csv文件的数量)

    我可以切换到子窗口并检查是否有任何错误,但如果没有错误,我的驱动程序就会卡在那里。

    如何解决此问题

    用于检查子窗口中是否存在错误的代码

       public boolean foundError(FirefoxDriver driver) {
        System.out.println(browser.getWindowHandle() + "Parent" + parentHandle);
        String child = "";
        int numberOfWindows = 0;
        //return true;
        if (driver.getWindowHandles().size() > 1) {
            for (String winHandle : driver.getWindowHandles()) {
                numberOfWindows++;
    
                if (!parentHandle.equals(winHandle)) {
                    child = winHandle;
                    System.out.println("Child" + winHandle);
                }
    
            }
        }
        if (numberOfWindows > 1) {
            System.out.println("tostring1" + driver.toString());
            if (!parentHandle.equals(child)) {
                driver.switchTo().window(child);
            }
            System.out.println("Switched to child");
            Set set = driver.getWindowHandles();
            System.out.println("Number of windows=" + set.size());
            //  System.out.println("Number of windows="+set.size()+"driver url"+driver.getCurrentUrl());
            //  System.out.println("tostring2"+driver.toString());
            try {
                // WebDriverWait wait1 = new WebDriverWait(driver, 5);
    
                System.out.println("Body text" + driver.findElementByTagName("body").getText());/////////////////////////////Here driver will get stuck
    
                //System.out.println("text"+driver.findElementByClassName("body").toString());
                //  List<WebElement> elements=driver.findElementsByClassName("ErrorBody");elements.size()>0
                if (!driver.findElementByTagName("body").getText().equals("")) {
                    driver.close();
                    driver.switchTo().window(parentHandle);
                    return true;
    
                }
                System.out.println("No error");
                driver.switchTo().window(parentHandle);
                System.out.println("Switched to parent");
    
            } catch (Exception e) {
                System.out.println("Error Catch block page time out:" + e);
                driver.switchTo().window(parentHandle);
                return false;
                //  driver.switchTo().window(parentHandle);
            }
        }
    
        return false;
    
    }
    

1 个答案:

答案 0 :(得分:0)

我使用了不同的方法 使用Jsoup解析html文件并下载

    import java.io.File;
import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
/**
 *
 * @author nudanesh
 */
public class URLDownload {

    private Document doc;
    String url = "", folder, file;
    private final File sourceFile;
    int i = 1;
    int r = 1, c = 1;
    int anchorCol = 3;
    Library lib;

    URLDownload() {
        lib = new Library();
        sourceFile = new File("Download.html");
        try {

            doc = Jsoup.parse(sourceFile, "UTF-8");
        } catch (IOException ex) {
            Logger.getLogger(URLDownload.class.getName()).log(Level.SEVERE, null, ex);
        }
        //Elements links = doc.select("a[href]");
        Elements rows = doc.select("tr");
        System.out.println("Size=" + rows.size());
        for (Element row : rows) {


                Elements cols = row.getElementsByTag("td");
                c = 1;
                for (Element col : cols) {
                    System.out.println("Row"+r);
                    if (c == 1) {
                        file = col.text();//System.out.println("File in main"+file);
                    } else if (c == 2) {
                        folder = col.text();//System.out.println("Folder in main"+folder);
                    } else {
                        try {
                            url = col.getElementsByTag("a").attr("href");
                        } catch (Exception e) {
                            System.out.print("-");
                        }
                    }

                    c++;
                }
                if (!url.equals("")) {
                    lib.setLocation(file,folder);
                    lib.downloadFile(url);
                }
                url = "";

            i++;
            r++;
        }
    }

    public static void main(String arg[]) {

        new URLDownload();
    }
}

以下是Library类文件

    import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.file.Files;
import static java.nio.file.StandardCopyOption.REPLACE_EXISTING;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
/**
 *
 * @author nudanesh
 */
public class Library  {

    boolean downloaded = false;
    Thread t;
    int waitTime = 0;
    String baseLoc = "";
    int size = 1024, ByteWritten = 0;
    URL url;
    URLConnection uCon = null;
    String folderLoc = "", file = "firstFile.csv";
    File loc;
    private OutputStream outStream;
    private InputStream is=null;
    private byte[] buf;
    private int ByteRead;
    private int FolderInUrl = 4;
    private boolean rootFolder = true;
    private File resultFile;
    private FileOutputStream fileResult;
    private XSSFWorkbook workbookResult;
    private XSSFSheet sheetResult;
    private int updateExcelRowNum = -1;
    private int updateExcelColNum = -1;
    String date;
    private int waitLimit = 900000;

    Library() {
        /*System.out.print(Calendar.getInstance().toString());
         Date d=new Date();
         String date=d.toString();
         System.out.println(date);*/

        //t = new Thread(this);
       // t.start();

        date = new SimpleDateFormat("yyyy_MM_dd_HH_mm_ss").format(Calendar.getInstance().getTime());
        System.out.print(date);
        baseLoc = date + "/";
        WriteDataToExcel();
        baseLoc += "Business Reports/";
        createRowExcel(updateExcelRowNum);
        updateRowColExcel(updateExcelRowNum, updateExcelColNum, "Report Name");
        updateRowColExcel(updateExcelRowNum, updateExcelColNum, "Path");
        updateRowColExcel(updateExcelRowNum, updateExcelColNum, "Status");
        updateExcel();
    }

    public void setLocation(String a, String b) {
        file = a;
        file += ".csv";
        folderLoc = baseLoc + getFolderPath(b);

   // System.out.println("File Name: "+file);
        // System.out.println("Folder loc: "+folderLoc);
    }

    public String getFolderPath(String b) {
        String path = "";
        try {
            System.out.println("path" + b);
            path = b;
            // path = java.net.URLDecoder.decode(b, "UTF-8");
            String p[] = path.split("/");
            path = "";
            for (int i = FolderInUrl; i < p.length - 1; i++) {
                rootFolder = false;
                p[i] = removeSpacesAtEnd(p[i]);

                path = path + p[i] + "/";
            }

        } catch (Exception ex) {
            Logger.getLogger(Library.class.getName()).log(Level.SEVERE, null, ex);
        }
        return path;
    }

    public void downloadFile(String urlString) {
        // System.out.println("Started");
        try {
            url = new URL(urlString);
        } catch (MalformedURLException ex) {
            Logger.getLogger(Library.class.getName()).log(Level.SEVERE, null, ex);
        }
        try {
            loc = new File(folderLoc);
            if (!loc.exists()) {
                loc.mkdirs();
            }


            outStream = new BufferedOutputStream(new FileOutputStream(folderLoc + file));
            uCon = url.openConnection();
        uCon.setReadTimeout(waitLimit);
                is = uCon.getInputStream();
               downloaded=true;
            buf = new byte[size];

            while ((ByteRead = is.read(buf)) != -1) {
                System.out.println("while executing" + ByteRead);
                outStream.write(buf, 0, ByteRead);
                ByteWritten += ByteRead;
            }

            //System.out.println("Downloaded" + ByteWritten);
            resetCounters();
            createRowExcel(updateExcelRowNum);
            updateRowColExcel(updateExcelRowNum, updateExcelColNum, file);
            updateRowColExcel(updateExcelRowNum, updateExcelColNum, folderLoc);
            if (ByteWritten < 1000) {
                updateRowColExcel(updateExcelRowNum, updateExcelColNum, "Downloaded ");
            } else {
                updateRowColExcel(updateExcelRowNum, updateExcelColNum, "Downloaded ");
            }
            updateExcel();
        } catch (Exception e) {
            System.out.println("error catch" + e);
            resetCounters();
            createRowExcel(updateExcelRowNum);
            updateRowColExcel(updateExcelRowNum, updateExcelColNum, file);
            updateRowColExcel(updateExcelRowNum, updateExcelColNum, folderLoc);
            updateRowColExcel(updateExcelRowNum, updateExcelColNum, "Rejected the Download after waiting " + (waitLimit / 60000) + " minutes");
            updateExcel();
            waitTime = 0;
        } finally {
            try {
                System.out.println("Error in streams");
                if(downloaded)
                is.close();
                outStream.close();
                downloaded= false;
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    public void moveToFolder(String reportName, String path) {

        try {
            File repo = new File(folderLoc + "/" + reportName + ".csv");
            path = folderLoc + "/" + path;
            File pathFolder = new File(path);
            if (!pathFolder.exists()) {
                pathFolder.mkdirs();
            }
            pathFolder = new File(path + reportName + ".csv");
            System.out.println("Path=" + pathFolder.getAbsolutePath() + "\nReport path=" + repo.getAbsolutePath());
            System.out.println("Source" + repo.getAbsolutePath());

            //System.out.println("Status" + repo.renameTo(new File(pathFolder.getAbsolutePath())));
            System.out.println("Status" + Files.move(repo.toPath(), new File(pathFolder.getAbsolutePath()).toPath(), REPLACE_EXISTING));
//Files.

        } catch (Exception e) {
            System.out.println("error while moving" + e);
        }

    }

    public String changeSpecialCharacters(String report) {

        report = report.replaceAll(":", "_");
        return report;
    }

    public String removeSpacesAtEnd(String inputPath) {

        for (int i = inputPath.length() - 1; i >= 0; i--) {
            if (inputPath.charAt(i) != ' ') {
                break;
            } else {
                System.out.println("Before string is" + inputPath);
                inputPath = inputPath.substring(0, i);
                System.out.println("AFter string is" + inputPath);
            }
        }

        return inputPath;
    }

    public void WriteDataToExcel() {

        try {
            // file = new FileInputStream(new File("config.xlsx"));

            //   File resultFolder = new File("Results");
            //   if (resultFolder.exists()) {
            //       deleteDirectory(resultFolder);
            //   }
            // resultFolder.mkdirs();
            if (!new File(baseLoc).exists()) {
                new File(baseLoc).mkdirs();
            }
            resultFile = new File(baseLoc + "Reports info " + date + ".xlsx");
            System.out.println("Path" + resultFile.getAbsolutePath());
            resultFile.createNewFile();
            // rFilePath = resultFile.getAbsolutePath();

            fileResult = new FileOutputStream(resultFile);
        } catch (Exception e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }

        //Get the workbook instance for XLS file
        //      System.out.println("file success");
        XSSFWorkbook workbook = null;

        try {

            workbookResult = new XSSFWorkbook();
        } catch (Exception e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        System.out.println("Opening the browser");
        //Get first sheet from the workbook

        sheetResult = workbookResult.createSheet();
        //sheetResult.set
        //Get iterator to all the rows in current sheet

        //Get iterator to all cells of current row
        //ar.add(folderLocation);
        //  ar.add(firefoxProfileLocation);
    }

    public void updateExcel() {
        try {
            //fileResult.close();

            fileResult = new FileOutputStream(resultFile);
            workbookResult.write(fileResult);
            fileResult.close();
        } catch (Exception e) {
            System.out.println(e);
        }

    }

    public void createRowExcel(int num) {
        updateExcelRowNum++;
        num = updateExcelRowNum;
        sheetResult.createRow(num);

    }

    public void updateRowColExcel(int rnum, int cnum, String value) {
        updateExcelColNum++;
        cnum = updateExcelColNum;
        sheetResult.getRow(rnum).createCell(cnum);
        XSSFCell cell = sheetResult.getRow(rnum).getCell(cnum);
        cell.setCellValue(value);

    }

    public void updateColumn(int rnum, int cnum, String value) {
        XSSFCell cell = sheetResult.getRow(rnum).getCell(cnum);
        cell.setCellValue(value);

    }

    public void resetCounters() {
        updateExcelColNum = -1;

    }

 /*   @Override
    public void run() {
        while (true) {
            if (true) {
                waitTime += 1000;
System.out.println(waitTime);
                if (waitTime > waitLimit) {
                    try {
                        is.close();
                        outStream.close();
                    //downloaded=false;
                        // cancelDownload=true;
                    } catch (Exception ex) {
                        Logger.getLogger(Library.class.getName()).log(Level.SEVERE, null, ex);
                    }

                }
            }
            try {
                Thread.sleep(1000);
            } catch (Exception e) {
            }

        }
    }*/

}