使用Selenium(Java)查找德语Telefone号码

时间:2019-04-04 08:33:36

标签: selenium web-scraping webdriver web-crawler

我正在尝试通过带有Selenium Web驱动程序API的正则表达式从德国网站获取Telefon号码(德语格式)。我的问题是我得到了一些误报,但我还不能排除。有人可以帮我优化正则表达式吗?这样我就可以确定我所获得的是100%的电话号码。在代码中,Impressum通常是联系方式的名称,这就是为什么我要在网站上搜索“ Impressum”这个词,然后单击它,然后以字符串形式下载html主体。然后,我使用正则表达式在html正文中查找电话号码。谢谢。

  public void search() {
        jse = (JavascriptExecutor) driver;
        WebElement w = driver.findElement(By.partialLinkText("mpress"));
        if (w.getText().matches("Impressum" ) || w.getText().matches("impressum")){
            w.click();
        }
        impressum.setBody(driver.findElement(By.tagName("body")).getText());   // HTML-body download
    }

    @SuppressWarnings("Duplicates")
    public void TelRegex() {
        final String regex = "([\\+][0-9]{1,3} [ \\.\\-\\/])?  ([\\(][0-9]{1,6}[\\)])?  ([0-9 \\.\\-\\/]{10,20})$";
        final String string = impressum.getBody();

        final Pattern pattern = Pattern.compile(regex, Pattern.MULTILINE | Pattern.CASE_INSENSITIVE | Pattern.COMMENTS);
        final Matcher matcher = pattern.matcher(string);

        while (matcher.find()) {
            System.out.println("Full match: " + matcher.group(0));
        }
    }

在代码中,Impressum通常是联系方式的名称,这就是为什么我要在网站上搜索“ Impressum”这个词,然后单击它,然后以字符串形式下载html主体。然后,我使用正则表达式在html正文中查找电话号码。它为我提供了fone号,但有时还有其他一些不是fone号的数字。

2 个答案:

答案 0 :(得分:0)

班级:

package syed;

import java.util.ArrayList;
import java.util.Objects;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;

public class Syed {
    private static WebDriver driver;    

    @BeforeClass
    public static void setUpClass() {       
        System.setProperty("webdriver.chrome.driver", "C:\\Users\\pburgr\\Desktop\\selenium-tests\\GCH_driver\\chromedriver.exe");
        ChromeOptions options = new ChromeOptions();
        options.addArguments("user-data-dir=C:\\Users\\pburgr\\AppData\\Local\\Google\\Chrome\\User Data");
        driver = new ChromeDriver(options);
        driver.manage().window().maximize();        
    }
    @Before public void setUp() {} @After public void tearDown() {}
    @AfterClass public static void tearDownClass() {
        driver.quit();
    }
    @Test
    public void extractAllPhoneNumbers() {
        ArrayList<String> phoneNumbers = new ArrayList<String>();

        driver.get("https://www.vario-doser.de/");
        WebElement impressumLink = waitSec(driver, 5).until(ExpectedConditions.elementToBeClickable(By.xpath("//a[@href='ueber-uns/impressum/']")));
        impressumLink.click();
        WebElement content = waitSec(driver, 5).until(ExpectedConditions.elementToBeClickable(By.id("content")));
        String[] contentText = content.getText().split("\\n");

        for (String line: contentText) {
            if (line.length() > 0 && Objects.equals(line.substring(0, 3), "Tel")) {
                phoneNumbers.add(line);
                System.out.println("Extracting: " + line);
            }
            else {
                System.out.println("Textline does not beginn with 'Tel'");
            }
        }
        if (phoneNumbers.size() > 0) {
            System.out.println("Extracted phone numbers:");
            for (String phoneNumber: phoneNumbers) {
                System.out.println(phoneNumber);
            }
        }
        else {
            System.out.println("No phone number found.");
        }

    }   
    public WebDriverWait waitSec(WebDriver driver, int sec) {
        return new WebDriverWait(driver, sec);
    }
}

输出:

Starting ChromeDriver 2.42.591088 (7b2b2dca23cca0862f674758c9a3933e685c27d5) on port 3253
Only local connections are allowed.
Dub 15, 2019 9:46:23 DOP. org.openqa.selenium.remote.ProtocolHandshake createSession
INFO: Detected dialect: OSS

Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Extracting: Tel.: 09721 / 533404
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Extracting: Tel: 0180 / 60 05 85 0
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Textline does not beginn with 'Tel'
Extracted phone numbers:
Tel.: 09721 / 533404
Tel: 0180 / 60 05 85 0

这是您需要的吗?

答案 1 :(得分:0)

根据前缀提取电话号码:

public void extractAllPhoneNumbers() {
    ArrayList<String> phoneNumbers = new ArrayList<String>();

    driver.get("https://www.vario-doser.de/");
    WebElement impressumLink = waitSec(driver, 5).until(ExpectedConditions.elementToBeClickable(By.xpath("//a[@href='ueber-uns/impressum/']")));
    impressumLink.click();
    WebElement content = waitSec(driver, 5).until(ExpectedConditions.elementToBeClickable(By.id("content")));
    String[] contentText = content.getText().split("\\n");

    String[] prefixes = {"0180 / ", "09721 / "};

    for (String line: contentText) {
        for (String prefix: prefixes) {
            if (line.contains(prefix)) {
                phoneNumbers.add(line);
                System.out.println("Extracting: " + line.split(prefix)[1]);
            }
            else {
                System.out.println("Textline does not contain any of the prefixes.");
            }
        }
    }
    if (phoneNumbers.size() > 0) {
        System.out.println("Extracted phone numbers:");
        for (String phoneNumber: phoneNumbers) {
            System.out.println(phoneNumber);
        }
    }
    else {
        System.out.println("No phone number found.");
    }

}

但是它包括传真。

...
Textline does not contain any of the prefixes.
Extracted phone numbers:
Tel.: 09721 / 533404
Fax: 09721 / 533405
Tel: 0180 / 60 05 85 0