我写了一段抓取网站的代码，遇到了两个问题。
1. 代码本应递归地获取同一域名下的所有链接，但它还没有检索完所有链接就停止了。我怀疑是 loop 函数有问题。
2. 对 getLinksPage 函数进行模拟（mock）的测试代码失败了。我用类似的写法模拟 foo 函数时可以正常工作，但这一段不行。
/** Crawls a web site and collects the links that stay on the host of the start page.
  *
  * Pages are fetched with Jsoup. [[getLinksPage]] is the only method that touches
  * the network, so subclasses can override it to drive the traversal from canned data.
  */
class Crawler {
  /** Default start URL. */
  val mainURL = "http://www.eldiario.es"

  /** Fetches `urlToCrawl` and returns its same-host links as absolute URLs.
    *
    * Only `a[href]` elements are considered, and a link is kept when its host equals
    * the host of `urlToCrawl`. Links that cannot be parsed as a URL are skipped
    * individually instead of failing the whole page. The result is printed to stdout.
    *
    * @param urlToCrawl address of the page to fetch
    * @return the same-host links found on the page, or `List(urlToCrawl)` when the
    *         page cannot be fetched or `urlToCrawl` itself is not a valid URL
    */
  def getLinksPage(urlToCrawl: String): List[String] = {
    val sameHostLinks: Try[List[String]] = for {
      doc <- Try(Jsoup.connect(urlToCrawl).get())
      targetHost <- Try(new URL(urlToCrawl).getHost)
    } yield {
      doc
        .select("a[href]")
        .asScala
        .toList
        .map(_.attr("abs:href"))
        // "abs:href" is empty when Jsoup cannot resolve the link (e.g. javascript:),
        // and new URL("") throws; drop just that link rather than the whole page.
        .flatMap(href => Try(new URL(href)).toOption)
        .filter(_.getHost == targetHost)
        .map(_.toString)
    }
    val pageLinks: List[String] = sameHostLinks.getOrElse(List[String](urlToCrawl))
    println(pageLinks)
    pageLinks
  }

  /** Visits every URL reachable from `ls` that is not already in `acc`.
    *
    * @param ls  URLs still waiting to be visited
    * @param acc URLs already visited, most recently visited first
    * @return `acc` extended with every newly visited URL
    */
  def loop(ls: List[String], acc: List[String]): List[String] = {
    // `seen` mirrors `visited` as a Set so the membership test is not a linear scan.
    @scala.annotation.tailrec
    def go(pending: List[String], visited: List[String], seen: Set[String]): List[String] =
      pending match {
        case Nil => visited
        case hd :: tl if seen(hd) => go(tl, visited, seen)
        // Keep the rest of the queue (`tl`): dropping it loses every sibling link
        // of `hd` and ends the crawl early.
        case hd :: tl => go(getLinksPage(hd) ::: tl, hd :: visited, seen + hd)
      }
    go(ls, acc, acc.toSet)
  }

  /** Crawls from `mainURL` and returns every same-host URL reached, including `mainURL`. */
  def getAllLinkPages(mainURL: String): List[String] =
    loop(getLinksPage(mainURL), List(mainURL))
}
/** Spec that swaps the crawler's page fetching for a ScalaMock stub.
  *
  * As written, the single test only checks that the overridden `getLinksPage`
  * hands back whatever the stub was told to return.
  */
class CrawlerSpec extends WordSpec with MockFactory {

  /** A stub for page fetching plus a crawler whose `getLinksPage` delegates to it. */
  trait LinksFixture {
    val linksStub = stubFunction[String, List[String]]

    // Lazy: the crawler is only built on first use, i.e. after the test has
    // configured the stub.
    // NOTE(review): the captured run shows Crawler.<init> calling getAllLinkPages,
    // which is not visible in the class as shown here. If the constructor does
    // that, the stub is hit with the crawler's own start URL, for which no answer
    // is configured. Confirm against the full Crawler source.
    lazy val crawlerMock = new Crawler() {
      override def getLinksPage(urlToCrawl: String) = linksStub(urlToCrawl)
    }
  }

  "getLinksPage" should {
    "return the links" in new LinksFixture {
      val startUrl = "http://example.com"
      val expectedLinks =
        List("http://example.com", "http://example.com/a", "http://example.com/b")

      linksStub.when(startUrl).returns(expectedLinks)

      crawlerMock.getLinksPage(startUrl) shouldBe expectedLinks
    }
  }
}
[info] CrawlerSpec:
[info] getLinksPage
[info] - should return the links *** FAILED ***
[info] scala.MatchError: null
[info] at rbs.Crawler.loop(Crawler.scala:43)
[info] at rbs.Crawler.getAllLinkPages(Crawler.scala:47)
[info] at rbs.Crawler.<init>(Crawler.scala:49)
编辑 2：把 stubFunction 换成 mockFunction 之后的输出如下：
[info] CrawlerSpec:
[info] getLinksPage
[info] - should return the links *** FAILED ***
[info] Unexpected call: MockFunction1-1(http://www.eldiario.es)
[info]
[info] Expected:
[info] inAnyOrder {
[info] MockFunction1-1(http://example.com) once (never called - UNSATISFIED)
[info] }
[info]
[info] Actual:
[info] MockFunction1-1(http://www.eldiario.es) (Option.scala:121)