使用Python

时间:2016-07-28 13:02:00

标签: python html beautifulsoup

应许多SO用户的要求,我已经重写了全部代码,这次使用BeautifulSoup而不是正则表达式来获取href和src链接。代码如下:

import os
from bs4 import BeautifulSoup
from urllib.parse import urlparse

# Address of the page being processed. It must be passed to urlparse() as a
# string literal; the parsed result exposes the URL components (path, query, ...).
path = urlparse("http://www.example.com/dynamic/search.aspx?searchtype=cat&class_id=2566&city_id=55")
# Directory portion of the URL path ("/dynamic" for the URL above).
lpath = os.path.dirname(path.path)

html = u"<html class=\"\"><head id=\"pageHead\"><title>\n    Beauty Salons | Best Beauty Care &amp; Treatments | Listings @ Phonebook Online\n</title>\n    <!--\n    <meta http-equiv=\"Cache-Control\" content=\"no-cache, no-store, must-revalidate\" /><meta http-equiv=\"Pragma\" content=\"no-cache\" /><meta http-equiv=\"Expires\" content=\"0\" />\n    -->\n    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\"><link rel=\"stylesheet\" href=\"../css_responsive/category.css\" type=\"text/css\" media=\"screen\">\n    <script async=\"\" src=\"//www.google-analytics.com/analytics.js\"></script><script async=\"\" src=\"//www.google.com/adsense/search/async-ads.js\"></script><script type=\"text/javascript\" src=\"../styles/scripts/jquery-1.9.1.min.js\"></script>\n    <link rel=\"shortcut icon\" type=\"image/png\" href=\"/PhoneBook.ico\">\n    <!-- #Begin Css Plugin -->\n    <link rel=\"stylesheet\" href=\"../css_responsive/fontsss.css\"><link rel=\"stylesheet\" href=\"../css_responsive/bootstrap-3.3.4-dist/css/bootstrap.css\" type=\"text/css\" media=\"screen\"><link rel=\"stylesheet\" href=\"../styles/scripts/fancybox/jquery.fancybox.css\" type=\"text/css\" media=\"screen\"><link rel=\"stylesheet\" href=\"../css_responsive/icon-detail.css\" type=\"text/css\" media=\"screen\">\n    <!-- #Finish Css Plugin-->\n    <!--<script src=\"http://www.google.com/adsense/search/ads.js\" type=\"text/javascript\"></script> -->\n    <script type=\"text/javascript\" charset=\"utf-8\">\n            (function (G, o, O, g, L, e) {\n                G[g] = G[g] || function () {\n                    (G[g]['q'] = G[g]['q'] || []).push(\n       arguments)\n                }, G[g]['t'] = 1 * new Date; L = o.createElement(O), e = o.getElementsByTagName(\n       O)[0]; L.async = 1; L.src = '//www.google.com/adsense/search/async-ads.js';\n                e.parentNode.insertBefore(L, e)\n            })(window, document, 'script', '_googCsa');\n    </script>\n    <!-- Script For 
Mobile Base Banner-->\n        <script async=\"\" src=\"//pagead2.googlesyndication.com/pagead/js/adsbygoogle.js\"></script>\n        <script>\n            (adsbygoogle = window.adsbygoogle || []).push({\n                google_ad_client: \"ca-pub-6517686434458516\",\n                enable_page_level_ads: true\n            });\n        </script>\n    <!-- Script For Mobile Base Banner END-->\n\n\n    <script type=\"text/javascript\">\n        function AddClass(Class, Element, HasPriority) {\n            if (HasPriority == 0) {\n                this.className = 'container ' + Class;\n            }\n        }\n    </script>\n    \n<meta name=\"description\" content=\"Best Beauty Salons in Abbottabad for quality beauty care and treatments. \"><meta name=\"keywords\" content=\"beauty salons,beauty care,beauty treatments\"><style type=\"text/css\">.fancybox-margin{margin-right:17px;}</style></head>\n<body style=\"text-shadow: rgba(255, 255, 255, 0.4) 0px 1px 1px; background-color: rgb(240, 240, 240);\">\n<div class=\"wapper\">\n        <div class=\"pagecontent search_width c-no-t-margin\">\n            <div class=\"cblock ele-margin-t-b-15 m-on-mob-hide\"><a href=\"../../default.aspx\">Home</a> &gt; <a href=\"../../dynamic/categories.aspx\">Search by category</a> &gt; <a href=\"../../dynamic/categories.aspx?class_id=12\">Personal Care</a> &gt; <a href=\"../../dynamic/categories.aspx?class_id=134\">Barbers, Beauty Salons &amp; Spas</a> &gt; Beauty Salons in Abbottabad</div>\n            <div class=\"refine\">\n                <span>Refine Result</span>\n                <span>Show Result With</span>\n                <ul>\n                    <li>\n                        <input class=\"csortType csortTypeAll \" type=\"checkbox\" value=\"100\" name=\"\" checked=\"checked\" disabled=\"disabled\">\n                        <span class=\"\">All</span>\n                    </li>\n                    <li>\n                        <input class=\"csortType css-checkbox\" 
type=\"checkbox\" value=\"1\" name=\"\">\n                        <i class=\"icon-star-full c-icon-starfull-stroke\"></i>\n                        <span>Reviews</span>\n                    </li>\n                    <li>\n                        <input class=\"csortType\" type=\"checkbox\" value=\"2\" name=\"\">\n                        <i class=\"icon-price-tag cColor-Red\"></i>\n                        <span>Deals &amp; Coupons</span>\n                    </li>\n                     <li>\n                        <input class=\"csortType\" type=\"checkbox\" value=\"5\" name=\"\">\n                        <i class=\"icon-bullhorn\"></i>\n                        <span>Announcements</span>\n                    </li>\n                    <li>\n                        <input class=\"csortType\" type=\"checkbox\" value=\"3\" name=\"\">\n                        <i class=\"icon-location\"></i>\n                        <span>Map</span>\n                    </li>\n                    <li>\n                        <input class=\"csortType\" type=\"checkbox\" value=\"4\" name=\"\">\n                        <i class=\"icon-film\"></i>\n                        <span>Video</span>\n                    </li>\n                </ul>\n                \n                <div class=\"tab\" onclick=\"SlideTogle('Location')\">\n                    Search by location\n                </div>\n                \n                        <ul id=\"Location\" style=\"display: none;\">\n                    \n                        <li><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=1\">Karachi</a></li>\n                    \n                        <li><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=2\">Lahore</a></li>\n                    \n                        <li><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=56\">Islamabad</a></li>\n                    \n                        <li><a 
href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=79\">Rawalpindi</a></li>\n                    \n                        <li><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=49\">Faisalabad</a></li>\n                    \n                        <li><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=81\">Gujranwala</a></li>\n                    \n                        <li><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=78\">Peshawar</a></li>\n                    \n                        <li><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=82\">Sialkot</a></li>\n                    \n                        <li><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=53\">Sargodha</a></li>\n                    \n                        </ul>\n                    \n                <div class=\"tab\" onclick=\"SlideTogle('Category')\">\n                    Search by category\n                </div>\n                \n                        <ul id=\"Category\" style=\"display: none;\">\n                    \n                        <li><a href=\"search.aspx?searchtype=cat&amp;class_id=2571\">Hairstylists</a></li>\n                    \n                        <li><a href=\"search.aspx?searchtype=cat&amp;class_id=2575\">Hair Removal, Wax, Threading Body &amp; Face</a></li>\n                    \n                        <li><a href=\"search.aspx?searchtype=cat&amp;class_id=2584\">Manicuring</a></li>\n                    \n                        <li><a href=\"search.aspx?searchtype=cat&amp;class_id=2574\">Nail Salons &amp; Services</a></li>\n                    \n                        <li><a href=\"search.aspx?searchtype=cat&amp;class_id=2572\">Spas-Beauty, Health And Destination</a></li>\n                    \n                        <li><a href=\"search.aspx?searchtype=cat&amp;class_id=2564\">Beauty Institutes</a></li>\n                    \n                        
<li><a href=\"search.aspx?searchtype=cat&amp;class_id=2569\">Estheticians</a></li>\n                    \n                        </ul>\n            </div>\n            <div id=\"cResultMainControl\">\n                <div class=\"result_hldr\" id=\"cResultContainer\">\n                    <div class=\"h1\"><h1>Beauty Salons in Abbottabad.</h1></div>\n                    <div class=\"h1 page_desc cfont-12 cNo-Margin ele-pad-r-l-20 m-on-mob-hide\"><p class=\"cNo-Margin margin-t m-ele-top-no-margin \" style=\"line-height:18px;\">Best Beauty Salons in Abbottabad for quality beauty care and treatments, <a href=\"http://www.phonebook.com.pk/dynamic/search.aspx?SearchType=kl&amp;k=bridal+makeup\" title=\"Bridal Makeup\" target=\"_blank\">bridal makeup</a>, <a href=\"http://www.phonebook.com.pk/dynamic/search.aspx?SearchType=kl&amp;k=body+massage\" title=\"Body Massage\" target=\"_blank\">body massage</a>.</p></div>\n                    <div class=\"cMobileHidden col-md-12 col-xs-12 text-center overflow-visible cheight-25 margin-t\" style=\"background-color: rgb(240, 240, 240);\">\n                        <script async=\"\" src=\"//pagead2.googlesyndication.com/pagead/js/adsbygoogle.js\"></script>\n                        <!-- New Line Link Ad -->\n                        <ins class=\"adsbygoogle\" style=\"display:inline-block;width:468px;height:15px;background-color: rgb(240, 240, 240);\" data-ad-client=\"ca-pub-6517686434458516\" data-ad-slot=\"4522680219\"></ins>\n                        <script>\n                            (adsbygoogle = window.adsbygoogle || []).push({});\n                        </script>\n                    </div>\n                    <div id=\"cAlpNav\" class=\"margin-t-10 cAlpNav m-on-mob-hide\">\n                    <div class=\"text-center\"><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=55\">all</a><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=55&amp;alp=a\">a</a><a 
href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=55&amp;alp=b\">b</a><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=55&amp;alp=c\">c</a><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=55&amp;alp=d\">d</a><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=55&amp;alp=e\">e</a><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=55&amp;alp=f\">f</a><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=55&amp;alp=g\">g</a><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=55&amp;alp=h\">h</a><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=55&amp;alp=i\">i</a><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=55&amp;alp=j\">j</a><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=55&amp;alp=k\">k</a><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=55&amp;alp=l\">l</a><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=55&amp;alp=m\">m</a><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=55&amp;alp=n\">n</a><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=55&amp;alp=o\">o</a><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=55&amp;alp=p\">p</a><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=55&amp;alp=q\">q</a><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=55&amp;alp=r\">r</a><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=55&amp;alp=s\">s</a><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=55&amp;alp=t\">t</a><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=55&amp;alp=u\">u</a><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=55&amp;alp=v\">v</a><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=55&amp;alp=w\">w</a><a 
href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=55&amp;alp=x\">x</a><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=55&amp;alp=y\">y</a><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=55&amp;alp=z\">z</a></div></div>\n                    <div>\n                        <div id=\"cListingHldr\" class=\"listing\">\n                        \n<div class=\"container\">\n    <div class=\"comp_info\">\n        <h2><a href=\"../../company/51529-Beena-Beauty-Parlour\">Beena's Beauty Parlour</a></h2>\n        <!--<img class=\"margin-t\" alt=\"Comapny Rating\" src=\"../../images/Stars>.png\" />-->\n        <i class=\"cfont-12 cnoPad left icon-zero-star\"></i>\n        \n            <span class=\"blue margin-t\">(No Review)</span>\n        \n                <span class=\"cfontBold margin-t cColor-Black cColor-SilverDark\">\n                Main Mansehra Road, Near Radio Pakistan, Abbottabad.\n            </span>\n        \n        <div class=\"inline-block  cMobile-Right\">\n            <ul class=\"margin-t cMobile-Text-Align-Right\">\n                <li>\n                    <a data-fancybox-type=\"iframe\" href=\"../../dynamic/emailtocustomer.aspx?Request_ID=26207&amp;comp_name=Beena-Beauty-Parlour&amp;isAdvertizer=0\" class=\"other_links fancybox\">Email</a>\n                </li>\n                 <li>\n                    <a title=\"Call Now\" href=\"tel:+92-992-335556\" class=\"c_circle cMobileShow\"></a>\n                </li>\n                <li>\n                    <a class=\"other_links\" href=\"../../company/51529-Beena-Beauty-Parlour\" title=\"Company Detail\">Detail</a>\n                </li>\n                 \n             </ul>\n        </div>\n    </div>\n    <div class=\"comp_info contact_info\">\n        <strong><a class=\"tel\" href=\"tel:+92-992-335556\">+92-992-335556</a></strong>\n        \n    </div>\n</div>\n<div class=\"container\">\n    <div class=\"comp_info\">\n        <h2><a 
href=\"../../company/86977-Unique-Beauty-Salon\">Unique Beauty Salon</a></h2>\n        <!--<img class=\"margin-t\" alt=\"Comapny Rating\" src=\"../../images/Stars>.png\" />-->\n        <i class=\"cfont-12 cnoPad left icon-zero-star\"></i>\n        \n            <span class=\"blue margin-t\">(No Review)</span>\n        \n                <span class=\"cfontBold margin-t cColor-Black cColor-SilverDark\">\n                Palki Wedding Hall, Mandian , Abbottabad.\n            </span>\n        \n        <div class=\"inline-block  cMobile-Right\">\n            <ul class=\"margin-t cMobile-Text-Align-Right\">\n                <li>\n                    <a data-fancybox-type=\"iframe\" href=\"../../dynamic/emailtocustomer.aspx?Request_ID=61717&amp;comp_name=Unique-Beauty-Salon&amp;isAdvertizer=0\" class=\"other_links fancybox\">Email</a>\n                </li>\n                 <li>\n                    <a title=\"Call Now\" href=\"tel:+92-313-5856739\" class=\"c_circle cMobileShow\"></a>\n                </li>\n                <li>\n                    <a class=\"other_links\" href=\"../../company/86977-Unique-Beauty-Salon\" title=\"Company Detail\">Detail</a>\n                </li>\n                 \n             </ul>\n        </div>\n    </div>\n    <div class=\"comp_info contact_info\">\n        <strong><a class=\"tel\" href=\"tel:+92-313-5856739\">+92-313-5856739</a></strong>\n        \n    </div>\n</div></div>\n                        <div id=\"cRecoredInfo\" class=\"listing dotted\">Displaying listings from 1 to 10 of 10</div>\n                        <div class=\"text-center m-pad-l-r-10\">\n                            <div id=\"related-suggestions\" class=\"listing inline-block text-center cPad-b-t-10\"><span class=\"left cfont-14\"><b>Related Searches:</b></span> <div class=\"newsssss left inline\" style=\"font-style: italic;font-weight:bold;\"><a href=\"search.aspx?searchtype=cat&amp;class_id=2584\" class=\"left ele-pad-r-l-20 text-underline 
cfont-14\">Manicuring</a></div><div class=\"newsssss left inline\" style=\"font-style: italic;font-weight:bold;\"><a href=\"search.aspx?searchtype=cat&amp;class_id=2575\" class=\"left ele-pad-r-l-20 text-underline cfont-14\">Hair Removal, Wax, Threading Body &amp; Face</a></div><div class=\"newsssss left inline\" style=\"font-style: italic;font-weight:bold;\"><a href=\"search.aspx?searchtype=cat&amp;class_id=2571\" class=\"left ele-pad-r-l-20 text-underline cfont-14\">Hairstylists</a></div>\n                                <div class=\"text-left ele-margin-t-b-15 left inline\"><b>Need help with your search?</b> Browse by:<a class=\"text-left ele-pad-r-l-20 text-underline\" onclick=\"hide_show('#related-locations',this);$('#related-categories').addClass('hide');\" href=\"javascript:void(0)\">other locations <img alt=\"\" class=\"margin-l\" width=\"18\" src=\"../../images/plus.png\"></a><a class=\"text-left ele-pad-r-l-20 text-underline\" onclick=\"hide_show('#related-categories',this);$('#related-locations').addClass('hide');\" href=\"javascript:void(0)\">similar categories <img alt=\"\" class=\"margin-l\" width=\"18\" src=\"../../images/plus.png\"></a></div><ul id=\"related-locations\" class=\"col-xs-12 col-sm-12 sugesstion-box hide\">\n                                <li class=\"left cblock margin-l col-xs-3 col-sm-2\"><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=1\" class=\"left\">Karachi</a></li><li class=\"left cblock margin-l col-xs-3 col-sm-2\"><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=2\" class=\"left\">Lahore</a></li><li class=\"left cblock margin-l col-xs-3 col-sm-2\"><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=56\" class=\"left\">Islamabad</a></li><li class=\"left cblock margin-l col-xs-3 col-sm-2\"><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=79\" class=\"left\">Rawalpindi</a></li><li class=\"left cblock margin-l col-xs-3 col-sm-2\"><a 
href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=49\" class=\"left\">Faisalabad</a></li><li class=\"left cblock margin-l col-xs-3 col-sm-2\"><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=81\" class=\"left\">Gujranwala</a></li><li class=\"left cblock margin-l col-xs-3 col-sm-2\"><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=78\" class=\"left\">Peshawar</a></li><li class=\"left cblock margin-l col-xs-3 col-sm-2\"><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=82\" class=\"left\">Sialkot</a></li><li class=\"left cblock margin-l col-xs-3 col-sm-2\"><a href=\"search.aspx?searchtype=cat&amp;class_id=2566&amp;city_id=53\" class=\"left\">Sargodha</a></li></ul>\n                                <ul id=\"related-categories\" class=\"col-xs-12 col-sm-12 sugesstion-box hide\">\n                                <li class=\"left cblock margin-l col-xs-4 col-sm-4 text-left\"><a href=\"search.aspx?searchtype=cat&amp;class_id=2574\" class=\"left\">Nail Salons &amp; Services</a></li><li class=\"left cblock margin-l col-xs-4 col-sm-4 text-left\"><a href=\"search.aspx?searchtype=cat&amp;class_id=2572\" class=\"left\">Spas-Beauty, Health And Destination</a></li><li class=\"left cblock margin-l col-xs-4 col-sm-4 text-left\"><a href=\"search.aspx?searchtype=cat&amp;class_id=2564\" class=\"left\">Beauty Institutes</a></li><li class=\"left cblock margin-l col-xs-4 col-sm-4 text-left\"><a href=\"search.aspx?searchtype=cat&amp;class_id=2569\" class=\"left\">Estheticians</a></li></ul>\n                            </div>\n                        </div>\n                        <div class=\"text-center\">\n                        </div>\n                    </div>\n                </div>\n            </div>\n        </div>\n    </div>\n    \n<div class=\"container-fluid bg-silver m-on-mob-hide\">\n    <div class=\"row cPad-b-t-10\" style=\"border-bottom:1px solid #ECECEC;\">\n            \n    </div>\n</div>\n<script>\n  
  (function (i, s, o, g, r, a, m) {\n        i['GoogleAnalyticsObject'] = r; i[r] = i[r] || function () {\n            (i[r].q = i[r].q || []).push(arguments)\n        }, i[r].l = 1 * new Date(); a = s.createElement(o),\n  m = s.getElementsByTagName(o)[0]; a.async = 1; a.src = g; m.parentNode.insertBefore(a, m)\n    })(window, document, 'script', '//www.google-analytics.com/analytics.js', 'ga');\n\n    ga('create', 'UA-2028280-1', 'auto');\n    ga('send', 'pageview');\n</script>\n<script type=\"text/javascript\" src=\"../css_responsive/script/global_functions.js\"></script>\n<script type=\"text/javascript\" src=\"../styles/scripts/fancybox/jquery.fancybox.js?v=2.1.5\"></script>\n<script type=\"text/javascript\" src=\"../css_responsive/bootstrap-3.3.4-dist/js/bootstrap.js\"></script>\n</body></html>"

# Build the parse tree for the sample page using the lxml backend.
soup = BeautifulSoup(html, "lxml")

# Walk every tag carrying an href, then every tag carrying a src, and print
# the attribute value unless it is empty or starts with "http" or "jav".
for attr_name in ("href", "src"):
    for tag in soup.find_all(**{attr_name: True}):
        link = tag[attr_name]
        if not link or link.startswith(("http", "jav")):
            continue
        print (link)

此代码会在控制台打印所有链接,并且我可以使用if-elif-else来区分"../../"、"../"、"/"和"//",从而成功地把它们转换为绝对路径。但问题是,当我尝试使用"re.sub"把它们替换回去时,整个HTML又被弄乱了。我已经改用BS4而不是正则表达式,但问题依旧。由于字符数限制,我无法在这里贴出输出,但需要说明的是,它还会弄乱""或其他任何HTML标签。请建议一种方法,让我能够更改这些链接并把它们放回原来的位置。

注意:根据akashkarothiya的建议,代码已精简。

2 个答案:

答案 0 :(得分:0)

多亏了akash karothiya的解决方案,下面是最终代码,一切都运行正常。

此代码可将任意给定HTML代码中所有类型的相对链接转换为绝对链接。

import os, re
from bs4 import BeautifulSoup
from urllib.parse import urlparse, unquote

import posixpath
from urllib.parse import parse_qs

# Proxy-style request URL: the page to process is percent-encoded inside its
# "link" query parameter.
proxied_url = "http://webpy_server/?link=http%3A//www.example.com/dynamic/search.aspx%3Fsearchtype%3Dcat%26class_id%3D4520%26page%3D1"

# Fully decoded form of the request URL (kept under its original name).
unquoteURL = unquote(proxied_url)

# Pull the parameter out BEFORE decoding it: parse_qs splits the outer query
# while the target URL is still percent-encoded, so "&", "#" or the text
# "link=" inside the target URL cannot be confused with the proxy URL's own
# syntax.  parse_qs then decodes the extracted value.
target_url = parse_qs(urlparse(proxied_url).query)["link"][0]

# Parsed target URL (scheme, netloc, path, query, ...).
path = urlparse(target_url)

# Directory part of the target URL's path.  URL paths always use "/", so
# posixpath is used; os.path would apply the host OS's path rules instead.
lpath = posixpath.dirname(path.path)

html = u"\n<!DOCTYPE html class=\"\"><head id=\"pageHead\"><title>\n    Yarn Manufacturers &amp; Suppliers | Listings @ Phonebook Online\n</title>\n    <!--\n    <meta http-equiv=\"Cache-Control\" content=\"no-cache, no-store, must-revalidate\" /><meta http-equiv=\"Pragma\" content=\"no-cache\" /><meta http-equiv=\"Expires\" content=\"0\" />\n    -->\n    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\"><link rel=\"stylesheet\" href=\"../css_responsive/category.css\" type=\"text/css\" media=\"screen\">\n    <script async=\"\" src=\"//www.google-analytics.com/analytics.js\"></script><script async=\"\" src=\"//www.google.com/adsense/search/async-ads.js\"></script><script type=\"text/javascript\" src=\"../styles/scripts/jquery-1.9.1.min.js\"></script>\n    <link rel=\"shortcut icon\" type=\"image/png\" href=\"/PhoneBook.ico\">\n    <!-- #Begin Css Plugin -->\n    <link rel=\"stylesheet\" href=\"../css_responsive/fontsss.css\"><link rel=\"stylesheet\" href=\"../css_responsive/bootstrap-3.3.4-dist/css/bootstrap.css\" type=\"text/css\" media=\"screen\"><link rel=\"stylesheet\" href=\"../styles/scripts/fancybox/jquery.fancybox.css\" type=\"text/css\" media=\"screen\"><link rel=\"stylesheet\" href=\"../css_responsive/icon-detail.css\" type=\"text/css\" media=\"screen\">\n    <!-- #Finish Css Plugin-->\n    <!--<script src=\"http://www.google.com/adsense/search/ads.js\" type=\"text/javascript\"></script> -->\n    <script type=\"text/javascript\" charset=\"utf-8\">\n            (function (G, o, O, g, L, e) {\n                G[g] = G[g] || function () {\n                    (G[g]['q'] = G[g]['q'] || []).push(\n       arguments)\n                }, G[g]['t'] = 1 * new Date; L = o.createElement(O), e = o.getElementsByTagName(\n       O)[0]; L.async = 1; L.src = '//www.google.com/adsense/search/async-ads.js';\n                e.parentNode.insertBefore(L, e)\n            })(window, document, 'script', '_googCsa');\n    </script>\n    <!-- Script For Mobile 
Base Banner-->\n        <script async=\"\" src=\"//pagead2.googlesyndication.com/pagead/js/adsbygoogle.js\"></script>\n        <script>\n            (adsbygoogle = window.adsbygoogle || []).push({\n                google_ad_client: \"ca-pub-6517686434458516\",\n                enable_page_level_ads: true\n            });\n        </script>\n    <!-- Script For Mobile Base Banner END-->\n\n\n    <script type=\"text/javascript\">\n        function AddClass(Class, Element, HasPriority) {\n            if (HasPriority == 0) {\n                this.className = 'container ' + Class;\n            }\n        }\n    </script>\n    \n<meta name=\"description\" content=\"Online Directory of Yarn Manufacturers &amp; Suppliers in Pakistan, providing list of names, contact numbers, addresses and reviews.\"><meta name=\"keywords\" content=\"Yarn Manufacturers &amp; Suppliers\"><style type=\"text/css\">.fancybox-margin{margin-right:17px;}</style></head>\n<body style=\"text-shadow: rgba(255, 255, 255, 0.4) 0px 1px 1px; background-color: rgb(240, 240, 240);\">\n    <!--Top Nav Bar Start -->\n    \n<div class=\"wapper bg-h\">\n    <div class=\"container-fluid\">\n        <div class=\"col-xs-12 col-md-12\">\n            <div class=\"col-xs-12 col-md-12\">\n                <div class=\"ele-block text-right ele-color-white ele-pad-t-5 m-text-center cMobileTextCenter cfont-12\" style=\"padding-top:5px;\">\n                    <a class=\"\" href=\"../dynamic/free-basic-listing.aspx\"> Free basic listing</a> \n                    | \n                    <a class=\"\" href=\"/advertisement-center/\"> Advertise with us</a>\n                </div>\n            </div>\n        </div>\n    </div>\n    <div class=\"header\">\n        <div class=\"logo\">\n            <div class=\"cMobileHidden left cPad-b-t-25\">\n                <img alt=\"Slider\" height=\"26\" class=\"left\" src=\"../../images/list-icon-slvr.png\" onclick=\"DefaultSliderMenu()\" style=\"cursor:pointer;\">\n            </div>\n 
           <div class=\"cDesktopHidden cMobileShow\">\n                <img alt=\"Slider\" height=\"26\" class=\"ele-float-left\" src=\"../../images/list-icon-slvr.png\" onclick=\"SlideMenu()\" style=\"cursor:pointer;vertical-align: baseline !important; \">\n            </div>\n            <!--<span class=\"home-slide-icon icon-list2 cPad-b-t-10 cDesktopHidden\" onclick=\"SlideMenu()\"></span>-->\n            <a class=\"left ele-margin-t-b-15 cMobileFloatNone\" style=\"text-decoration:none !important\" href=\"../../\">\n                <img alt=\"Phonebook\" class=\"\" width=\"205\" src=\"../../images/final-logo2s.png\">\n            </a>\n            <div class=\"cDesktopHidden cMobileShow\">\n                <img alt=\"Slider\" width=\"38\" height=\"26\" class=\"ele-float-left\" src=\"/images/magnify-glass-2.png\" onclick=\"enableMobileSearchOption() \" style=\"cursor:pointer;vertical-align: baseline !important; \">\n            </div>\n            <!--<a href=\"../../default.aspx\"><img height=\"60\" alt=\"Phonebook\" src=\"../images/Phonebook-Online-Logo-Big-new2.png\" /></a>-->\n            <!--<h2 class=\"mColorWhite\">Your Online Search Engine</h2>-->\n        </div>\n        <div id=\"cHeader_sky_banner\" class=\"sky_banner\"><embed src=\"http://www.phonebook.com.pk/images/advertisement/swf/79042_8_160614_61864_1.swf\" pluginspage=\"http://www.adobe.com/shockwave/download/download.cgi?P1_Prod_Version=ShockwaveFlash\" width=\"700\" height=\"90\" quality=\"high\" value=\"autostart=true\" wmode=\"transparent\"></div>\n    </div>\n</div>\n<div class=\"wapper bg-h bg-fixed flow-visible m-on-mob-hide\" style=\"top: 0px;\">\n    <div class=\"header\">\n        <form method=\"POST\" action=\"../redirect.aspx?searchtype=kl\">\n            <input class=\"icon-search\" type=\"text\" name=\"keyword\" placeholder=\"What ? (Name or Keyword)\" autocomplete=\"off\" required=\"\">\n            <input class=\"icon-loc\" type=\"text\" name=\"location\" placeholder=\"Where ? 
(City or Area)\" autocomplete=\"off\">\n            <input class=\"submit\" type=\"submit\" value=\"Find\">\n        </form>\n    </div>\n    <i class=\"after icon-circle-up\"></i>\n</div>\n    <!--Top Nav Bar End -->\n    <div class=\"wapper\">\n        <div class=\"pagecontent search_width c-no-t-margin\">\n            <div class=\"cblock ele-margin-t-b-15 m-on-mob-hide\"><a href=\"../../default.aspx\">Home</a> &gt; <a href=\"../../dynamic/categories.aspx\">Search by category</a> &gt; <a href=\"../../dynamic/categories.aspx?class_id=19\">Industrial supplies &amp; services</a> &gt; <a href=\"../../dynamic/categories.aspx?class_id=234\">Textiles</a> &gt; Yarn Wholesale &amp; Manufacturers in Pakistan</div>\n            \n            \n            \n            <div id=\"cResultMainControl\">\n                <div class=\"result_hldr\" id=\"cResultContainer\">\n                    \n                    \n                    <div class=\"cMobileHidden col-md-12 col-xs-12 text-center overflow-visible cheight-25 margin-t\" style=\"background-color: rgb(240, 240, 240);\">\n                        <script async=\"\" src=\"//pagead2.googlesyndication.com/pagead/js/adsbygoogle.js\"></script>\n                        <!-- New Line Link Ad -->\n                        <ins class=\"adsbygoogle\" style=\"display:inline-block;width:468px;height:15px;background-color: rgb(240, 240, 240);\" data-ad-client=\"ca-pub-6517686434458516\" data-ad-slot=\"4522680219\"></ins>\n                        <script>\n                            (adsbygoogle = window.adsbygoogle || []).push({});\n                        </script>\n                    </div>\n                    <div id=\"cAlpNav\" class=\"margin-t-10 cAlpNav m-on-mob-hide\">\n                    <div class=\"text-center\"><a href=\"search.aspx?searchtype=cat&amp;class_id=4520\">all</a><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;alp=a\">a</a><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;alp=b\">b</a><a 
href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;alp=c\">c</a><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;alp=d\">d</a><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;alp=e\">e</a><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;alp=f\">f</a><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;alp=g\">g</a><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;alp=h\">h</a><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;alp=i\">i</a><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;alp=j\">j</a><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;alp=k\">k</a><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;alp=l\">l</a><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;alp=m\">m</a><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;alp=n\">n</a><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;alp=o\">o</a><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;alp=p\">p</a><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;alp=q\">q</a><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;alp=r\">r</a><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;alp=s\">s</a><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;alp=t\">t</a><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;alp=u\">u</a><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;alp=v\">v</a><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;alp=w\">w</a><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;alp=x\">x</a><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;alp=y\">y</a><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;alp=z\">z</a></div></div>\n                    <div>\n                        <div id=\"cListingHldr\" class=\"listing\">\n                        \n<div class=\"container\">\n    <div class=\"comp_info\">\n        <h2><a href=\"../../company/77683-A-J-Apparels-Pvt-Ltd\">A &amp; J 
Apparels (Pvt) Ltd.</a></h2>\n        <!--<img class=\"margin-t\" alt=\"Comapny Rating\" src=\"../../images/Stars>.png\" />-->\n        <i class=\"cfont-12 cnoPad left icon-zero-star\"></i>\n        \n            <span class=\"blue margin-t\">(No Review)</span>\n        \n                <span class=\"cfontBold margin-t cColor-Black cColor-SilverDark\">\n                LA/6-A Block  22, F. B Area, Karachi\n            </span>\n        \n        <div class=\"inline-block  cMobile-Right\">\n            <ul class=\"margin-t cMobile-Text-Align-Right\">\n                <li>\n                    <a data-fancybox-type=\"iframe\" href=\"../../dynamic/emailtocustomer.aspx?Request_ID=8127&amp;comp_name=A-J-Apparels-Pvt-Ltd&amp;isAdvertizer=0\" class=\"other_links fancybox\">Email</a>\n                </li>\n                 <li>\n                    <a title=\"Call Now\" href=\"tel:+92-21-36342521\" class=\"c_circle cMobileShow\"></a>\n                </li>\n                <li>\n                    <a class=\"other_links\" href=\"../../company/77683-A-J-Apparels-Pvt-Ltd\" title=\"Company Detail\">Detail</a>\n                </li>\n                 \n             </ul>\n        </div>\n    </div>\n    <div class=\"comp_info contact_info\">\n        <strong><a class=\"tel\" href=\"tel:+92-21-36342521\">+92-21-36342521</a></strong>\n        \n    </div>\n</div>\n\n\n\n\n\n\n\n\n</div>\n                        <div id=\"cRecoredInfo\" class=\"listing dotted\">Displaying listings from 1 to 10 of 161</div>\n                        <div class=\"text-center m-pad-l-r-10\">\n                            <div id=\"related-suggestions\" class=\"listing inline-block text-center cPad-b-t-10\"><span class=\"left cfont-14\"><b>Related Searches:</b></span> <div class=\"newsssss left inline\" style=\"font-style: italic;font-weight:bold;\"><a href=\"search.aspx?searchtype=cat&amp;class_id=1030\" class=\"left ele-pad-r-l-20 text-underline cfont-14\">Importers</a></div><div class=\"newsssss 
left inline\" style=\"font-style: italic;font-weight:bold;\"><a href=\"search.aspx?searchtype=cat&amp;class_id=4499\" class=\"left ele-pad-r-l-20 text-underline cfont-14\">Textiles Wholesale &amp; Manufacturers</a></div><div class=\"newsssss left inline\" style=\"font-style: italic;font-weight:bold;\"><a href=\"search.aspx?searchtype=cat&amp;class_id=1029\" class=\"left ele-pad-r-l-20 text-underline cfont-14\">Exporters</a></div>\n                                <div class=\"text-left ele-margin-t-b-15 left inline\"><b>Need help with your search?</b> Browse by:<a class=\"text-left ele-pad-r-l-20 text-underline\" onclick=\"hide_show('#related-locations',this);$('#related-categories').addClass('hide');\" href=\"javascript:void(0)\">other locations <img alt=\"\" class=\"margin-l\" width=\"18\" src=\"../../images/plus.png\"></a><a class=\"text-left ele-pad-r-l-20 text-underline\" onclick=\"hide_show('#related-categories',this);$('#related-locations').addClass('hide');\" href=\"javascript:void(0)\">similar categories <img alt=\"\" class=\"margin-l\" width=\"18\" src=\"../../images/plus.png\"></a></div><ul id=\"related-locations\" class=\"col-xs-12 col-sm-12 sugesstion-box hide\">\n                                <li class=\"left cblock margin-l col-xs-3 col-sm-2\"><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;city_id=1\" class=\"left\">Karachi</a></li><li class=\"left cblock margin-l col-xs-3 col-sm-2\"><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;city_id=2\" class=\"left\">Lahore</a></li><li class=\"left cblock margin-l col-xs-3 col-sm-2\"><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;city_id=49\" class=\"left\">Faisalabad</a></li><li class=\"left cblock margin-l col-xs-3 col-sm-2\"><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;city_id=77\" class=\"left\">Multan</a></li><li class=\"left cblock margin-l col-xs-3 col-sm-2\"><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;city_id=81\" 
class=\"left\">Gujranwala</a></li><li class=\"left cblock margin-l col-xs-3 col-sm-2\"><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;city_id=15\" class=\"left\">Hub</a></li><li class=\"left cblock margin-l col-xs-3 col-sm-2\"><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;city_id=79\" class=\"left\">Rawalpindi</a></li><li class=\"left cblock margin-l col-xs-3 col-sm-2\"><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;city_id=76\" class=\"left\">Hyderabad</a></li><li class=\"left cblock margin-l col-xs-3 col-sm-2\"><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;city_id=62\" class=\"left\">Muzaffar Garh</a></li><li class=\"left cblock margin-l col-xs-3 col-sm-2\"><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;city_id=60\" class=\"left\">Layyah</a></li></ul>\n                                <ul id=\"related-categories\" class=\"col-xs-12 col-sm-12 sugesstion-box hide\">\n                                <li class=\"left cblock margin-l col-xs-4 col-sm-4 text-left\"><a href=\"search.aspx?searchtype=cat&amp;class_id=4470\" class=\"left\">Knitted Fabrics</a></li><li class=\"left cblock margin-l col-xs-4 col-sm-4 text-left\"><a href=\"search.aspx?searchtype=cat&amp;class_id=4489\" class=\"left\">Synthetic &amp; Blended Fabrics Wholesale &amp; Manufacturers</a></li><li class=\"left cblock margin-l col-xs-4 col-sm-4 text-left\"><a href=\"search.aspx?searchtype=cat&amp;class_id=2391\" class=\"left\">Aprons Wholesale &amp; Manufacturers</a></li><li class=\"left cblock margin-l col-xs-4 col-sm-4 text-left\"><a href=\"search.aspx?searchtype=cat&amp;class_id=2109\" class=\"left\">Linens Wholesale &amp; Manufacturers</a></li></ul>\n                            </div>\n                        </div>\n                        <div class=\"text-center\">\n                            <div id=\"cPagination\" class=\"listing\">\n                                <img class=\"left\" alt=\"\" src=\"../../images/page-1.png\">\n        
                    <a id=\"ctl39_cPageUrl\" class=\"pagi_anchor\">\n    <span id=\"ctl39_cAlp\">B</span>\n    <span id=\"ctl39_cPageNo\"></span>\n </a><a href=\"search.aspx?searchtype=cat&amp;class_id=4520&amp;page=1\" id=\"ctl40_cPageUrl\" class=\"pagi_anchor\">\n    <span id=\"ctl40_cAlp\" style=\"color:red !important;;\">O</span>\n    <span id=\"ctl40_cPageNo\" style=\"color:red !important;;\">1</span>\n </a></div>\n                        </div>\n                         \n                    </div>\n                </div>\n            </div>\n            \n\n            <div class=\"srch_banner\"> \n                \n            \n                \n                \n                \n                \n            </div>\n        </div>\n    </div><div style=\"height: 0px; visibility: hidden; font-weight: normal; text-align: center;\"><iframe frameborder=\"0\" marginwidth=\"0\" marginheight=\"0\" allowtransparency=\"true\" scrolling=\"no\" width=\"100%\" name=\"{&quot;name&quot;:&quot;master-2&quot;,&quot;slave-0-2&quot;:{&quot;container&quot;:&quot;adNewNTRSearchPagecontainer2&quot;,&quot;linkTarget&quot;:&quot;_top&quot;,&quot;lines&quot;:3,&quot;colorBackground&quot;:&quot;#e0e0e0&quot;,&quot;colorBorder&quot;:&quot;#0b0b0b&quot;,&quot;fontFamily&quot;:&quot;verdana&quot;,&quot;adIconLocation&quot;:&quot;ad-left&quot;,&quot;width&quot;:&quot;300px&quot;,&quot;type&quot;:&quot;ads&quot;,&quot;hl&quot;:&quot;en&quot;,&quot;columns&quot;:1,&quot;horizontalAlignment&quot;:&quot;left&quot;,&quot;resultsPageQueryParam&quot;:&quot;query&quot;},&quot;master-2&quot;:{&quot;linkTarget&quot;:&quot;_top&quot;,&quot;lines&quot;:3,&quot;colorBackground&quot;:&quot;#e0e0e0&quot;,&quot;colorBorder&quot;:&quot;#0b0b0b&quot;,&quot;fontFamily&quot;:&quot;verdana&quot;,&quot;adIconLocation&quot;:&quot;ad-left&quot;,&quot;width&quot;:&quot;300px&quot;,&quot;type&quot;:&quot;ads&quot;,&quot;hl&quot;:&quot;en&quot;,&quot;columns&quot;:1,&quot;horizontalAlignment&quot;:&quot;left&q
uot;,&quot;resultsPageQueryParam&quot;:&quot;query&quot;}}\" id=\"master-2\" src=\"https://www.google.com/afs/ads?q=Yarn%20Wholesale%20%26%20Manufacturers&amp;adpage=1&amp;r=m&amp;fexp=21404%2C7000107&amp;client=pub-6517686434458516&amp;channel=3589710218&amp;hl=en&amp;type=0&amp;oe=UTF-8&amp;ie=UTF-8&amp;jsei=3&amp;format=n2&amp;ad=n2&amp;nocache=3631469793737437&amp;num=0&amp;output=uds_ads_only&amp;v=3&amp;allwcallad=1&amp;preload=true&amp;adext=as1%2Csr1%2Cctc1&amp;bsl=10&amp;u_his=3&amp;u_tz=300&amp;dt=1469793737439&amp;u_w=1366&amp;u_h=768&amp;biw=1349&amp;bih=599&amp;psw=1349&amp;psh=1589&amp;frm=0&amp;uio=uv3vp1sl1sr1cc1-wi300ff1&amp;jsv=12350&amp;rurl=http%3A%2F%2Fwww.phonebook.com.pk%2Fdynamic%2Fsearch.aspx%3Fsearchtype%3Dcat%26class_id%3D4520#master-2\" style=\"visibility: hidden; height: 0px;\"></iframe></div>\n    \n<div class=\"container-fluid bg-silver m-on-mob-hide\">\n    <div class=\"row cPad-b-t-10\" style=\"border-bottom:1px solid #ECECEC;\">\n            <!--\n            <div class=\"col-md-12 col-lg-12 col-xs-12\">\n            <img height=\"40\" alt=\"\" src=\"../images/Phonebook-Online-Logo-Big-new.png\" />\n            </div>\n            -->\n    </div>\n</div>\n<div class=\"wapper pad-top-10 footerBg bg-white m-pad-zero\">\n    <div class=\"width footer m-on-mob-hide cMobileHiddenblock \">\n        <ul class=\"list-unstyled col-sm-4 m-on-mob-hide cMobileHidden\">\n            <li class=\"\"><strong style=\"color:#37aef0;\">Popular Keywords :</strong></li>\n            <li>\n                <ul class=\"list-unstyled\">\n                    <li><a href=\"../../dynamic/search.aspx?searchtype=kl&amp;k=restaurants&amp;l=pakistan\">Restaurants</a>,</li>\n                    <li><a href=\"../../dynamic/search.aspx?searchtype=kl&amp;k=pizza&amp;l=pakistan\">Pizza</a>,</li>\n                    <li><a href=\"../../dynamic/search.aspx?searchtype=kl&amp;k=hajj+%26+umrah&amp;l=pakistan\">Hajj &amp; Umrah</a>,</li>\n                    \n             
       \n                    \n                    \n                    \n                    \n                    \n                    \n                    \n                </ul>\n            </li>\n            <li class=\"margin-t\"><strong style=\"color:#37aef0;\">Popular Cities :</strong></li>\n            <li>\n                <ul class=\"list-unstyled\">\n                    <li><a href=\"../../dynamic/city_categories.aspx?city_id=1\">Karachi</a>,</li>\n                    <li><a href=\"../../dynamic/city_categories.aspx?city_id=2\">Lahore</a>,</li>\n                    \n                    \n                    \n                    \n                    \n                    \n                    \n                    \n                    \n                    <li><a href=\"../../dynamic/city_categories.aspx?city_id=75\">Sukkur</a></li>\n                </ul>\n            </li>\n        </ul>\n        <ul class=\"col-xs-6 col-sm-2 styled\">\n            <li class=\"\"><strong style=\"color:#37aef0;\">ADVERTISE :</strong></li>\n            <li><a href=\"/advertisement-center/\">Advertise with us</a></li>\n            <li><a href=\"../../dynamic/free-basic-listing.aspx\">Get a Free Listings</a></li>\n            \n        </ul>\n        <ul class=\"col-xs-6 col-sm-2 styled\">\n            <li class=\"\"><strong style=\"color:#37aef0;\">QUICK LINKS :</strong></li>\n            <li><a href=\"../../dynamic/categories.aspx\">Search by Category</a>,</li>\n            \n            \n            <li><a href=\"javascript:void(0)\">Browse by Video</a></li>\n        </ul>\n        <ul class=\"col-xs-6 col-sm-2 styled\">\n            <li class=\"\"><strong style=\"color:#37aef0;\">ABOUT US:</strong></li>\n            <li><a href=\"../../static/contact-us.aspx\">Contact Us</a></li>\n            <li><a href=\"javscript:void(0)\">Report an Error</a></li>\n            \n            \n            \n            \n        </ul>\n        <ul class=\"col-xs-6 col-sm-2 
styled\">\n            <li class=\"\"><strong style=\"color:#37aef0;\">PARTNERS:</strong></li>\n            <li><a href=\"http://jang.com.pk/\">Jang Group of Newspapers</a></li>\n            \n            \n            <li><a href=\"http://www.ptcl.com.pk/\">PTCL - White Page Telephone Directory Data</a></li>\n        </ul>\n    </div>\n    <div class=\"col-xs-12 m-footer-wapper m-hidden-on-desktop\">\n        <div class=\"col-xs-3\">\n            <a title=\"Home\" href=\"/\"><img class=\"col-xs-12 cNoPad ele-pad-zero\" alt=\"Home\" src=\"../images/footer-icon-home.png\"></a>\n        </div>\n        <div class=\"col-xs-3\">\n            <a title=\"Free Basic Listing\" href=\"/dynamic/free-basic-listing.aspx\"><img class=\"col-xs-12 cNoPad ele-pad-zero\" alt=\"Home\" src=\"../images/footer-icon-free-listing.png\"></a>\n        </div>\n        <div class=\"col-xs-3\">\n            <a title=\"Contact Us\" href=\"/static/contact-us.aspx\"><img class=\"col-xs-12 cNoPad ele-pad-zero\" alt=\"Home\" src=\"../images/footer-icon-contact.png\"></a>\n        </div>\n        <div class=\"col-xs-3\">\n            <a title=\"Free Basic Listing\" href=\"/advertisement-center\"><img class=\"col-xs-12 cNoPad ele-pad-zero\" alt=\"Home\" src=\"../images/footer-icon-advertisewithus.png\"></a>\n        </div>\n    </div>\n</div>\n\n \n<script>\n    (function (i, s, o, g, r, a, m) {\n        i['GoogleAnalyticsObject'] = r; i[r] = i[r] || function () {\n            (i[r].q = i[r].q || []).push(arguments)\n        }, i[r].l = 1 * new Date(); a = s.createElement(o),\n  m = s.getElementsByTagName(o)[0]; a.async = 1; a.src = g; m.parentNode.insertBefore(a, m)\n    })(window, document, 'script', '//www.google-analytics.com/analytics.js', 'ga');\n\n    ga('create', 'UA-2028280-1', 'auto');\n    ga('send', 'pageview');\n</script>\n \n    <div class=\"modal\" id=\"cSlideMenu\" onclick=\"SlideMenu2()\">\n    </div>\n\n\n<div class=\"slideMenu cfont-12 ie-ele-none\" id=\"defaultSliderMenu\" 
style=\"max-height: 599px; overflow: auto;\">\n    <ul>\n        <!--\n        <li class=\"ele-pad-t-b-30\"></li>\n        -->\n        <li>\n            <a class=\"icon-circle-down\" href=\"javascript:void(0)\" onclick=\"showSubMenu(this,'.menuSearchType')\">Business search </a>\n            <ul class=\"hide menuSearchType\">\n                <li><a href=\"../../dynamic/categories.aspx\">Search by category</a></li>\n                <li><a href=\"../../dynamic/city_select.aspx\">Search by city</a></li>\n                <li><a href=\"../../searchbyphone.aspx\">Search by phone</a></li>\n                <li><a href=\"../../searchbyaddress.aspx\">Search by address</a></li>\n                <li><a href=\"../../searchbybrand.aspx\">Search by brand</a></li>\n            </ul>\n        </li>\n        <li>\n            <a class=\"icon-circle-down\" href=\"javascript:void(0)\" onclick=\"showSubMenu(this,'.menuSearchFap')\">People search</a>\n            <ul class=\"hide menuSearchFap\">\n                <li><a href=\"../../findaperson/findaperson.aspx?type=name\">Search by name</a></li>\n                <li><a href=\"../../findaperson/findaperson.aspx?type=number\">Search by number</a></li>\n            </ul>\n        </li>\n        <li>\n            <a class=\"icon-circle-down\" href=\"javascript:void(0)\" onclick=\"showSubMenu(this,'.menuGuides')\">Specialized Guides</a>\n            <ul class=\"hide menuGuides\">\n                <li><a href=\"../../dynamic/search.aspx?searchtype=cat&amp;class_id=4710\">Development Sector &amp; NGOs</a></li>\n                <li><a href=\"../../dynamic/search.aspx?searchtype=cat&amp;class_id=863\">Associations &amp; Trade Bodies</a></li>\n                <li><a href=\"../../dynamic/search.aspx?searchtype=cat&amp;class_id=864\">Chambers of Commerce</a></li>\n                <li><a href=\"../../dynamic/search.aspx?SearchType=cat&amp;class_id=1514\">Embassies &amp; Foreign Missions</a></li>\n                <li><a 
href=\"../../dynamic/categories.aspx?class_Id=65\">Import &amp; Export</a></li>\n                <li><a href=\"../../dynamic/search.aspx?SearchType=cat&amp;class_id=1517\">Federal Government</a></li>\n                <li><a href=\"../../dynamic/categories.aspx?class_id=4638\">Emergency &amp; Complain</a></li>\n                <li><a href=\"../../static/nwdcode.aspx\">NWD Codes</a></li>\n            </ul>\n        </li>\n        <li><a href=\"/advertisement-center/\">Advertise with us</a></li>\n        <li><a href=\"javascript:void(0)\">Help</a></li>\n    </ul>\n</div>\n<div class=\"modal in\" id=\"cSlideMenu\" onclick=\"SlideMenu2()\" aria-hidden=\"false\" style=\"display:none; padding-right: 17px;\">\n</div>\n\n  \n \n<script type=\"text/javascript\" src=\"../css_responsive/script/global_functions.js\"></script>\n<script type=\"text/javascript\" src=\"../styles/scripts/fancybox/jquery.fancybox.js?v=2.1.5\"></script>\n<script type=\"text/javascript\" src=\"../css_responsive/bootstrap-3.3.4-dist/js/bootstrap.js\"></script>\n\n\n</body></html>"

from urllib.parse import urljoin

# Matches href="...", src="..." and action="..." attributes in the raw markup.
# The lookbehind keeps look-alike attributes such as data-href or data-src
# from being rewritten.  Working on the text (rather than re-serialising a
# parse tree) leaves everything except the link values byte-for-byte intact.
_URL_ATTRIBUTE = re.compile(
    r'(?<![\w-])(href|src|action)="([^"]*)"', re.IGNORECASE
)

# Absolute URL of the page the markup came from (`path` is the parsed target
# URL built earlier in this script); relative links are resolved against it.
base_url = path.geturl()


def _absolutize(match):
    """Return the matched attribute with its link made absolute.

    Used as the replacement callback for ``_URL_ATTRIBUTE.sub``.  Group 1 is
    the attribute name, group 2 the link value.

    The match is returned unchanged when the link is empty, is a fragment
    ("#..."), is protocol-relative ("//host/..."), or already carries a
    scheme of its own (http:, https:, javascript:, tel:, ...).  Every other
    link ("../x", "../../x", "/x", "x") is resolved against ``base_url``
    with ``urljoin``.
    """
    attribute, link = match.group(1), match.group(2)
    if not link or link.startswith(("#", "//")):
        return match.group(0)
    if urlparse(link).scheme:
        return match.group(0)
    return attribute + '="' + urljoin(base_url, link) + '"'


# One pass over the markup handles every attribute and every kind of
# relative link.
html = _URL_ATTRIBUTE.sub(_absolutize, html)

print (html)

答案 1 :(得分:0)

我找到了一个使用 re.sub 的更简单的解决方案,因为它接受一个函数作为替换参数。

import re
from urllib.parse import urljoin

# URL of the page the markup was taken from; relative links resolve against it.
abs_url = "https://sample.com/sample-page.html"
my_html = """
    <div class="sample-class">
        <a href="../new-page.html">New page</a>
        <img src="../sample-image.jpg" alt="">
        <img src="../sample-image2.jpg" alt="">
    </div>"""


def _make_absolutizer(attr):
    """Build a ``re.sub`` callback that rewrites one ``attr="..."`` match.

    The returned callback resolves the captured link (group 1 of the match)
    against ``abs_url`` with ``urljoin`` and returns the attribute, preceded
    by a single space, carrying the resulting absolute URL.
    """
    def replace(m):
        return ' ' + attr + '="' + urljoin(abs_url, m.group(1)) + '"'
    return replace


# "src"
absolutize = _make_absolutizer("src")
my_html = re.sub(r' src="([^"]+)"', absolutize, my_html)
# "href"
absolutize2 = _make_absolutizer("href")
my_html = re.sub(r' href="([^"]+)"', absolutize2, my_html)

# my_html is now:
#
#     <div class="sample-class">
#         <a href="https://sample.com/new-page.html">New page</a>
#         <img src="https://sample.com/sample-image.jpg" alt="">
#         <img src="https://sample.com/sample-image2.jpg" alt="">
#     </div>

尚未使用多个深度相对URL(例如src="../../hello.jpg")进行测试,但它应该可以正常工作。