我正在创建一个Kimono API,可以在www.builtwith.com(一个列出用于创建网站的软件的网站)上抓取多个页面。
我刚开始接触Kimono,所以如果这是一个愚蠢的问题,还请见谅。
因此,对于我在builtwith.com上抓取的每个页面,我想获得一个具有一对多关系的嵌套列表。例如,在浏览www.builtwith.com/oracle.com和www.builtwith.com/google.com时,我希望获得类似{{name:Oracle,{webserver:a,b,c},{javascript libraries:x,y,z},...},{name:Google,....}}。
似乎Kimono不支持这种关系,而是把这些数据拆分成多个集合 - 请参阅下面我的API返回的JSON。有没有什么方法可以让Kimono把这些数据格式化为嵌套列表?或者我应该换用其他的网页抓取方式?我可以使用BeautifulSoup,但我觉得那样可能会更难。
感谢您的帮助。
{
"name": "Builtwith3",
"count": 60,
"frequency": "Manual Crawl",
"version": 2,
"newdata": true,
"lastrunstatus": "success",
"lastsuccess": "Fri Jan 02 2015 05:02:31 GMT+0000 (UTC)",
"thisversionstatus": "success",
"thisversionrun": "Fri Jan 02 2015 05:02:08 GMT+0000 (UTC)",
"results": {
"collection1": [
{
"title": "ORACLE.COM"
},
{
"title": "GOOGLE.COM"
}
],
"collection2": [
{
"curate": {
"href": "http://trends.builtwith.com/Web-Server/Oracle-Application-Server",
"text": "Oracle Application Server"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/ns/UltraDNS-neustar",
"text": "UltraDNS neustar"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/mx/SPF",
"text": "SPF"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/ssl/Symantec-VeriSign",
"text": "Symantec VeriSign"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/hosting/Akamai-Hosted",
"text": "Akamai Hosted"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/cms/SiteStudio",
"text": "SiteStudio"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/framework/J2EE",
"text": "J2EE"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/framework/Oracle-Dynamic-Monitoring-Service",
"text": "Oracle Dynamic Monitoring Service"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/analytics/Omniture-SiteCatalyst",
"text": "Omniture SiteCatalyst"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/analytics/Eloqua",
"text": "Eloqua"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/javascript/jQuery",
"text": "jQuery"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/javascript/Modernizr",
"text": "Modernizr"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/media/Brightcove",
"text": "Brightcove"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/mobile/Viewport-Meta",
"text": "Viewport Meta"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/cdn/Akamai",
"text": "Akamai"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/docinfo/HTML5-DocType",
"text": "HTML5 DocType"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/docinfo/Javascript",
"text": "Javascript"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/docinfo/Cascading-Style-Sheets",
"text": "Cascading Style Sheets"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/docinfo/Meta-Description",
"text": "Meta Description"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/docinfo/Meta-Keywords",
"text": "Meta Keywords"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/docinfo/Meta-Robot",
"text": "Meta Robot"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/docinfo/X-Frame-Options",
"text": "X-Frame-Options"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/encoding/UTF-8",
"text": "UTF-8"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/css/Min-Width",
"text": "Min Width"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/css/Max-Width",
"text": "Max Width"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/cdns/Akamai-Edge",
"text": "Akamai Edge"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/Web-Server/Apache",
"text": "Apache"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/ns/Google-DNS",
"text": "Google DNS"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/ssl/Google-SSL",
"text": "Google SSL"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/mx/DMARC",
"text": "DMARC"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/mx/SPF",
"text": "SPF"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/mx/Google-Apps-for-Business",
"text": "Google Apps for Business"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/hosting/Google",
"text": "Google"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/cms/dotCMS",
"text": "dotCMS"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/cms/WebBaker",
"text": "WebBaker"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/framework/J2EE",
"text": "J2EE"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/framework/PHP",
"text": "PHP"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/framework/Adobe-ColdFusion",
"text": "Adobe ColdFusion"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/framework/Perl",
"text": "Perl"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/analytics/Fireblade",
"text": "Fireblade"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/javascript/Google-API",
"text": "Google API"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/cdn/GStatic-Google-Static-Content",
"text": "GStatic Google Static Content"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/mapping/Google-Maps",
"text": "Google Maps"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/widgets/Google-Plus-One-Platform",
"text": "Google Plus One Platform"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/docinfo/X-Frame-Options",
"text": "X-Frame-Options"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/docinfo/P3P-Policy",
"text": "P3P Policy"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/docinfo/Javascript",
"text": "Javascript"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/docinfo/X-XSS-Protection",
"text": "X-XSS-Protection"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/docinfo/HTML5-DocType",
"text": "HTML5 DocType"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/docinfo/Canonical-Content-Tag",
"text": "Canonical Content Tag"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/docinfo/WAI-ARIA",
"text": "WAI-ARIA"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/encoding/UTF-8",
"text": "UTF-8"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/Server/Alternate-Protocol",
"text": "Alternate Protocol"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/Server/QUIC",
"text": "QUIC"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/Server/IPv6",
"text": "IPv6"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/css/Device-Pixel-Ratio",
"text": "Device Pixel Ratio"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/css/Min-Width",
"text": "Min Width"
}
},
{
"curate": {
"href": "http://trends.builtwith.com/css/Resolution",
"text": "Resolution"
}
}
]
}
}