我有一个包含表格的网站,想把其中的表格数据提取到csv文件中。我会定期使用wget下载该网站的html文件。
现在,我想用perl / shell脚本从这个已下载的html文件中提取表格数据。
以下代码是直接从网络获取页面数据并提取表格,但这不是我想要的。我想从(我手动下载的)html文件中提取表格。要提取的表:下面html中的 JobHistory 表。谁能帮帮我?提前致谢。
#!/usr/bin/perl
use warnings;
use strict;
use LWP::Simple;
use HTML::TableExtract;
# Fetch the page over HTTP and print the first two columns of every table
# whose header row contains the columns 'a' and 'b'.
my $url  = 'http://xxx';
my $html = get($url);

# LWP::Simple::get returns undef on any failure; stop here with a clear
# message instead of handing undef to the parser.
defined $html
    or die "Could not fetch $url\n";

# Direct class-method call; the indirect-object form
# ("new HTML::TableExtract ...") is ambiguous to the Perl parser.
my $te = HTML::TableExtract->new( headers => [ 'a', 'b' ] );
$te->parse($html);

for my $ts ( $te->table_states ) {
    for my $row ( $ts->rows ) {
        my ( $label, $value ) = @{$row};

        # Skip garbage rows. An empty cell may come back as undef, so test
        # definedness first to avoid an "uninitialized value" warning.
        next unless defined $label && $label =~ /\w/;

        # A missing second cell is printed as 0.00, as before, but without
        # triggering a warning.
        printf "%-20s ==> %.2f\n", $label, defined $value ? $value : 0;
    }
}
我的html页面:
<html>
<meta http-equiv="X-UA-Compatible" content="IE=8">
<meta http-equiv="Content-type" content="text/html; charset=UTF-8">
<style type="text/css">
#jobs_paginate span {font-weight:normal}
#jobs .progress {width:8em}
#jobs_processing {top:-1.5em; font-size:1em;
color:#000; background:rgba(255, 255, 255, 0.8)}
</style>
<title>
JobHistory
</title>
<link rel="stylesheet" href="/static/yarn.css">
<style type="text/css">
#layout { height: 100%; }
#layout thead td { height: 3em; }
#layout #navcell { width: 11em; padding: 0 1em; }
#layout td.content { padding-top: 0 }
#layout tbody { vertical-align: top; }
#layout tfoot td { height: 4em; }
</style>
<link rel="stylesheet" href="/static/jquery/themes-1.9.1/base/jquery-ui.css">
<link rel="stylesheet" href="/static/dt-1.9.4/css/jui-dt.css">
<script type="text/javascript" src="/static/jquery/jquery-1.8.2.min.js">
</script>
<script type="text/javascript" src="/static/jquery/jquery-ui-1.9.1.custom.min.js">
</script>
<script type="text/javascript" src="/static/dt-1.9.4/js/jquery.dataTables.min.js">
</script>
<script type="text/javascript" src="/static/yarn.dt.plugins.js">
</script>
<style type="text/css">
#jsnotice { padding: 0.2em; text-align: center; }
.ui-progressbar { height: 1em; min-width: 5em }
</style>
<script type="text/javascript">
// DOM-ready handler: turns #nav into an accordion, builds the #jobs DataTable
// and wires the inputs in the table footer as per-column filters.
$(function() {
$('#nav').accordion({autoHeight:false, active:0});
// The table rows are NOT in the HTML <tbody>; they are supplied through
// 'aaData' from the global jobsTableData array. Table state is saved to and
// loaded from sessionStorage under the table id, and the initial sort is
// column index 2, descending.
// NOTE(review): fnSetFilteringDelay is not defined in this page — presumably
// it comes from one of the included plugin scripts; confirm what 188 means.
jobsDataTable = $('#jobs').dataTable({bStateSave : true, "fnStateSave": function (oSettings, oData) { sessionStorage.setItem( oSettings.sTableId, JSON.stringify(oData) ); }, "fnStateLoad": function (oSettings) { return JSON.parse( sessionStorage.getItem(oSettings.sTableId) );}, bJQueryUI:true, sPaginationType: 'full_numbers', iDisplayLength:20, aLengthMenu:[20, 40, 60, 80, 100], 'aaData': jobsTableData, bDeferRender: true, bProcessing: true, aaSorting: [[2, 'desc']], aoColumnDefs:[{'sType':'numeric', 'bSearchable': false, 'aTargets': [ 7, 8, 9, 10 ] }]}).fnSetFilteringDelay(188);
// Initial (placeholder) value of each footer input, indexed by its position.
var asInitVals = new Array();
// On every key release, filter the table by the typed text; the column index
// passed to fnFilter is the input's position among all 'tfoot input' elements.
$('tfoot input').keyup( function ()
{ jobsDataTable.fnFilter( this.value, $('tfoot input').index(this) );
} );
// Remember each input's starting value so it can be restored on blur.
$('tfoot input').each( function (i) {
asInitVals[i] = this.value;
} );
// On focus, clear the placeholder text if the input still has the
// 'search_init' class.
$('tfoot input').focus( function () {
if ( this.className == 'search_init' )
{
this.className = '';
this.value = '';
}
} );
// On blur, an input left empty gets its 'search_init' class and remembered
// initial value back.
$('tfoot input').blur( function (i) {
if ( this.value == '' )
{
this.className = 'search_init';
this.value = asInitVals[$('tfoot input').index(this)];
}
} );
});
</script>
<div id="jsnotice" class="ui-state-error">
This page works best with javascript enabled.
</div>
<script type="text/javascript">
$('#jsnotice').hide();
</script>
<table id="layout" class="ui-widget-content">
<thead>
<tr>
<td colspan="2">
<div id="header" class="ui-widget">
<div id="user">
Logged in as: dr.who
</div>
<div id="logo">
<img src="/static/hadoop-st.png">
</div>
<h1>
JobHistory
</h1>
</div>
</td>
</tr>
</thead>
<tfoot>
<tr>
<td colspan="2">
<div id="footer" class="ui-widget">
</div>
</td>
</tr>
</tfoot>
<tbody>
<tr>
<td id="navcell">
<div id="nav">
<h3>
Application
</h3>
<ul>
<li>
<a href="/jobhistory/about">About</a>
<li>
<a href="/jobhistory/app">Jobs</a>
</ul>
<h3>
Tools
</h3>
<ul>
<li>
<a href="/conf">Configuration</a>
<li>
<a href="/logs">Local logs</a>
<li>
<a href="/stacks">Server stacks</a>
<li>
<a href="/metrics">Server metrics</a>
</ul>
</div>
</td>
<td class="content">
<h2>
Retired Jobs
</h2>
<table id="jobs">
<thead>
<tr>
<th>
Submit Time
</th>
<th>
Start Time
</th>
<th>
Finish Time
</th>
<th class="id">
Job ID
</th>
<th class="name">
Name
</th>
<th>
User
</th>
<th>
Queue
</th>
<th class="state">
State
</th>
<th>
Maps Total
</th>
<th>
Maps Completed
</th>
<th>
Reduces Total
</th>
<th>
Reduces Completed
</th>
</tr>
</thead>
<script type="text/javascript">
var jobsTableData=[
["2015.09.06 11:49:36 GMT","2015.09.06 11:49:42 GMT","2015.09.06 11:53:05 GMT","<a href=xyz</a>","PigLatin:ctx_decode2.pig","hdfswrite","ingest","SUCCEEDED","10","10","39","39"],
["2015.09.06 11:49:59 GMT","2015.09.06 11:50:02 GMT","2015.09.06 11:50:19 GMT","<a href=abc</a>","PigLatin:ctx_decode2.pig","hdfswrite","ingest","SUCCEEDED","1","1","1","1"],
["2015.09.06 11:50:27 GMT","2015.09.06 11:50:34 GMT","2015.09.06 11:51:48 GMT","<a href=wer</a>","PigLatin:ctx_decode2.pig","hdfswrite","ingest","SUCCEEDED","1","1","2","2"],
["2015.09.06 11:50:48 GMT","2015.09.06 11:50:52 GMT","2015.09.06 11:51:10 GMT","<a href=xyz</a>","INSERT INTO TABLE idl...st_summary2_run_data(Stage","hdfswrite","ingest","SUCCEEDED","53","53","100","100"],
["2015.09.06 11:50:58 GMT","2015.09.06 11:51:02 GMT","2015.09.06 11:51:14 GMT","<a href=uig</a>","insert overwrite directory \'\/prod\/mt...layer(Stage","hdfswrite","default","SUCCEEDED","1","1","1","1"],
["2015.09.10 02:58:53 GMT","2015.09.10 02:59:01 GMT","2015.09.10 02:59:20 GMT","<a href=man</a>","Lot Operation Mapreduce","hdfswrite","ingest","SUCCEEDED","66","66","1","1"]
]
</script>
<tbody>
</tbody>
<tfoot>
<tr>
<th>
<input class="search_init" type="text" name="submit_time" value="Submit Time">
</th>
<th>
<input class="search_init" type="text" name="start_time" value="Start Time">
</th>
<th>
<input class="search_init" type="text" name="finish_time" value="Finish Time">
</th>
<th>
<input class="search_init" type="text" name="start_time" value="Job ID">
</th>
<th>
<input class="search_init" type="text" name="start_time" value="Name">
</th>
<th>
<input class="search_init" type="text" name="start_time" value="User">
</th>
<th>
<input class="search_init" type="text" name="start_time" value="Queue">
</th>
<th>
<input class="search_init" type="text" name="start_time" value="State">
</th>
<th>
<input class="search_init" type="text" name="start_time" value="Maps Total">
</th>
<th>
<input class="search_init" type="text" name="start_time" value="Maps Completed">
</th>
<th>
<input class="search_init" type="text" name="start_time" value="Reduces Total">
</th>
<th>
<input class="search_init" type="text" name="start_time" value="Reduces Completed">
</th>
</tr>
</tfoot>
</table>
</td>
</tr>
</tbody>