Delphi:关于解析这个 HTML 表格,有什么建议吗?

时间:2014-07-19 08:13:33

标签: html delphi html-parsing

一段时间以来,我一直试图从这个 HTML 表格中获取数据,付费和免费的组件都试过了。我也尝试自己编写代码,同样没有结果。我有一个类可以把 HTML 表格直接导入 ClientDataSet,但对这个表格不起作用。有人能给些提示,告诉我如何获取这个 HTML 表格中的数据吗?或者有没有办法把它转换成 txt / xls / csv 或 xml?下面是获取该表格的代码:

  // Load the rainfall page into the embedded browser control.
  // NOTE(review): Navigate presumably returns before the page has finished
  // loading, so the Document accesses below may run too early - confirm, and
  // if so wait for the document-complete event before touching the DOM.
  WebBrowser1.Navigate('http://site2.aesa.pb.gov.br/aesa/monitoramentoPluviometria.do?metodo=listarMesesChuvasMensais');
  // Set the first two <select> elements on the page to '2013' and '7'
  // (presumably year and month - verify against the page).
  WebBrowser1.OleObject.Document.All.Tags('select').Item(0).Value:= '2013';
  WebBrowser1.OleObject.Document.All.Tags('select').Item(1).Value:= '7';
  // Click the second <input> element (index 1).
  // NOTE(review): presumably the submit button; its result is probably not
  // rendered yet when the next statement runs - confirm.
  WebBrowser1.OleObject.Document.All.Tags('input').Item(1).click;
  // Copy the inner HTML of the <table> at index 10 into the memo, then save it.
  Memo1.Text:= WebBrowser1.OleObject.Document.All.Tags('table').Item(10).InnerHTML;
  Memo1.Lines.SaveToFile('table.html');

2 个答案:

答案 0 :(得分:4)

以下代码将从目标页面上的 HTML 表格中提取数据,并将其加载到 ClientDataSet 中。

它相当冗长,也许正如 David 所说,Delphi 可能不是完成这项工作的最佳工具。

在我的 Form1 上有一个 TEdit(edValue),用于输入 HTML 表格第一行数据中的某个值。我用它作为在 HTML 文档中定位该表格的一种方法。我敢说还有更好的办法,但至少我的方法应该比对表格所在文档的布局做硬编码假设更健壮——那样的假设可能经不起页面作者对页面的修改。

大致来说,代码的工作原理是:首先找到内容以 edValue.Text 开头的 HTML 表格单元格,然后找到该单元格所属的表格,最后根据该表格创建 CDS 的字段并填充数据。

默认情况下,CDS 字段的长度设置为 255 个字符;也许网页上发布的数据有相应的规范,可以让你对部分(甚至全部)字段使用更小的长度。所有字段都被当作 ftString 类型处理,以免代码在遇到意外的单元格内容时出错。

顺便说一句,底部有一个实用函数,用于把 HTML 页面保存到本地,省去每次都要选择年份和月份并单击按钮的麻烦。要从保存的文件重新加载到 WebBrowser,只需把文件名作为要加载的 URL 即可。

TForm1 = class(TForm)
[ ... ]
public
  { Public declarations }
  // Document interface of the page shown in WebBrowser1. It is assigned in
  // btnFindValueClick and read by FindTableByCellValue and SaveFileLocally.
  Doc : IHtmlDocument2;

{ Click handler: captures the browser's current document in Doc, locates the
  HTML table that contains a cell whose text starts with edValue.Text, and
  loads that table into CDS.

  Raises Exception when the browser has no HTML document or when no table
  contains a matching cell. }
procedure TForm1.btnFindValueClick(Sender: TObject);
var
  Table : IHTMLTable;
begin
  // Supports returns False (and leaves Doc Nil) when no document is available,
  // so the failure is reported here instead of as an access violation later.
  if not Supports(WebBrowser1.Document, IHTMLDocument2, Doc) then
    raise Exception.Create('No HTML document is loaded in the browser');

  Table := FindTableByCellValue(edValue.Text);
  // "Not found" depends on what the user typed, so it is checked explicitly:
  // an Assert can be compiled out, which would pass Nil on to the loader.
  if Table = Nil then
    raise Exception.CreateFmt('No HTML table contains a cell starting with "%s"',
      [edValue.Text]);

  LoadCDSFromHTMLTable(CDS, Table);
end;

{ Builds the structure of DestCDS from an HTML table and copies the table's
  data into it.

  DestCDS - must be inactive and have no fields yet (asserted).
  Table   - the HTML table to read; its first row supplies the field names,
            every following row becomes one record.

  Field layout: Fields[0] is an auto-increment 'ID' field, followed by one
  255-character string field per cell of the first row. A blank header cell
  gets the name 'ColumnN'; a name that is already taken gets '_' appended
  until it is unique.

  Rows with fewer cells than the header leave the remaining fields empty;
  surplus cells are ignored.

  Raises Exception when Table is Nil or has no rows. }
procedure TForm1.LoadCDSFromHTMLTable(DestCDS : TClientDataSet; Table : IHTMLTable);
var
  I,
  J,
  RowCount,
  ColCount,
  CellCount : Integer;
  vTable,
  vRow : OleVariant;
  FieldName,
  FieldValue : String;
  Field : TField;
const
  cMaxFieldSize = 255;
  scIDFieldName = 'ID';
begin
  if Table = Nil then
    raise Exception.Create('LoadCDSFromHTMLTable: Table is Nil');

  //  Use OleVariant instead of IHTMLTable because it's less fiddly for doing what follows
  vTable := Table;
  Assert(not DestCDS.Active and (DestCDS.FieldCount = 0));

  RowCount := vTable.Rows.Length;
  if RowCount = 0 then
    raise Exception.Create('LoadCDSFromHTMLTable: the HTML table has no rows');
  //  The header row defines how many columns the dataset gets
  ColCount := vTable.Rows.Item(0).Cells.Length;

  //  First create an AutoInc field
  Field := TAutoIncField.Create(Self);
  Field.FieldName := scIDFieldName;
  Field.DataSet := DestCDS;

  // Next create CDS fields from the names in the cells in the first row of the table
  for I := 0 to ColCount - 1 do begin
    FieldName := vTable.Rows.Item(0).Cells.Item(I).InnerText;
    // A field needs a non-empty name that is unique within the dataset
    if Trim(FieldName) = '' then
      FieldName := Format('Column%d', [I + 1]);
    while DestCDS.FindField(FieldName) <> Nil do
      FieldName := FieldName + '_';

    Field := TStringField.Create(Self);
    Field.FieldName := FieldName;
    Field.Size := cMaxFieldSize;
    Field.DataSet := DestCDS;
  end;

  DestCDS.DisableControls;
  try
    DestCDS.IndexFieldNames := scIDFieldName;
    DestCDS.CreateDataSet;

    //  Next load the HTML table data into the CDS
    for I := 1 to RowCount - 1 do begin
      vRow := vTable.Rows.Item(I);
      // Never read past the cells this row actually has, nor past the header width
      CellCount := vRow.Cells.Length;
      if CellCount > ColCount then
        CellCount := ColCount;

      DestCDS.Insert;
      try
        for J := 0 to CellCount - 1 do begin
          FieldValue := vRow.Cells.Item(J).InnerText;
          // the J + 1 is because Fields[0] is the autoinc one
          DestCDS.Fields[J + 1].AsString := FieldValue;
        end;
        DestCDS.Post;
      except
        // Do not leave the dataset stuck in insert state when a row fails
        DestCDS.Cancel;
        raise;
      end;
    end;
    DestCDS.First;
  finally
    DestCDS.EnableControls;
  end;
end;

{ Walks every element of Doc in collection order and returns the first table
  cell whose trimmed inner text begins with the trimmed AValue.
  Returns Nil when Doc.All is Nil or when no cell matches. }
function TForm1.FindTableCellByTagValue(Doc : IHtmlDocument2; const AValue : String) : IHTMLTableCell;
var
  Elements : IHTMLElementCollection;
  Wanted,
  CellText : String;
  Idx,
  Total : Integer;
  Entry : OleVariant;
  Element : IHTMLElement;
  Cell : IHTMLTableCell;
begin
  Result := Nil;
  Elements := Doc.All;
  if Elements <> Nil then begin
    Wanted := Trim(AValue);
    Total := Elements.Length;
    Idx := 0;
    // Stop at the first hit; elements that are not table cells are skipped
    while (Result = Nil) and (Idx < Total) do begin
      Entry := Elements.Item(Idx, varEmpty);
      Element := IDispatch(Entry) as IHTMLElement;
      if Supports(Element, IHTMLTableCell, Cell) then begin
        CellText := Trim(Element.Get_InnerText);
        // Pos = 1 means the cell text starts with the wanted value
        if Pos(Wanted, CellText) = 1 then
          Result := Cell;
      end;
      Inc(Idx);
    end;
  end;
end;

{ Returns the HTML table that contains the first cell whose text starts with
  Value, searching the document held in the Doc field.
  Returns Nil when Doc is Nil, when no cell matches, or when the matching
  cell has no table among its ancestors. }
function TForm1.FindTableByCellValue(Value : String): IHTMLTable;
var
  Node : IHtmlElement;
  iTable : IHTMLTable;
  iCell : IHTMLTableCell;
begin
  Result := Nil;
  if Doc = Nil then
    Exit;

  // Search for the Value argument rather than reading the edit control here,
  // so the function works for whatever value the caller passes in.
  iCell := FindTableCellByTagValue(Doc, Value);
  if iCell = Nil then
    Exit;
  Node := IDispatch(iCell) as IHtmlElement;

  //  if we found a Node with the cell text we were looking for,
  //  we can now find the HTML table to which it belongs

  // Climb the parent chain; Supports returns False for a Nil parent, so the
  // loop ends cleanly at the top of the document.
  while Node <> Nil do begin
    Node := Node.parentElement;
    if Supports(Node, IHTMLTable, iTable) then begin
      Result := iTable;
      Break;
    end;
  end;
end;

{ Saves the document held in the Doc field to FileName through IPersistFile.

  FileName - path of the file to write.

  Raises Exception when no document has been captured in Doc or when the
  Save call reports a failure HRESULT. }
procedure TForm1.SaveFileLocally(const FileName : String);
var
  PFile: IPersistFile;  // declared in ActiveX unit
  WideName: WideString;
begin
  if Doc = Nil then
    raise Exception.Create('SaveFileLocally: no HTML document is loaded');

  PFile := Doc as IPersistFile;
  // A WideString local owns the wide-character buffer and releases it
  // automatically; StringToOleStr would allocate a BSTR that the caller has
  // to free explicitly.
  WideName := FileName;
  if Failed(PFile.Save(PWideChar(WideName), False)) then
    raise Exception.CreateFmt('SaveFileLocally: could not save "%s"', [FileName]);
end;

答案 1 :(得分:1)

经过一段时间的研究,我终于成功从 HTML 表格中提取出了数据。简单来说,我可以不做“解析”而直接从 HTML 表格中提取数据:目标是标签 'table' 的第 11 项(item 11);第 10 项(item 10)包含相同的数据,但都挤在一个单元格里。我的做法是:先把 HTML 表格中的每个元素取出来填充到 StringGrid 中,后来又找到了通过 ClientDataSet 直接填充 DBGrid 的方法。我把代码(单元)作为示例贴出来,以备有人需要。感谢所有在评论中帮助过我的人。经过更多研究,我们发现执行这一过程的最佳方式是使用 MSHTML。

unit Unit1;

interface

uses
  Winapi.Windows, Winapi.Messages, System.SysUtils, System.Variants, System.Classes, Vcl.Graphics,
  Vcl.Controls, Vcl.Forms, Vcl.Dialogs, Vcl.OleCtrls, SHDocVw, Vcl.StdCtrls,
  Vcl.Grids, Vcl.DBGrids, Data.DB, Datasnap.DBClient;

type
  // Main form of the example: shows an HTML page in an embedded browser and
  // copies one of its tables into a string grid (Button1) or a client dataset
  // (Button2).
  TForm1 = class(TForm)
    // Browser control; FormCreate navigates it to the saved HTML page.
    WebBrowser1: TWebBrowser;
    // NOTE(review): presumably bound to ClientDataSet1 through DataSource1 in
    // the form file - confirm in the .dfm.
    DBGrid1: TDBGrid;
    // Filled by Button1Click with the inner text of the table cells.
    StringGrid1: TStringGrid;
    Button1: TButton;
    Button2: TButton;
    // Filled by Button2Click, one record per table row after the first.
    ClientDataSet1: TClientDataSet;
    DataSource1: TDataSource;
    // Persistent field components of ClientDataSet1.
    // NOTE(review): their FieldName values live in the form file; Button2Click
    // looks fields up by 'Município/Posto', 'Total Mensal (mm)',
    // 'Climatologia Mensal (mm)', 'Desvio (mm)' and 'Desvio (%)' - confirm
    // that the names match.
    ClientDataSet1MunicípioPosto: TStringField;
    ClientDataSet1TotalMensalmm: TStringField;
    ClientDataSet1ClimatologiaMensalmm: TStringField;
    ClientDataSet1Desviomm: TStringField;
    ClientDataSet1Desvio: TStringField;
    ClientDataSet1id: TAutoIncField;
    procedure FormCreate(Sender: TObject);
    procedure Button1Click(Sender: TObject);
    procedure Button2Click(Sender: TObject);
  private
    { Private declarations }
  public
    { Public declarations }
  end;

var
  Form1: TForm1;

implementation

{$R *.dfm}

{ Reports the row and column counts of the <table> at index 11 of the loaded
  page, sizes StringGrid1 to hold it plus one extra leading row and column,
  and copies the inner text of every cell into the grid, shifted by one row
  and one column. }
procedure TForm1.Button1Click(Sender: TObject);
var
  RowIdx, ColIdx: Integer;
  HtmlRows, RowCells: OleVariant;
begin
  HtmlRows := WebBrowser1.OleObject.Document.all.tags('table').item(11).Rows;

  ShowMessage('Number of Rows: ' + IntToStr(HtmlRows.Length));
  ShowMessage('Number of Cols: ' + IntToStr(HtmlRows.Item(0).Cells.Length));

  // Grid row 0 and column 0 are left free, hence the + 1
  StringGrid1.RowCount := HtmlRows.Length + 1;
  StringGrid1.ColCount := HtmlRows.Item(0).Cells.Length + 1;

  for RowIdx := 0 to HtmlRows.Length - 1 do
  begin
    RowCells := HtmlRows.Item(RowIdx).Cells;
    for ColIdx := 0 to RowCells.Length - 1 do
      StringGrid1.Cells[ColIdx + 1, RowIdx + 1] := RowCells.Item(ColIdx).InnerText;
  end;
end;

{ Copies the <table> at index 11 of the loaded page into ClientDataSet1.

  The first table row is skipped (treated as the header); every following row
  becomes one record. Cell N of a row is written to the N-th name in
  cFieldNames. Rows with fewer than five cells fill only the fields they have
  cells for; cells beyond the fifth are ignored.

  Does nothing when the table has no rows after the first one. The dataset is
  opened and indexed once, and positioned on the first record at the end. }
procedure TForm1.Button2Click(Sender: TObject);
const
  // Target field for each table column, in column order
  cFieldNames: array[0..4] of string = (
    'Município/Posto',
    'Total Mensal (mm)',
    'Climatologia Mensal (mm)',
    'Desvio (mm)',
    'Desvio (%)');
var
  iRow : Integer;
  iCol : Integer;
  RowCount : Integer;
  CellCount : Integer;
  ovTable: OleVariant;
  ovCells: OleVariant;
begin
  ovTable := WebBrowser1.OleObject.Document.all.tags('table').item(11);
  RowCount := ovTable.Rows.Length;
  if RowCount < 2 then
    Exit;

  // Avoid repainting bound controls for every inserted record
  ClientDataSet1.DisableControls;
  try
    ClientDataSet1.Open;
    ClientDataSet1.IndexFieldNames := 'id';

    for iRow := 1 to RowCount - 1 do
    begin
      ovCells := ovTable.Rows.Item(iRow).Cells;
      // Never read past the cells this row actually has
      CellCount := ovCells.Length;
      if CellCount > Length(cFieldNames) then
        CellCount := Length(cFieldNames);

      ClientDataSet1.Insert;
      try
        for iCol := 0 to CellCount - 1 do
          ClientDataSet1.FieldByName(cFieldNames[iCol]).AsString :=
            ovCells.Item(iCol).InnerText;
        ClientDataSet1.Post;
      except
        // Do not leave the dataset stuck in insert state when a row fails
        ClientDataSet1.Cancel;
        raise;
      end;
    end;

    ClientDataSet1.First;
  finally
    ClientDataSet1.EnableControls;
  end;
end;

{ Points the embedded browser at the locally saved HTML page as soon as the
  form is created. }
procedure TForm1.FormCreate(Sender: TObject);
const
  cLocalPage = 'C:\htmlwiththetable.html';
begin
  WebBrowser1.Navigate(cLocalPage);
end;

end.  // a unit must be closed with "end." (with the period)