抓取特定网站的数据

时间:2014-05-01 22:22:19

标签: javascript python html web-scraping

我正在开发一个Python项目,其中包括尝试从网站oddsportal.com获取历史体育赔率/结果,例如在确切的网址http://www.oddsportal.com/soccer/england/premier-league/results/

问题在于实际的赔率并没有直接嵌入HTML中,而是隐藏在大量的JavaScript背后。我知道一种可行的方法是使用无头浏览器驱动(headless web driver),由它解释JavaScript并以此提取数据,但该网站会加载大量无关的内容,使得这种方法效率不高,所以如果有人能帮忙逆向分析这些数值的来源,我将不胜感激。

一些信息:

上面的HTML从rb.oddsportal.com和fb.oddsportal.com加载了一个非常大的globals-....js文件,以及其他js和css文件(这些文件似乎不太相关)。此外,页面还在2222端口与weblog.livesport.eu进行了少量通信,看起来很可疑。

大致的思路似乎是:HTML为表格中的每场比赛包含一个"xeid"值,并为每个参赛方添加"xoid"和"xodd"值,这些值再由JavaScript以某种方式转换为明文数值。

感谢任何帮助

1 个答案:

答案 0 :(得分:1)

我之前用过这个网站,下面是我的Perl代码(如你所见,所有的魔法都在 "http://www.oddsportal.com/feed/postmatch/1-1-" . $match->{id} . "-1321390800-1-2.dat" 这个文件里):

#!/usr/bin/perl

use Modern::Perl;
use HTML::TreeBuilder::XPath;
use WWW::Mechanize;
use FindBin qw($Bin);
use Getopt::Long;
use DateTime;
use DateTime::Format::Strptime;
use Date::Range;
use Date::Simple;
use JSON::PP;

# Command-line configuration, filled in by GetOptions:
#   --date    DDMMYYYY, or DDMMYYYY-DDMMYYYY for a start-end range
#   --league  key into the league table
#   --output  boolean flag stored as output_format
# NOTE(review): output_format is never read in the visible code and "output"
# takes no value -- confirm whether "output=s" was intended.
my $config;

my $result = GetOptions(
    "date=s"   => \$config->{date},
    "league=s" => \$config->{league_id},
    "output"   => \$config->{output_format}
);

# Fail early with a usage message instead of letting an absent or malformed
# --date flow into split/format_date and blow up much later.
die "Usage: $0 --date DDMMYYYY[-DDMMYYYY] --league ID [--output]\n"
  unless $result
  && defined $config->{date}
  && $config->{date} =~ m{\A\d{8}(?:-\d{8})?\z};

( $config->{start_date}, $config->{end_date} ) = split /-/, $config->{date};

# A single date means a one-day range.
$config->{end_date} ||= $config->{start_date};

# Convert both ends from DDMMYYYY to YYYY-MM-DD.
$config->{start_date} = format_date( $config->{start_date} );
$config->{end_date}   = format_date( $config->{end_date} );

# Supported leagues, keyed by the id given to --league. Each entry carries a
# display title and the URL of the league's results page.
my $leagues = do {

    my $soccer_root = "http://www.oddsportal.com/soccer";

    # id, title, path segment between the soccer root and "/results/"
    my @league_rows = (
        [ 1, "English Premier League", "england/premier-league" ],
        [ 2, "Primera Division",       "spain/primera-division" ],
        [ 3, "Bundesliga",             "germany/bundesliga" ],
        [ 4, "Ligue 1",                "france/ligue-1" ],
        [ 5, "Serie A",                "italy/serie-a" ],
        [ 6, "Champs League",          "europe/champions-league" ],
        [ 7, "Europa League",          "europe/europa-league" ],
    );

    +{
        map {
            (
                $_->[0] => {
                    title => $_->[1],
                    url   => "$soccer_root/$_->[2]/results/",
                }
            )
        } @league_rows
    };
};

# Main flow: open the chosen league's results page, find the matches in the
# requested date range, then collect and save the odds for each match.

# Reject an unknown or missing league id up front; otherwise the lookups
# below would autovivify an empty entry and request an undefined URL.
die "Unknown or missing --league; valid ids are: "
  . join( ", ", sort { $a <=> $b } keys %{$leagues} ) . "\n"
  unless defined $config->{league_id}
  && exists $leagues->{ $config->{league_id} };

say $leagues->{ $config->{league_id} }->{title};

my $mech = WWW::Mechanize->new();
$mech->agent_alias("Windows IE 6");

$mech->get( $leagues->{ $config->{league_id} }->{url} );

my @matches = find_matches( $mech, $config->{start_date}, $config->{end_date} );

foreach my $match (@matches) {

    collect_info($match);

    # NOTE(review): save_info is not defined in the visible part of this
    # file -- it must be provided elsewhere for the script to run.
    save_info($match);
}

# Print a progress line for one match and fetch its odds.
#
#   $match - hash ref for the match; match_date and title are printed, and
#            the ref is handed to parse_match to be filled in.
#
# A new WWW::Mechanize agent is created for every call, and the sub pauses
# for one second before returning.
sub collect_info {

    my ($match) = @_;

    my $browser = WWW::Mechanize->new();
    $browser->agent_alias("Windows IE 6");

    say "\t\t", "[$match->{match_date}] $match->{title}";

    # The match page itself ($match->{url}) is not fetched; parse_match
    # works from the feed URLs instead.
    parse_match( $match, $browser );

    sleep 1;
}

#http://www.oddsportal.com/feed/postmatch/1-1-827202-1321390800-1-2.dat 1X2
#http://www.oddsportal.com/feed/postmatch/1-1-1382641-1321390800-1-2.dat

#http://www.oddsportal.com/feed/postmatch/1-1-827202-1321390800-2-2.dat
#http://www.oddsportal.com/feed/postmatch/1-1-827202-1321390800-5-2.dat AH
#http://www.oddsportal.com/feed/postmatch/1-1-827202-1321390800-2-2.dat OU
#http://www.oddsportal.com/feed/postmatch/1-1-827202-1321390800-2-2.dat

#http://www.oddsportal.com/feed/postmatch/1-1-827202-1321390800-6-2.dat DNB
#http://www.oddsportal.com/feed/postmatch/1-1-827202-1321390800-12-2.dat EH
#http://www.oddsportal.com/feed/postmatch/1-1-827202-1321390800-4-2.dat DC

# Collect every supported market for one match.
#
#   $match - hash ref for the match, filled in by the market parsers
#   $mech  - WWW::Mechanize agent used for the feed requests
#
# Side effects: writes the raw 1X2 feed to "1x2.dat" in the current
# directory (parse_ou writes its own "ou.dat"), then sleeps one second.
sub parse_match {

    my ( $match, $mech ) = @_;

    parse_1x2( $match, $mech );

    # Dump the 1X2 feed while it is still the agent's current content.
    # Saving after parse_ou would write the Over/Under response under the
    # 1x2.dat name.
    $mech->save_content("1x2.dat");

    parse_ou( $match, $mech );

    sleep 1;
}

# Look up one bookmaker's (over, under) price pair for a goal line.
#
#   $side         - hash ref of markets for one side of the book
#   $line         - goal line as it appears in the market key, e.g. "2.5"
#   $bookmaker_id - bookmaker key inside the market's odds hash
#
# Returns the values stored under keys 0 and 1, or ( "", "" ) when the
# bookmaker has no entry for that market.
sub _ou_price_pair {

    my ( $side, $line, $bookmaker_id ) = @_;

    my $odds = $side->{"E-2-2-0-${line}-0"}->{odds}->{$bookmaker_id};

    return defined($odds) ? @{$odds}{qw(0 1)} : ( "", "" );
}

# Fetch the Over/Under feed for a match and store the prices in $match.
#
#   $match - hash ref for the match; $match->{id} selects the feed
#   $mech  - WWW::Mechanize agent used for the request
#
# Keys written to $match:
#   pinnacle_{over,under}_{1.5,2.5,3.5}_price   (bookmaker 18)
#   betfair_{over,under}_{1.5,2.5,3.5}_price    (bookmaker 44)
#   betfair_lay_{over,under}_1.5_price
#   average_{over,under}_1.5_price, highest_{over,under}_1.5
#
# The 1.5-line averages get their own keys. Writing them to the
# average_home_price/... keys would clobber the 1X2 figures that parse_1x2
# stored on the same $match.
#
# Side effects: writes the raw response to "ou.dat", sleeps one second.
sub parse_ou {

    my ( $match, $mech ) = @_;
    $mech->get( "http://www.oddsportal.com/feed/postmatch/1-1-"
          . $match->{id}
          . "-1321390800-2-2.dat" );

    $mech->save_content("ou.dat");

    # Strip the "-|-" markers wrapped around the JSON payload.
    my $json = $mech->content();
    $json =~ s/^-\|-|-\|-$//sg;

    my $data     = decode_json $json;
    my $oddsdata = $data->{d}->{oddsdata};

    foreach my $line (qw(1.5 2.5 3.5)) {

        (
            $match->{"pinnacle_over_${line}_price"},
            $match->{"pinnacle_under_${line}_price"}
        ) = _ou_price_pair( $oddsdata->{back}, $line, 18 );

        (
            $match->{"betfair_over_${line}_price"},
            $match->{"betfair_under_${line}_price"}
        ) = _ou_price_pair( $oddsdata->{back}, $line, 44 );
    }

    # NOTE(review): assumes the feed exposes lay prices under
    # oddsdata->{lay}, mirroring oddsdata->{back} -- confirm against a real
    # response. Reading them from {back} merely duplicated the back prices.
    (
        $match->{"betfair_lay_over_1.5_price"},
        $match->{"betfair_lay_under_1.5_price"}
    ) = _ou_price_pair( $oddsdata->{lay}, "1.5", 44 );

    # find_averages_ou returns (avg0, avg1, avg2, high0, high1, high2);
    # an over/under market only has outcomes 0 and 1.
    my @stats = find_averages_ou(
        $oddsdata->{back}->{"E-2-2-0-1.5-0"}->{odds},
        [
            qw(
              14 3 16 76 2 147 28
              41 33 60 18 75 101 15
              )
        ]
    );

    @{$match}{
        "average_over_1.5_price", "average_under_1.5_price",
        "highest_over_1.5",       "highest_under_1.5"
    } = @stats[ 0, 1, 3, 4 ];

    sleep 1;
}

# Fetch the 1X2 feed for a match and store the prices in $match.
#
#   $match - hash ref for the match; $match->{id} selects the feed
#   $mech  - WWW::Mechanize agent used for the request
#
# Keys written to $match:
#   pinnacle_{home,draw,away}_price   (bookmaker 18; "" when it is absent)
#   average_{home,draw,away}_price, highest_{home,draw,away}
sub parse_1x2 {

    my ( $match, $mech ) = @_;
    $mech->get( "http://www.oddsportal.com/feed/postmatch/1-1-"
          . $match->{id}
          . "-1321390800-1-2.dat" );

    # Strip the "-|-" markers wrapped around the JSON payload.
    my $json = $mech->content();
    $json =~ s/^-\|-|-\|-$//sg;

    my $data = decode_json $json;
    my $odds = $data->{d}->{oddsdata}->{back}->{"E-1-2-0-0-0"}->{odds} // {};

    # Guard the slice the same way parse_ou does: slicing an undefined
    # entry dies under strict refs when bookmaker 18 has no odds.
    my $pinnacle = $odds->{18};

    (
        $match->{pinnacle_home_price},
        $match->{pinnacle_draw_price},
        $match->{pinnacle_away_price}
      )
      = defined($pinnacle)
      ? @{$pinnacle}{qw(0 1 2)}
      : ( "", "", "" );

    (
        $match->{average_home_price}, $match->{average_draw_price},
        $match->{average_away_price}, $match->{highest_home},
        $match->{highest_draw},       $match->{highest_away}
      )
      = find_averages_1x2(
        $odds,
        [
            qw(
              14 3 16 76 2 147 28
              41 33 60 18 75 101 15
              )
        ]
      );
}

# Average and highest price per outcome across a set of bookmakers.
#
#   $bookmakers - hash ref: bookmaker id => { outcome index => price }
#   $ids        - array ref of the bookmaker ids to include
#
# Returns a six-element list:
#   ( avg_0, avg_1, avg_2, highest_0, highest_1, highest_2 )
# Averages are formatted with "%0.2f"; highest values are returned as found.
#
# Only prices that are actually present are counted. A bookmaker missing
# from $bookmakers, or an outcome the market does not have (index 2 for an
# over/under market), no longer drags the average down as a zero. An outcome
# with no prices at all yields "" for both its average and its highest
# value, so an empty id list no longer divides by zero.
sub find_averages_ou {

    my ( $bookmakers, $ids ) = @_;

    my ( %total, %seen, %best );

    foreach my $id ( @{$ids} ) {

        next unless defined $bookmakers && defined $bookmakers->{$id};
        my $prices = $bookmakers->{$id};

        foreach my $outcome ( 0, 1, 2 ) {

            my $price = $prices->{$outcome};
            next unless defined $price;

            $total{$outcome} += $price;
            $seen{$outcome}++;

            if ( !defined $best{$outcome} || $best{$outcome} < $price ) {

                $best{$outcome} = $price;
            }
        }
    }

    my @averages =
      map { $seen{$_} ? sprintf( "%0.2f", $total{$_} / $seen{$_} ) : "" }
      ( 0, 1, 2 );
    my @highest = map { defined $best{$_} ? $best{$_} : "" } ( 0, 1, 2 );

    return ( @averages, @highest );
}


# Average and highest home/draw/away price across a set of bookmakers.
#
#   $bookmakers - hash ref: bookmaker id => { 0 => home, 1 => draw, 2 => away }
#   $ids        - array ref of the bookmaker ids to include
#
# Returns a six-element list:
#   ( avg_home, avg_draw, avg_away, highest_home, highest_draw, highest_away )
# Averages are formatted with "%0.2f"; highest values are returned as found.
#
# A bookmaker that is listed in $ids but has no odds is skipped rather than
# counted as a price of zero. An outcome with no prices at all yields "" for
# both its average and its highest value, so an empty id list no longer
# divides by zero.
sub find_averages_1x2 {

    my ( $bookmakers, $ids ) = @_;

    my @sum     = ( 0, 0, 0 );
    my @count   = ( 0, 0, 0 );
    my @highest = ( undef, undef, undef );

    foreach my $id ( @{$ids} ) {

        next unless defined $bookmakers && defined $bookmakers->{$id};
        my $odds = $bookmakers->{$id};

        foreach my $outcome ( 0 .. 2 ) {

            my $price = $odds->{$outcome};
            next unless defined $price;

            $sum[$outcome] += $price;
            $count[$outcome]++;

            $highest[$outcome] = $price
              if !defined $highest[$outcome] || $highest[$outcome] < $price;
        }
    }

    return (
        ( map { $count[$_] ? sprintf( "%0.2f", $sum[$_] / $count[$_] ) : "" } 0 .. 2 ),
        ( map { defined $highest[$_] ? $highest[$_] : "" } 0 .. 2 ),
    );
}

# Convert a date from DDMMYYYY to YYYY-MM-DD.
#
#   $date - exactly eight digits: day, month, year
#
# Returns the reformatted string. Dies when $date is undefined or is not
# eight digits; an unmatched pattern used to yield the meaningless "--",
# which only failed much later.
sub format_date {

    my ($date) = @_;

    my $input = defined $date ? $date : "";

    my ( $day, $month, $year ) = $input =~ m{\A(\d{2})(\d{2})(\d{4})\z}
      or die "Invalid date '$input': expected DDMMYYYY\n";

    return join( "-", $year, $month, $day );
}

# Return the matches played between two dates.
#
#   $mech       - WWW::Mechanize agent positioned on a league results page
#   $start_date - first day, YYYY-MM-DD
#   $end_date   - last day, YYYY-MM-DD
#
# The season is picked from $start_date alone: dates before 1 August belong
# to the season that started the previous year. The season's link is
# followed, process_season collects its matches by day, and the matches for
# every date in the range are returned as a flat list.
sub find_matches {

    my ( $mech, $start_date, $end_date ) = @_;

    my ($year) = split /-/, $start_date;

    my @season_years =
      ( $start_date lt $year . "-08-01" )
      ? ( $year - 1, $year )
      : ( $year, $year + 1 );

    my $season = { title => join( "/", @season_years ) };

    say "\t", $season->{title};
    $mech->follow_link( text => $season->{title} );

    process_season( $mech, $season );

    my $first_day = Date::Simple->new($start_date);
    my $last_day  = Date::Simple->new($end_date);
    my $range     = Date::Range->new( $first_day, $last_day );

    my @matches =
      map  { @{ $season->{matches}->{$_} } }
      grep { exists $season->{matches}->{$_} } $range->dates;

    return @matches;
}

# Walk every results page of a season and index its matches by day.
#
#   $mech   - WWW::Mechanize agent whose current content is the first
#             results page of the season
#   $season - hash ref; matches are pushed onto
#             $season->{matches}->{<match day>}
#
# Each page is parsed in a proper loop and its tree is deleted before the
# next page is requested, so parsed trees no longer pile up one per page.
# Paging stops on the last page, or as soon as the page shows no usable
# pagination.
sub process_season {

    my ( $mech, $season ) = @_;

    say "\t\tCollecting season info...";

  PAGE:
    while (1) {

        my $tree =
          HTML::TreeBuilder::XPath->new_from_content( $mech->content() );

        my ($current_page) =
          $tree->findvalues('//span[@class = "active-page"]');
        my ($last_page_url) =
          $tree->findvalues('//div[ @id = "pagination"]/a[ last() ]/@href');
        my ($next_page_url) =
          $tree->findvalues('//div[ @id = "pagination"]/a[ last() -1 ]/@href');

        # The last page number is the trailing numeric path segment.
        my ($last_page) =
          defined $last_page_url ? $last_page_url =~ m{/(\d+)/$} : ();

        # Date header rows set the day for the match rows that follow them.
        my $match_day;
        foreach my $row (
            $tree->findnodes('//table[ @id = "tournamentTable" ]/tbody/tr') )
        {

            # Rows without a class attribute are neither headers nor matches.
            my $class = $row->attr('class');
            $class = "" unless defined $class;

            if ( $class eq "center nob-border" ) {

                my ($match_day_string) = $row->findvalues('./th[1]/span');
                $match_day = convert_match_day($match_day_string);
            }
            elsif ( $class =~ m/deactivate/ ) {

                my $match = get_match_info($row);
                $match->{match_date} = $match_day;

                ( $match->{home}, $match->{away} ) = split /\s+-\s+/,
                  $match->{title};
                ( $match->{home_team_goals}, $match->{away_team_goals} ) =
                  split /:/, $match->{score};

                if ($match_day) {

                    push @{ $season->{matches}->{$match_day} }, $match;
                }
            }
        }

        # Everything needed has been copied out as plain values.
        $tree->delete();

        # No pagination on the page: there is nothing further to fetch.
        last PAGE
          unless defined $current_page
          && defined $last_page
          && defined $next_page_url;

        last PAGE if $current_page == $last_page;

        $mech->get( "http://www.oddsportal.com" . $next_page_url );
    }

    say "\t\tDone!";
}

# Extract the basic facts about a match from one results-table row.
#
#   $row - node for the row; only its findvalues method is used
#
# Returns a hash ref with the keys start_time, url, id, title and score.
# url is the row's link made absolute; id is the run of digits right before
# the URL's trailing slash (undef when there is none).
sub get_match_info {

    my ($row) = @_;

    my ($start_time) = $row->findvalues('./td[1]');
    my ($href)       = $row->findvalues('./td[2]/a/@href');

    my $url = "http://www.oddsportal.com" . $href;
    my ($id) = $url =~ m{(\d+)/$};

    my ($title) = $row->findvalues('./td[2]/a');
    my ($score) = $row->findvalues('./td[3]');

    return {
        start_time => $start_time,
        url        => $url,
        id         => $id,
        title      => $title,
        score      => $score,
    };
}

# Turn a date heading such as "01 May 2014" into YYYY-MM-DD.
#
#   $text_date - heading text; runs of whitespace are collapsed to single
#                spaces before parsing
#
# The text is parsed with the pattern '%d %B %Y' in the en_US locale, and
# the parser is configured with on_error => 'croak'. Returns the date's
# ymd() string when parsing produced a true value, otherwise that false
# value itself.
sub convert_match_day {

    my ($text_date) = @_;

    $text_date =~ s/\s+/ /g;

    my %parser_options = (
        pattern  => '%d %B %Y',
        locale   => 'en_US',
        on_error => 'croak',
    );
    my $parser = DateTime::Format::Strptime->new(%parser_options);

    my $dt = $parser->parse_datetime($text_date);

    return $dt && $dt->ymd();
}