我正在开发一个Python项目,其中包括尝试从网站oddsportal.com获取历史体育赔率/结果,例如在确切的网址http://www.oddsportal.com/soccer/england/premier-league/results/
问题在于实际的赔率并没有直接嵌入HTML中,而是被大量的Javascript代码所掩盖。我知道一种可行的办法是使用无头浏览器驱动程序(headless web driver),由它来解释Javascript并以这种方式提取数据,但该网站会加载大量无关的内容,导致这种方法效率不高,所以我希望有人能帮我通过逆向工程找出这些数值的来源。
一些信息:
上面的HTML会从rb.oddsportal.com和fb.oddsportal.com加载一个非常大的globals-....js文件,以及其他js和css文件(这些文件似乎不太相关)。此外,页面还在2222端口与weblog.livesport.eu进行了少量通信,看起来很可疑。
大致的思路似乎是:HTML表格中为每场比赛包含一个"xeid"值,并为每个参赛方附带"xoid"和"xodd"值,这些值再由javascript以某种方式转换成明文数值。
感谢任何帮助
答案 0 :(得分:1)
我之前用过这个网站,下面是我的Perl代码(如你所见,所有的魔法都在 "http://www.oddsportal.com/feed/postmatch/1-1-" . $match->{id} . "-1321390800-1-2.dat" 这个文件里):
#!/usr/bin/perl
use Modern::Perl;
use HTML::TreeBuilder::XPath;
use WWW::Mechanize;
use FindBin qw($Bin);
use Getopt::Long;
use DateTime;
use DateTime::Format::Strptime;
use Date::Range;
use Date::Simple;
use JSON::PP;
# Command-line configuration.
#   --date    DDMMYYYY, or DDMMYYYY-DDMMYYYY for a date range
#   --league  key into the $leagues table
#   --output  boolean flag stored as output_format (it is declared without
#             "=s", so it takes no value)
my $config;
my $result = GetOptions(
    "date=s"   => \$config->{date},
    "league=s" => \$config->{league_id},
    "output"   => \$config->{output_format}
);

# Fail early with a readable message instead of letting a bad option or an
# undefined date flow into split/format_date below.
die "Usage: $0 --date DDMMYYYY[-DDMMYYYY] --league ID [--output]\n"
  unless $result && defined $config->{date} && length $config->{date};

( $config->{start_date}, $config->{end_date} ) = split /-/, $config->{date};

# A single date means a one-day range.
$config->{end_date} ||= $config->{start_date};

# Both ends are normalised to YYYY-MM-DD.
$config->{start_date} = format_date( $config->{start_date} );
$config->{end_date}   = format_date( $config->{end_date} );
# Supported competitions, keyed by the id given to --league.
# Each entry holds a display title and the URL of the competition's
# results page.
my $leagues = do {
    my @rows = (
        [
            1, "English Premier League",
            "http://www.oddsportal.com/soccer/england/premier-league/results/"
        ],
        [
            2, "Primera Division",
            "http://www.oddsportal.com/soccer/spain/primera-division/results/"
        ],
        [
            3, "Bundesliga",
            "http://www.oddsportal.com/soccer/germany/bundesliga/results/"
        ],
        [
            4, "Ligue 1",
            "http://www.oddsportal.com/soccer/france/ligue-1/results/"
        ],
        [
            5, "Serie A",
            "http://www.oddsportal.com/soccer/italy/serie-a/results/"
        ],
        [
            6, "Champs League",
            "http://www.oddsportal.com/soccer/europe/champions-league/results/"
        ],
        [
            7, "Europa League",
            "http://www.oddsportal.com/soccer/europe/europa-league/results/"
        ],
    );

    my %league_for;
    for my $row (@rows) {
        my ( $id, $title, $url ) = @{$row};
        $league_for{$id} = { title => $title, url => $url };
    }
    \%league_for;
};
# Main flow: resolve the requested league, open its results page, collect
# the matches played in the configured date range, then fetch the odds for
# each match and save it.
#
# An unknown or missing --league id is rejected here; otherwise the script
# would go on to request an undefined URL.
my $league = $leagues->{ $config->{league_id} // '' }
  or die "Unknown or missing --league id; valid ids are: "
  . join( ", ", sort { $a <=> $b } keys %{$leagues} ) . "\n";

say $league->{title};

my $mech = WWW::Mechanize->new();
$mech->agent_alias("Windows IE 6");
$mech->get( $league->{url} );

my @matches = find_matches( $mech, $config->{start_date}, $config->{end_date} );
foreach my $match (@matches) {
    collect_info($match);

    # NOTE(review): save_info is not defined in the visible part of this
    # file; it is presumably provided elsewhere -- confirm.
    save_info($match);
}
sleep 1;
# Gather the odds for a single match.
#
# Takes a match hashref (reads {match_date} and {title} for the progress
# line), creates a fresh WWW::Mechanize agent and passes both to
# parse_match, then pauses for one second.
sub collect_info {
    my $match = $_[0];

    say "\t\t", "[$match->{match_date}] $match->{title}";

    my $agent = WWW::Mechanize->new();
    $agent->agent_alias("Windows IE 6");

    #$mech->get( $match->{url} );
    parse_match( $match, $agent );

    sleep 1;
}
#http://www.oddsportal.com/feed/postmatch/1-1-827202-1321390800-1-2.dat 1X2
#http://www.oddsportal.com/feed/postmatch/1-1-1382641-1321390800-1-2.dat
#http://www.oddsportal.com/feed/postmatch/1-1-827202-1321390800-2-2.dat
#http://www.oddsportal.com/feed/postmatch/1-1-827202-1321390800-5-2.dat AH
#http://www.oddsportal.com/feed/postmatch/1-1-827202-1321390800-2-2.dat OU
#http://www.oddsportal.com/feed/postmatch/1-1-827202-1321390800-2-2.dat
#http://www.oddsportal.com/feed/postmatch/1-1-827202-1321390800-6-2.dat DNB
#http://www.oddsportal.com/feed/postmatch/1-1-827202-1321390800-12-2.dat EH
#http://www.oddsportal.com/feed/postmatch/1-1-827202-1321390800-4-2.dat DC
# Populate $match with odds from the 1X2 feed and the over/under feed.
#
# Parameters:
#   $match - match hashref, filled in by parse_1x2 and parse_ou
#   $mech  - agent shared by both feed requests
#
# Side effects: writes the raw 1X2 response to "1x2.dat" and sleeps for one
# second after both feeds have been parsed.
sub parse_match {
    my ( $match, $mech ) = @_;

    parse_1x2( $match, $mech );

    # Dump now, while $mech still holds the 1X2 response: parse_ou issues
    # its own request on the same agent and replaces the content.
    $mech->save_content("1x2.dat");

    parse_ou( $match, $mech );
    sleep 1;
}
# Fetch the over/under feed for one match and record selected prices.
#
# Parameters:
#   $match - match hashref; {id} is read to build the feed URL
#   $mech  - agent used for the request
#
# Keys written to $match ("" when the bookmaker has no odds for a line):
#   pinnacle_{over,under}_{1.5,2.5,3.5}_price   bookmaker 18, back side
#   betfair_{over,under}_{1.5,2.5,3.5}_price    bookmaker 44, back side
#   betfair_lay_{over,under}_1.5_price          bookmaker 44, lay side
#   average_{over,under}_1.5_price, highest_{over,under}_1.5
#
# Side effects: one HTTP GET, raw response dumped to "ou.dat", one-second
# sleep. Dies if the payload is not valid JSON (decode_json).
sub parse_ou {
    my ( $match, $mech ) = @_;

    $mech->get( "http://www.oddsportal.com/feed/postmatch/1-1-"
          . $match->{id}
          . "-1321390800-2-2.dat" );
    $mech->save_content("ou.dat");

    my $json = $mech->content();
    $json =~ s/^-\|-|-\|-$//sg;    # strip the "-|-" wrapper around the JSON
    my $data = decode_json $json;

    # Odds table (bookmaker id => prices) for one side and goal line, or
    # undef when that part of the feed is absent.
    my $odds_table = sub {
        my ( $side, $line ) = @_;
        my $markets = $data->{d}->{oddsdata}->{$side};
        return unless ref $markets eq 'HASH';
        my $market = $markets->{"E-2-2-0-${line}-0"};
        return unless ref $market eq 'HASH';
        return $market->{odds};
    };

    # (over, under) prices of one bookmaker, or ( "", "" ) when missing.
    my $prices_for = sub {
        my ( $side, $line, $bookmaker_id ) = @_;
        my $table = $odds_table->( $side, $line );
        my $odds = ref $table eq 'HASH' ? $table->{$bookmaker_id} : undef;
        return @{$odds}{qw(0 1)} if ref $odds eq 'HASH';
        return @{$odds}[ 0, 1 ] if ref $odds eq 'ARRAY';
        return ( "", "" );
    };

    foreach my $line (qw(1.5 2.5 3.5)) {
        @{$match}{
            "pinnacle_over_${line}_price",
            "pinnacle_under_${line}_price"
        } = $prices_for->( 'back', $line, 18 );

        @{$match}{
            "betfair_over_${line}_price",
            "betfair_under_${line}_price"
        } = $prices_for->( 'back', $line, 44 );
    }

    # The lay prices used to be read from {back}, which made them a copy of
    # the back prices.
    # NOTE(review): assumes lay odds live under oddsdata->{lay} with the
    # same layout as {back} -- confirm against a live feed.
    @{$match}{
        "betfair_lay_over_1.5_price",
        "betfair_lay_under_1.5_price"
    } = $prices_for->( 'lay', '1.5', 44 );

    # Averages for the 1.5 line get their own keys. They used to be stored
    # in average_home_price and friends, which overwrote the 1X2 averages
    # recorded by parse_1x2 (parse_match runs parse_1x2 first).
    my @stats = find_averages_1x2(
        scalar $odds_table->( 'back', '1.5' ),
        [qw(14 3 16 76 2 147 28 41 33 60 18 75 101 15)]
    );
    @{$match}{
        qw(
          average_over_1.5_price
          average_under_1.5_price
          highest_over_1.5
          highest_under_1.5
          )
    } = @stats[ 0, 1, 3, 4 ];

    sleep 1;
}
# Fetch the 1X2 (home/draw/away) feed for one match and record prices.
#
# Parameters:
#   $match - match hashref; {id} is read to build the feed URL
#   $mech  - agent used for the request
#
# Keys written to $match:
#   pinnacle_{home,draw,away}_price   bookmaker 18, "" when it has no odds
#   average_{home,draw,away}_price, highest_{home,draw,away}
#                                     from find_averages_1x2
#
# Dies if the payload is not valid JSON (decode_json).
sub parse_1x2 {
    my ( $match, $mech ) = @_;

    $mech->get( "http://www.oddsportal.com/feed/postmatch/1-1-"
          . $match->{id}
          . "-1321390800-1-2.dat" );

    my $json = $mech->content();
    $json =~ s/^-\|-|-\|-$//sg;    # strip the "-|-" wrapper around the JSON
    my $data = decode_json $json;

    # Walk down to the odds table step by step so that a missing level
    # gives undef instead of a fatal dereference.
    my $back     = $data->{d}->{oddsdata}->{back};
    my $market   = ref $back eq 'HASH' ? $back->{"E-1-2-0-0-0"} : undef;
    my $odds     = ref $market eq 'HASH' ? $market->{odds} : undef;
    my $pinnacle = ref $odds eq 'HASH' ? $odds->{18} : undef;

    # Same fallback as parse_ou: empty strings when the bookmaker is absent
    # (an unguarded slice here used to abort the whole run).
    @{$match}{
        qw(
          pinnacle_home_price
          pinnacle_draw_price
          pinnacle_away_price
          )
      }
      = ref $pinnacle eq 'HASH'  ? @{$pinnacle}{qw(0 1 2)}
      : ref $pinnacle eq 'ARRAY' ? @{$pinnacle}[ 0 .. 2 ]
      :                            ( "", "", "" );

    @{$match}{
        qw(
          average_home_price average_draw_price average_away_price
          highest_home       highest_draw       highest_away
          )
      }
      = find_averages_1x2( $odds,
        [qw(14 3 16 76 2 147 28 41 33 60 18 75 101 15)] );
}
# Average and highest prices across a list of bookmakers.
#
# This used to be a line-for-line copy of find_averages_1x2; it now forwards
# to that routine so the two cannot drift apart.
#
# Parameters:
#   $bookmakers - hashref: bookmaker id => prices keyed by outcome index
#   $ids        - arrayref of the bookmaker ids to include
#
# Returns the six-element list produced by find_averages_1x2: three
# averages followed by three highest prices.
sub find_averages_ou {
    return find_averages_1x2(@_);
}
# Average and highest prices per outcome across a list of bookmakers.
#
# Parameters:
#   $bookmakers - hashref: bookmaker id => prices for outcomes 0, 1, 2
#                 (hash keyed "0".."2", or an array in that order); undef
#                 is treated as "no data"
#   $ids        - arrayref of the bookmaker ids to include
#
# Returns a six-element list:
#   ( avg_0, avg_1, avg_2, highest_0, highest_1, highest_2 )
# Averages are formatted with two decimals. An outcome nobody quoted gets
# an average of "0.00" and an undefined highest price.
#
# Each outcome is averaged only over the bookmakers that quoted it. A
# bookmaker missing from the feed used to be counted as a price of 0, which
# dragged the averages down, and an empty id list divided by zero.
sub find_averages_1x2 {
    my ( $bookmakers, $ids ) = @_;
    $bookmakers = {} unless ref $bookmakers eq 'HASH';

    my @sum     = ( 0, 0, 0 );
    my @quotes  = ( 0, 0, 0 );
    my @highest = ( undef, undef, undef );

    foreach my $id ( @{ $ids || [] } ) {
        my $odds = $bookmakers->{$id};
        my @prices =
            ref $odds eq 'HASH'  ? @{$odds}{qw(0 1 2)}
          : ref $odds eq 'ARRAY' ? @{$odds}[ 0 .. 2 ]
          :                        ();

        foreach my $outcome ( 0 .. $#prices ) {
            my $price = $prices[$outcome];
            next unless defined $price && length $price;

            $sum[$outcome] += $price;
            $quotes[$outcome]++;
            $highest[$outcome] = $price
              if !defined $highest[$outcome] || $highest[$outcome] < $price;
        }
    }

    my @average;
    foreach my $outcome ( 0 .. 2 ) {
        my $mean = $quotes[$outcome] ? $sum[$outcome] / $quotes[$outcome] : 0;
        push @average, sprintf( "%0.2f", $mean );
    }

    return ( @average, @highest );
}
# Convert a DDMMYYYY date string into YYYY-MM-DD.
#
# Parameters:
#   $date - eight digits, day first (surrounding whitespace is tolerated)
#
# Returns the date as "YYYY-MM-DD".
# Dies when the input is undefined or not in DDMMYYYY form; such input used
# to come back as "--" and only failed later, far from the cause.
sub format_date {
    my ($date) = @_;

    my ( $day, $month, $year ) =
      defined $date
      ? $date =~ m{\A \s* (\d{2}) (\d{2}) (\d{4}) \s* \z}x
      : ();

    die "Invalid date '" . ( $date // '' ) . "': expected DDMMYYYY\n"
      unless defined $year;

    return join( "-", $year, $month, $day );
}
# Return the matches played between two dates.
#
# Parameters:
#   $mech       - agent already positioned on a league results page
#   $start_date - first day, YYYY-MM-DD
#   $end_date   - last day, YYYY-MM-DD
#
# The season is chosen from $start_date alone: dates before 1 August belong
# to the season that started the previous year. The season link is followed,
# process_season fills $season->{matches}, and the matches of every date in
# the range are returned as a flat list.
sub find_matches {
    my ( $mech, $start_date, $end_date ) = @_;

    my ($year) = split /-/, $start_date;
    my $season = {
        title => ( $start_date lt $year . "-08-01" )
        ? join( "/", $year - 1, $year )
        : join( "/", $year,     $year + 1 ),
    };

    say "\t", $season->{title};
    $mech->follow_link( text => $season->{title} );
    process_season( $mech, $season );

    my $range = Date::Range->new(
        Date::Simple->new($start_date),
        Date::Simple->new($end_date)
    );

    my @found =
      map  { @{ $season->{matches}->{$_} } }
      grep { exists $season->{matches}->{$_} } $range->dates;

    return @found;
}
# Walk every results page of a season and index its matches by date.
#
# Parameters:
#   $mech   - agent positioned on the first results page of the season
#   $season - hashref; matches are pushed onto $season->{matches}->{$day},
#             where $day is the string returned by convert_match_day
#
# Each page is parsed, its rows are processed, and its parse tree is
# released before the next page is requested. The goto-based loop this
# replaces only ever deleted the last tree.
sub process_season {
    my ( $mech, $season ) = @_;
    say "\t\tCollecting season info...";

    # Declared outside the page loop so that match rows appearing before the
    # first date header of a page keep the day of the previous page.
    my $match_day;

    while (1) {
        my $tree =
          HTML::TreeBuilder::XPath->new_from_content( $mech->content() );

        my ($current_page) =
          $tree->findvalues('//span[@class = "active-page"]');
        my ($last_page_url) =
          $tree->findvalues('//div[ @id = "pagination"]/a[ last() ]/@href');
        my ($next_page_url) =
          $tree->findvalues('//div[ @id = "pagination"]/a[ last() -1 ]/@href');
        my ($last_page) =
          defined $last_page_url ? $last_page_url =~ m{/(\d+)/$} : ();

        foreach my $row (
            $tree->findnodes('//table[ @id = "tournamentTable" ]/tbody/tr') )
        {
            # Rows without a class attribute are neither headers nor matches.
            my $class = $row->attr('class') // '';

            if ( $class eq "center nob-border" ) {

                # Date header row: sets the day for the rows that follow.
                my ($match_day_string) = $row->findvalues('./th[1]/span');
                $match_day = convert_match_day($match_day_string);
            }
            elsif ( $class =~ m/deactivate/ ) {
                my $match = get_match_info($row);
                $match->{match_date} = $match_day;
                ( $match->{home}, $match->{away} ) = split /\s+-\s+/,
                  $match->{title};
                ( $match->{home_team_goals}, $match->{away_team_goals} ) =
                  split /:/, $match->{score};
                if ($match_day) {
                    push @{ $season->{matches}->{$match_day} }, $match;
                }
            }
        }

        $tree->delete();

        # Stop on a page without usable pagination, or on the last page.
        last
          unless defined $current_page
          && defined $last_page
          && defined $next_page_url;
        last if $current_page == $last_page;

        $mech->get( "http://www.oddsportal.com" . $next_page_url );
    }

    say "\t\tDone!";
}
# Extract the basic facts of one match from its results-table row.
#
# Parameters:
#   $row - node offering findvalues(XPATH)
#
# Returns a hashref with:
#   start_time - text of the first cell
#   url        - absolute URL built from the link in the second cell
#   id         - digits immediately before the URL's trailing slash
#   title      - text of that link
#   score      - text of the third cell
sub get_match_info {
    my ($row) = @_;

    my ($start_time) = $row->findvalues('./td[1]');
    my ($href)       = $row->findvalues('./td[2]/a/@href');
    my $url          = "http://www.oddsportal.com" . $href;
    my ($id)         = $url =~ m{(\d+)/$};
    my ($title)      = $row->findvalues('./td[2]/a');
    my ($score)      = $row->findvalues('./td[3]');

    return {
        start_time => $start_time,
        url        => $url,
        id         => $id,
        title      => $title,
        score      => $score,
    };
}
# Turn the text of a date header ("day month-name year") into the string
# returned by DateTime's ymd().
#
# Parameters:
#   $text_date - header text; every run of whitespace is collapsed to a
#                single space before parsing
#
# The parser is built with on_error => 'croak', so text that does not match
# the '%d %B %Y' pattern raises an exception.
sub convert_match_day {
    my ($text_date) = @_;
    $text_date =~ s/\s+/ /g;

    my $parsed = DateTime::Format::Strptime->new(
        pattern  => '%d %B %Y',
        locale   => 'en_US',
        on_error => 'croak',
    )->parse_datetime($text_date);

    return $parsed->ymd() if $parsed;
    return $parsed;
}