小杰平时会玩点体育彩票，所以想用大数据分析体育比赛的趋势，于是有了下面的 PHP 代码。
主要原理是用 PHP 抓取 HTML 页面的源码，然后用正则表达式提取其中的数据，填入到数据库里面，
剩下的就是用 MySQL 分析比赛数据了。废话不多说，上代码：
下面是抓取 310win 网站的 PHP 代码：
<?php
/**
 * CRUD controller for lottery_game_info records plus a scraping action that
 * downloads the 310win basketball results page, extracts the result rows with
 * regular expressions and stores one lottery_game_info record per row.
 */
class Lottery_game_infoController extends Controller
{
    /**
     * @var string the default layout for the views. Defaults to '//layouts/column2', meaning
     * using two-column layout. See 'protected/views/layouts/column2.php'.
     */
    public $layout = '//layouts/column2';

    /**
     * @var CActiveRecord the currently loaded data model instance.
     */
    private $_model;

    /**
     * @return array action filters
     */
    public function filters()
    {
        return array(
            'accessControl', // perform access control for CRUD operations
        );
    }

    /**
     * Specifies the access control rules.
     * This method is used by the 'accessControl' filter.
     *
     * NOTE(review): 'fetch_data' is open to every visitor, so anyone can trigger
     * a remote download and database inserts. Consider restricting it to
     * authenticated/admin users once the callers are known.
     *
     * @return array access control rules
     */
    public function accessRules()
    {
        return array(
            array('allow', // allow all users to perform 'index', 'view' and 'fetch_data' actions
                'actions' => array('index', 'view', 'fetch_data'),
                'users' => array('*'),
            ),
            array('allow', // allow authenticated user to perform 'create' and 'update' actions
                'actions' => array('create', 'update'),
                'users' => array('@'),
            ),
            array('allow', // allow admin user to perform 'admin' and 'delete' actions
                'actions' => array('admin', 'delete'),
                'users' => array('admin'),
            ),
            array('deny', // deny all users
                'users' => array('*'),
            ),
        );
    }

    /**
     * Displays a particular model.
     */
    public function actionView()
    {
        $this->render('view', array(
            'model' => $this->loadModel(),
        ));
    }

    /**
     * Creates a new model.
     * If creation is successful, the browser will be redirected to the 'view' page.
     */
    public function actionCreate()
    {
        $model = new lottery_game_info;

        // Uncomment the following line if AJAX validation is needed
        // $this->performAjaxValidation($model);

        if (isset($_POST['lottery_game_info'])) {
            $model->attributes = $_POST['lottery_game_info'];
            if ($model->save()) {
                $this->redirect(array('view', 'id' => $model->lottery_game_info_id));
            }
        }

        $this->render('create', array(
            'model' => $model,
        ));
    }

    /**
     * Updates a particular model.
     * If update is successful, the browser will be redirected to the 'view' page.
     */
    public function actionUpdate()
    {
        $model = $this->loadModel();

        // Uncomment the following line if AJAX validation is needed
        // $this->performAjaxValidation($model);

        if (isset($_POST['lottery_game_info'])) {
            $model->attributes = $_POST['lottery_game_info'];
            if ($model->save()) {
                $this->redirect(array('view', 'id' => $model->lottery_game_info_id));
            }
        }

        $this->render('update', array(
            'model' => $model,
        ));
    }

    /**
     * Deletes a particular model.
     * If deletion is successful, the browser will be redirected to the 'index' page.
     *
     * @throws CHttpException 400 when the request is not a POST request
     */
    public function actionDelete()
    {
        // we only allow deletion via POST request
        if (!Yii::app()->request->isPostRequest) {
            throw new CHttpException(400, 'Invalid request. Please do not repeat this request again.');
        }

        $this->loadModel()->delete();

        // if AJAX request (triggered by deletion via admin grid view), we should not redirect the browser
        if (!isset($_GET['ajax'])) {
            $this->redirect(array('index'));
        }
    }

    /**
     * Lists all models.
     */
    public function actionIndex()
    {
        $dataProvider = new CActiveDataProvider('lottery_game_info');
        $this->render('index', array(
            'dataProvider' => $dataProvider,
        ));
    }

    /**
     * Manages all models.
     */
    public function actionAdmin()
    {
        $model = new lottery_game_info('search');
        $model->unsetAttributes(); // clear any default values
        if (isset($_GET['lottery_game_info'])) {
            $model->attributes = $_GET['lottery_game_info'];
        }

        $this->render('admin', array(
            'model' => $model,
        ));
    }

    /**
     * Returns the data model based on the primary key given in the GET variable.
     * If the data model is not found, an HTTP exception will be raised.
     *
     * @return CActiveRecord the loaded model (cached for the rest of the request)
     * @throws CHttpException 404 when no 'id' is given or no record matches it
     */
    public function loadModel()
    {
        if ($this->_model === null) {
            if (isset($_GET['id'])) {
                $this->_model = lottery_game_info::model()->findByPk($_GET['id']);
            }
            if ($this->_model === null) {
                throw new CHttpException(404, 'The requested page does not exist.');
            }
        }
        return $this->_model;
    }

    /**
     * Performs the AJAX validation.
     * @param CModel the model to be validated
     */
    protected function performAjaxValidation($model)
    {
        if (isset($_POST['ajax']) && $_POST['ajax'] === 'lottery-game-info-form') {
            echo CActiveForm::validate($model);
            Yii::app()->end();
        }
    }

    /************************************************************************************
     * Data-scraping actions and helpers
     **************************************************************************************/

    /**
     * Scrapes the 310win basketball results page, stores the parsed rows and
     * renders the 'fetch_data' view with the parsed result.
     */
    public function actionFetch_data()
    {
        $url = 'http://www.310win.com/jingcailanqiu/kaijiang_jclq_all.html';
        $this->render('fetch_data', array(
            'model' => $this->fetch_data($url),
        ));
    }

    /**
     * Debug helper: prints every element of $argvs with print_r inside an
     * <xmp> element, all wrapped in a left-aligned <div>.
     *
     * @param array $argvs values to dump
     */
    public function pp($argvs)
    {
        echo '<div style="text-align: left;">';
        foreach ($argvs as $k => $v) {
            echo "<xmp>";
            print_r($v);
            echo "</xmp>";
        }
        echo '</div>';
    }

    /**
     * Scrapes the win310 results page at $url and saves one lottery_game_info
     * record per parsed table row.
     * (Original author note: ljzhou 2014-4-13)
     *
     * The parsed rows are also echoed as debug output (via pp() and print_r).
     *
     * NOTE(review): every call inserts all parsed rows again; nothing here
     * prevents duplicates when the same page is fetched twice.
     *
     * @param string $url page to download
     * @return array parsed rows under the 'lottery_game' key, indexed by their
     *               row position in the table; the key is absent when no row
     *               could be parsed
     */
    public function fetch_data($url)
    {
        $result = array();
        $content = $this->get_http($url);
        $patten = $this->get_patten();

        $lottery_container = $this->get_word_single(
            $content,
            $patten['lottery_container']['start'],
            $patten['lottery_container']['end']
        );
        $lottery_games = $this->get_word_all(
            $lottery_container,
            $patten['lottery_game']['start'],
            $patten['lottery_game']['end']
        );

        if (!empty($lottery_games)) {
            foreach ($lottery_games as $key => $lottery_game) {
                // Row 0 is the table header.
                if ($key === 0) {
                    continue;
                }

                $cells = $this->get_word_all(
                    $lottery_game,
                    $patten['lottery_game_detail']['start'],
                    $patten['lottery_game_detail']['end']
                );
                if (empty($cells)) {
                    continue;
                }

                // Strip the wrapping markup from every cell.
                foreach ($cells as $index => $cell) {
                    $cells[$index] = $this->preg_filter($cell);
                }

                $result['lottery_game'][$key] = $this->build_game_row($cells, $url);
            }
        }

        print_r('start pp');
        $this->pp($result);
        print_r('end pp');

        // Guarded: when nothing was parsed there is no 'lottery_game' key to iterate.
        if (!empty($result['lottery_game'])) {
            foreach ($result['lottery_game'] as $lottery_game_detail) {
                print_r($lottery_game_detail);
                $model = new lottery_game_info;
                $model->attributes = $lottery_game_detail;
                if (!$model->save()) {
                    echo 'save false';
                }
            }
        }

        return $result;
    }

    /**
     * Maps the filtered cells of one result row to lottery_game_info attributes.
     * Cells that are missing from a short row become null.
     *
     * @param array $cells cell contents of one table row, indexed from 0
     * @param string $url the page the row was scraped from
     * @return array attribute name => value
     */
    private function build_game_row(array $cells, $url)
    {
        // Pad to the 12 cells read below so short rows yield null instead of
        // undefined-offset notices; existing cells are left untouched.
        $cells = $cells + array_fill(0, 12, null);

        // Weekday: the text between "<td>" and the first run of digits in cell 0.
        $week_day = null;
        if (preg_match('/<td>(.*?)\d+/s', (string)$cells[0], $week_match)) {
            $week_day = $week_match[1];
        }

        // Date and kick-off time are separated by "<br>" in cell 2.
        $dates = explode('<br>', (string)$cells[2]);
        $date = $dates[0];
        $pk_time = isset($dates[1]) ? $dates[1] : null;

        // Score is "<home>-<visit>" in cell 4.
        $scores = explode('-', (string)$cells[4]);
        $home_score = $scores[0];
        $visit_score = isset($scores[1]) ? $scores[1] : null;
        $score_total = (int)$home_score + (int)$visit_score;

        // Four-digit year and 24-hour clock so the stored timestamp is unambiguous.
        $now = date('Y-m-d H:i:s');

        return array(
            'site_code' => '310win',
            'www_code' => $url,
            'lottery_code' => $cells[1],
            'date' => $date,
            'pk_time' => $pk_time,
            'week_day' => $week_day,
            'home_team_name' => $cells[3],
            'visit_team_name' => $cells[5],
            'pk_score' => $cells[4],
            'home_score' => $home_score,
            'visit_score' => $visit_score,
            'score_total' => $score_total,
            'the_point_position' => $cells[10],
            'big_or_small' => $cells[11],
            'point_spread' => $cells[7],
            'team_win' => $cells[8],
            'win_detail' => $cells[9],
            'create_stamp' => $now,
            'last_updated_stamp' => $now,
        );
    }

    /**
     * Applies every 'lottery_filter' rule in turn; whenever a rule matches with
     * a non-empty capture, the capture replaces the current text.
     *
     * @param string $html cell markup
     * @return string the text left after all rules were applied
     */
    public function preg_filter($html)
    {
        $pattens = $this->get_patten();

        // Run every rule once, in declaration order.
        foreach ($pattens['lottery_filter'] as $preg_filter) {
            $result = $this->get_word_single($html, $preg_filter['start'], $preg_filter['end']);
            // Compare explicitly: a captured "0" is a valid value, not a failed match.
            if ($result !== false && $result !== '') {
                $html = $result;
            }
        }

        return $html;
    }

    /**
     * Downloads the content of a web page.
     *
     * @param string $url
     * @return string|false the response body, or false when the download fails
     */
    public function get_http($url)
    {
        return file_get_contents($url);
    }

    /**
     * Returns one capture of the text between $star and $end.
     *
     * $star and $end are inserted into the pattern as regex fragments, so they
     * must already be escaped for the '/' delimiter.
     *
     * @param string $html text to search
     * @param string $star regex fragment matching the start marker
     * @param string $end regex fragment matching the end marker
     * @param int $pos index of the match to return (0 = first)
     * @return string|false the captured text, or false when there is no match
     *                      at position $pos
     */
    public function get_word_single($html, $star, $end, $pos = 0)
    {
        $pattern = '/' . $star . '(.*?)' . $end . '/s';
        if (!preg_match_all($pattern, (string)$html, $matches)) {
            return false;
        }
        return isset($matches[1][$pos]) ? $matches[1][$pos] : false;
    }

    /**
     * Returns every capture of the text between $star and $end.
     *
     * @param string $html text to search
     * @param string $star regex fragment matching the start marker
     * @param string $end regex fragment matching the end marker
     * @return array|false list of captured strings, or false when nothing matches
     */
    public function get_word_all($html, $star, $end)
    {
        $pattern = '/' . $star . '(.*?)' . $end . '/s';
        if (!preg_match_all($pattern, (string)$html, $matches)) {
            return false;
        }
        return $matches[1];
    }

    /**
     * Returns the start/end regex fragments used by the scraper.
     *
     * @return array rule name => array('start' => ..., 'end' => ...); the
     *               'lottery_filter' and 'lottery_filter_bracket' entries hold
     *               nested rule sets of the same shape
     */
    public function get_patten()
    {
        return array(
            'lottory_menu' => array(
                'start' => '<ul id="mymenu">',
                'end' => '<\/ul>',
            ),
            'lottory_menu_detail' => array(
                'start' => '>',
                'end' => '<\/a>',
            ),
            'lottery_container' => array(
                'start' => '<div id="lottery_container">',
                'end' => '<\/div>',
            ),
            'lottery_game' => array(
                'start' => '<tr',
                'end' => '<\/tr>',
            ),
            'lottery_game_detail' => array(
                'start' => '>',
                'end' => '<\/td>',
            ),
            'lottery_filter' => array(
                'tds' => array(
                    'start' => '>',
                    'end' => '<',
                ),
                'a' => array(
                    'start' => '>',
                    'end' => '<\/a>',
                ),
                'span' => array(
                    'start' => '',
                    'end' => '<\/span>',
                ),
                'b' => array(
                    'start' => '>',
                    'end' => '<\/b>',
                ),
                // The original array declared 'u' twice; the later entry (empty
                // start) overrode the earlier one, so only that one is kept here,
                // at the position of the first declaration.
                'u' => array(
                    'start' => '',
                    'end' => '<\/u>',
                ),
                'font' => array(
                    'start' => '>',
                    'end' => '<\/font>',
                ),
            ),
            'lottery_filter_bracket' => array(
                'bracket' => array(
                    'start' => '<',
                    'end' => '>',
                ),
            ),
        );
    }
}
表结构的创建见下面这篇文章：【大数据】php代码抓取310win网站的体育比赛数据分析趋势（数据结构部分）