星座屋(http://www.xzw.com/fortune/)运势界面:
最终爬取数据结果展示在APP上的效果:
下面是使用正则表达式实现的代码，是我一年多前花了半天时间写的。现在想来，如果使用 Scrapy 或者 phpspider，只需几行代码就能搞定，不必这么费力气了~
<?php
/**
* 星座运势
* author: pengfei
* http://www.xzw.com/fortune/aries/ 今日
* http://www.xzw.com/fortune/aries/1.html 明日
* http://www.xzw.com/fortune/aries/2.html 本周
* http://www.xzw.com/fortune/aries/3.html 本月
* http://www.xzw.com/fortune/aries/4.html 今年
* http://www.xzw.com/fortune/aries/5.html 爱情
*/
// Marker constant defined before the bootstrap is loaded; presumably checked by
// included project files to reject direct access — TODO confirm in includes/init.php.
define('IN_FTE', true);
// Load the project bootstrap that lives next to this script. It is expected to
// provide the JSON class instantiated below — verify against includes/init.php.
require(dirname(__FILE__) . '/includes/init.php');
// Date functions used later in this script (date(), mktime()) run in this timezone.
date_default_timezone_set('Asia/Shanghai');
// Encoder used to serialise the result arrays emitted at the end of the script.
$json = new JSON();
/*
$constellation = array (
'白羊座' => array('aries', '03/21-04/19'),
'金牛座' => array('taurus', '04/20-05/20'),
'双子座' => array('gemini', '05/21-06/21'),
'巨蟹座' => array('cancer', '06/22-07/22'),
'狮子座' => array('leo', '07/23-08/22'),
'处女座' => array('virgo', '08/23-09/22'),
'天秤座' => array('libra', '09/23-10/23'),
'天蝎座' => array('scorpio', '10/24-11/22'),
'射手座' => array('sagittarius', '11/23-12/21'),
'魔羯座' => array('capricorn', '12/22-01/19'),
'水瓶座' => array('aquarius', '01/20-02/18'),
'双鱼座' => array('pisces', '02/19-03/20')
);*/
// ---------------------------------------------------------------------------
// Request parameters
//   xingzuo  : constellation slug, must be one of $all_xingzuo
//   category : 0 (or absent) = today, 1 = tomorrow, 2 = week, 3 = month,
//              4 = year, 5 = love (see the URL list in the file header)
// Only scalar values are accepted: an array-valued parameter such as
// "?category[]=x" would otherwise make intval() return 1 and trim() fail.
// ---------------------------------------------------------------------------
$constellation = null;
if (isset($_REQUEST['xingzuo']) && is_scalar($_REQUEST['xingzuo'])) {
    $constellation = trim((string) $_REQUEST['xingzuo']);
}

// A missing or empty category means "today", which is category 0.
$category = 0;
if (isset($_REQUEST['category']) && is_scalar($_REQUEST['category'])) {
    $category = intval($_REQUEST['category']);
}

$all_xingzuo = array(
    'aries',
    'taurus',
    'gemini',
    'cancer',
    'leo',
    'virgo',
    'libra',
    'scorpio',
    'sagittarius',
    'capricorn',
    'aquarius',
    'pisces'
);
$all_category = array(0, 1, 2, 3, 4, 5);

// Strict comparison: do not rely on PHP's loose null/0/string juggling.
if (!in_array($constellation, $all_xingzuo, true) || !in_array($category, $all_category, true)) {
    exit('Params error');
}

// Category 0 is served from the directory index, every other category from "<n>.html".
$domain = 'http://www.xzw.com/fortune/';
$apiUrl = $domain . $constellation . '/';
if ($category) {
    $apiUrl .= $category . '.html';
}

// NOTE(review): the responses emitted later in this script are JSON-encoded, yet
// the declared content type is text/html. Left unchanged because existing
// clients may depend on it — confirm before switching to application/json.
header("Content-type: text/html; charset=utf-8");
/**
 * Fetch one fortune page and extract its data with regular expressions.
 *
 * The document is read with file_get_contents(), converted from GBK to UTF-8,
 * and two regions are parsed:
 *   - the first <ul> found inside <div class="c_main"> / <dl> / <dd>: one entry
 *     per <label>. When the entry contains an <em style="width:N..."> bar the
 *     value is N / 80 formatted with two decimals, otherwise the value is the
 *     text that follows the first ':' of the entry.
 *   - the <p> paragraphs inside <div class="c_cont">: the text before <span>
 *     becomes the label, the text inside the span becomes the value (all
 *     whitespace removed).
 *
 * @param string $url Page URL (anything file_get_contents() can read).
 * @return array Empty array when the page cannot be fetched or nothing could be
 *               parsed. Otherwise array('ul' => list of label/value pairs) plus
 *               a 'cont' list of label/value pairs when paragraphs were found.
 */
function getFortuneData($url){
    $fortune_data = array();

    // Warnings are suppressed because failure is handled explicitly right below.
    $data = @file_get_contents($url);
    if ($data === false || $data === '') {
        return $fortune_data;
    }
    $data = mb_convert_encoding($data, 'utf-8', 'gbk');

    // ---- rating / attribute list ------------------------------------------
    $fortune_data['ul'] = array();
    if (preg_match('/<div class="c_main">(.*)<\/div>/ism', $data, $div_c_main)
        && preg_match('/<dl>(.*?)<\/dl>/ism', $div_c_main[1], $dl)
        && preg_match('/<dd>(.*?)<\/dd>/ism', $dl[1], $dd)
        && preg_match('/<ul>(.*?)<\/ul>/ism', $dd[1], $ul)
    ) {
        // Turn every <label> into a split marker and drop the wrapping tags.
        $list = str_replace('<label>', '{label}', $ul[1]);
        $list = preg_replace('/<span[^>]*?>/ism', '', $list);
        $list = preg_replace('/<li[^>]*>/ism', '', $list);
        $list = preg_replace('/<\/label>/ism', '', $list);
        $list = preg_replace('/<\/li>/ism', '', $list);

        $items = explode('{label}', $list);
        array_shift($items); // text before the first label is not an entry

        foreach ($items as $item) {
            // Split on the first ':' only. Both the style="width:..." attribute
            // and the values themselves may contain further colons.
            $parts = explode(":", $item, 2);

            if (preg_match('/<em style="width:(\d{1,}).*">/ism', $item, $width)) {
                // The bar width in pixels is scaled by 1/80 to obtain the rating.
                $fortune_data['ul'][] = array(
                    'label' => preg_replace('/<em[^>]*?>/ism', '', $parts[0]),
                    'value' => sprintf('%0.2f', floatval($width[1] / 80)),
                );
            } else {
                $fortune_data['ul'][] = array(
                    'label' => $parts[0],
                    'value' => isset($parts[1]) ? $parts[1] : null,
                );
            }
        }
    }

    // ---- descriptive paragraphs -------------------------------------------
    if (preg_match('/<div class="c_cont">(.*?)<\/div>/ism', $data, $cont)) {
        $p_cont = preg_replace('/<strong[^>]*?>/ism', '', $cont[1]);
        $p_cont = str_replace('<span>', '{span}', $p_cont);
        $p_cont = str_replace('<p>', '{p}', $p_cont);
        // Strip every remaining tag, then all whitespace.
        $p_cont = preg_replace("'<[/!]*?[^<>]*?>'si", "", $p_cont);
        $p_cont = preg_replace('/\s+/', '', $p_cont);

        $paragraphs = explode('{p}', $p_cont);
        array_shift($paragraphs); // text before the first <p> is not a paragraph

        foreach ($paragraphs as $paragraph) {
            $pair = explode('{span}', $paragraph);
            $fortune_data['cont'][] = array(
                'label' => $pair[0],
                'value' => isset($pair[1]) ? $pair[1] : null,
            );
        }
    }

    // Nothing recognisable on the page: report failure as an empty array so
    // that callers testing empty($data) can detect it.
    if (empty($fortune_data['ul']) && empty($fortune_data['cont'])) {
        return array();
    }

    return $fortune_data;
}
// ---------------------------------------------------------------------------
// Serve the fortune data, using a per-constellation/category file cache that
// is considered valid when it was written today.
// ---------------------------------------------------------------------------
$write_result = ''; // cache write status; stays empty when nothing was written
$local_data = '';
$result = array();
$data = array();

// NOTE(review): cache files carry a .php extension although they only hold
// serialize()d data. If fortune_data/ is web-accessible, consider a neutral
// extension or denying direct access to that directory.
$fileName = !empty($category) ? $constellation.'-'.$category.'.php' : $constellation.'.php';
$fortune_data_path = 'fortune_data/'.$fileName;

if (file_exists($fortune_data_path)) {
    $local_data = @file_get_contents($fortune_data_path);
}

if (!empty($local_data)) {
    // The cache is fresh when it was written at or after today's midnight.
    $todayStart = mktime(0, 0, 0);
    if (filemtime($fortune_data_path) >= $todayStart) {
        // The file is produced by write_fortune_cache() in this script. A corrupt
        // or empty payload is treated as a cache miss instead of being served.
        $cached = @unserialize($local_data);
        if (is_array($cached) && !empty($cached)) {
            $data = $cached;
        }
    }
}

if (empty($data)) {
    // No usable cache: scrape the page. The cache is only written when the
    // scrape produced data, so a failed fetch can never overwrite a previous
    // cache file or be served as a "fresh" result later in the day.
    $data = getFortuneData($apiUrl);
    if (!empty($data)) {
        $write_result = write_fortune_cache($data, $fileName);
    }
}

if (empty($data)) {
    $result['result'] = -1;
    $result['msg'] = '数据抓取失败!';
    $result['write_result'] = $write_result;
    $result['data'] = array();
} else {
    $result['result'] = 0;
    $result['msg'] = 'success';
    $result['write_result'] = $write_result;
    $result['data'] = $data;
}
exit($json->encode($result));
/**
 * Store scraped fortune data in the file cache.
 *
 * The data is serialize()d and written to ./fortune_data/<fileName>, relative to
 * the current working directory. A write failure (missing directory, missing
 * permissions, partial write) is reported through the return value instead of
 * terminating the request, so the caller can still deliver the data.
 *
 * @param mixed  $data     Value to cache.
 * @param string $fileName Cache file name inside fortune_data/.
 * @return string 'success' when the whole payload was written, 'fail' otherwise.
 */
function write_fortune_cache($data, $fileName){
    $payload = serialize($data);
    // Warnings are suppressed because the result is checked explicitly.
    $written = @file_put_contents('./fortune_data/'.$fileName, $payload);

    return ($written === strlen($payload)) ? 'success' : 'fail';
}
?>
End
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用。你还可以使用 @ 来通知其他用户。