使用 PHP 扩展 cURL 爬取百度热歌单曲

要求安装 PHP 的 cURL 扩展
爬虫主要运用正则表达式来解析页面

<?php
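/*
Fetch a tag page (http://music.baidu.com/tag/tagname), match the relevant
HTML fragments, and produce one link per song in the following format:
<a href="http://music.baidu.com/song/121353608" target="_blank" class="" data-provider="" title="刘珂矣 半壶纱">半壶纱</a>

The result is then meant to be written to a PHP file of the form
<?php
return array();
?>
(this script itself only prints the result with print_r).
*/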
/**
 * Small scraper: downloads a page with cURL and extracts song links from it
 * using regular expressions.
 */
class Fetch {

    /**
     * Download $url and convert the page into a list of song links.
     *
     * @param string $url Page to fetch.
     * @return array List of '<a ...>' HTML strings; empty when the download
     *               fails, the body is empty, or nothing matches.
     */
    public function getData($url) {
        $html = $this->http($url);
        // curl_exec() yields false on failure; treat that and an empty body alike.
        if (!is_string($html) || $html === '') {
            return array();
        }
        return $this->parseHtml($html);
    }

    /**
     * Fetch the body of $url over HTTP, following redirects.
     *
     * @param string $url
     * @return string|false Response body, or false when the request fails.
     */
    public function http($url) {
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }
        curl_setopt($ch, CURLOPT_URL, $url);
        // Return the body as a string instead of echoing it.
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        // Bound the request so a redirect loop or a stalled server cannot
        // hang the script forever (cURL has no overall timeout by default).
        curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);

        return curl_exec($ch);
    }

    /**
     * Extract song ids, song titles and artist names from the page and build
     * one link per song id. The three lists are paired by position.
     *
     * @param string $str Raw HTML of the page.
     * @return array List of '<a ...>' HTML strings, one per matched song id.
     */
    public function parseHtml($str) {
        // Capture groups pull out the value directly; no post-processing of
        // the surrounding markup is needed.
        $ids    = $this->captureAll('/href="\/song\/(\d+)/', $str);               // song id
        // [^"]+ stops at the closing quote, so values containing digits are
        // kept and a match can never run into the next attribute or tag.
        $titles = $this->captureAll('/title="收藏([^"]+)" href="#">/', $str);      // song title
        $names  = $this->captureAll('/author_list" title="([^"]+)">/', $str);     // artist name

        $links = array();
        foreach ($ids as $i => $id) {
            // The lists can differ in length; fall back to an empty string
            // rather than reading a missing index.
            $title = isset($titles[$i]) ? $titles[$i] : '';
            $name  = isset($names[$i]) ? $names[$i] : '';
            // Values come straight from HTML attributes of the source page,
            // so they are inserted as-is (already attribute-encoded there).
            $links[] = '<a href="http://music.baidu.com/song/' . $id . '" target="_blank" class="" data-provider="" title="' . $name . ' ' . $title . '">' . $title . '</a>';
        }
        return $links;
    }

    /**
     * Return every value of the first capture group of $pattern in $subject.
     *
     * @param string $pattern PCRE pattern with one capture group.
     * @param string $subject Text to search.
     * @return array Captured strings in document order; empty on no match or regex error.
     */
    private function captureAll($pattern, $subject) {
        $matches = array();
        // preg_match_all() returns false on error and 0 on no match.
        if (preg_match_all($pattern, $subject, $matches) > 0) {
            return $matches[1];
        }
        return array();
    }
}

// Crawl the Baidu Music tag page for "热歌" (URL-encoded in the path below)
// and dump the generated song links.
$tagPageUrl = 'http://music.baidu.com/tag/%E7%83%AD%E6%AD%8C';
$crawler = new Fetch();

print_r($crawler->getData($tagPageUrl));

dylucas
14 声望0 粉丝

« 上一篇
编译PHP扩展