2

由于工作的原因,最近需要生成网站的sitemap.xml,谷歌百度了很多地方,没有发现合适可用的代码,三思之后还是决定自己写吧!虽然可能写得有所缺陷,但是毕竟是认认真真写的,希望对一些后来者有所帮助......

1、为什么要自己写脚本生成sitemap.xml?

很多人会说,在网上有现成的工具,扫一下就可以了,没有必要自己写。是的,的确是这样的。但是假设我们的网站经常更新,那么是不是每次我都要手动更新sitemap呢?我很懒,那么,有没有更好的方案呢?肯定是有的,我是否可以起一个定时任务,每天晚上更新一次呢?此时脚本就有用武之地了。

2、文档目录:

    配置文件 - config/config.ini.php
    sitemap主文件 - SiteMap.class.php

3、主文件代码

<?php
    /**
     * the script's main function is to help us to generate the target web's sitemap.xml file 
     *
     * @category sitemap
     * @author zero<maweibinguo@163.com>
     * @version 1.0
     */
    namespace Maweibinguo\SiteMap;
    /**
     * Crawls a web site starting from one URL and writes the pages it finds
     * to a sitemap.xml file.
     *
     * Settings are read from config/config.ini.php next to this file. The
     * constructor terminates the script when it is not run from the command line.
     */
    class SiteMap
    {
        /** XML namespace written on the <urlset> root element */
        const SCHEMA = 'http://www.sitemaps.org/schemas/sitemap/0.9';

        /**
         * urls that were already queued for crawling
         *
         * @var array
         * @access public
         */
        public $webUrlList = array();

        /**
         * collected sitemap entries (keys: Url, Title, Priority, ChangeFreq)
         *
         * @var array
         * @access public
         */
        public $siteMapList = array();

        /**
         * not used by the class itself; cookie settings come from the COOKIELIST config value
         *
         * @var bool
         * @access public
         */
        public $isUseCookie = false;

        /**
         * not used by the class itself; cookie settings come from the COOKIELIST config value
         *
         * @var string
         * @access public
         */
        public $cookieFilePath = '';

        /**
         * @var \XMLWriter
         * @access private
         */
        private $_xmlWriter = null;

        /**
         * variables defined by the config file, loaded on first use
         *
         * @var array|null
         * @access private
         */
        private $_configCache = null;

        /**
         * init basic config
         *
         * @access public
         */
        public function __construct()
        {
            $this->_xmlWriter = new \XMLWriter();

            $this->_enviromentTest();
        }

        /**
         * test the environment for the script; terminates unless running under the cli sapi
         *
         * @access private
         */
        private function _enviromentTest()
        {
            $sapiType = \php_sapi_name();
            if( strtolower($sapiType) !== 'cli' ) {
                echo ' The Script Must Run In Command Lines ', "\r\n";
                exit();
            }
        }

        /**
         * load one config value by the name of the variable defined in the config file
         *
         * The config file is read once per instance and cached, because this
         * method is called several times for every crawled url. The script is
         * terminated when the config file does not exist.
         *
         * @param string $configName
         * @return mixed the configured value, or null when the config file does not define it
         * @access public
         */
        public function loadConfig($configName)
        {
            if( $this->_configCache === null ) {
                $configPath = __DIR__ . '/config/config.ini.php';
                if( !file_exists($configPath) ) {
                    echo "Can not find config file", "\r\n";
                    exit();
                }
                $this->_configCache = $this->_readConfigFile($configPath);
            }

            if( !array_key_exists($configName, $this->_configCache) ) {
                return null;
            }

            return $this->_configCache[$configName];
        }

        /**
         * include the config file in an otherwise empty scope and collect its variables
         *
         * @param string $__configFilePath
         * @return array variable name => value
         * @access private
         */
        private function _readConfigFile($__configFilePath)
        {
            require $__configFilePath;

            $definedVars = get_defined_vars();
            /* the parameter itself is not a config value */
            unset($definedVars['__configFilePath']);

            return $definedVars;
        }

        /**
         * generate sitemap.xml for the web
         *
         * Terminates the script when the list is empty or the target file is not
         * writable.
         *
         * @param array $siteMapList entries as produced by generateSiteMapList()
         * @return bool true when the document was written, false when the file could not be opened
         * @access public
         */
        public function generateSiteMapXml($siteMapList)
        {
            if( !is_array($siteMapList) || count($siteMapList) <= 0 ) {
                echo 'The SiteMap Cotent Is Empty',"\r\n";
                exit();
            }

            /* make sure the target file exists and can be written */
            $siteMapPath = $this->loadConfig('SITEMAPPATH');
            if( !file_exists($siteMapPath) ) {
                /* native touch() instead of a shell command: no quoting problems */
                touch($siteMapPath);
            }
            if( !is_writable($siteMapPath) ) {
                echo 'Is Not Writeable',"\r\n";
                exit();
            }

            if( !$this->_xmlWriter->openURI($siteMapPath) ) {
                return false;
            }
            $this->_xmlWriter->startDocument('1.0', 'UTF-8');
            $this->_xmlWriter->setIndent(true);
            $this->_xmlWriter->startElement('urlset');
            $this->_xmlWriter->writeAttribute('xmlns', self::SCHEMA);
            foreach($siteMapList as $siteMapItem) {
                /* an entry without a location is useless in a sitemap */
                if( empty($siteMapItem['Url']) ) {
                    continue;
                }

                $this->_xmlWriter->startElement('url');
                $this->_xmlWriter->writeElement('loc', (string)$siteMapItem['Url']);

                /* NOTE(review): <title> is not one of the sitemap 0.9 <url> children
                 * (loc, lastmod, changefreq, priority); kept because the original
                 * output contained it - confirm whether consumers rely on it */
                $title = isset($siteMapItem['Title']) ? (string)$siteMapItem['Title'] : '';
                $this->_xmlWriter->writeElement('title', $title);

                /* the protocol only defines lowercase changefreq values */
                $changefreq = !empty($siteMapItem['ChangeFreq']) ? $siteMapItem['ChangeFreq'] : 'daily';
                $this->_xmlWriter->writeElement('changefreq', strtolower((string)$changefreq));

                /* is_numeric() instead of !empty() so that a priority of 0 is kept */
                $hasPriority = isset($siteMapItem['Priority']) && is_numeric($siteMapItem['Priority']);
                $priority = $hasPriority ? $siteMapItem['Priority'] : 0.5;
                $this->_xmlWriter->writeElement('priority', (string)$priority);

                $this->_xmlWriter->endElement();
            }
            $this->_xmlWriter->endElement();

            /* close the document and push the buffered output into the file */
            $this->_xmlWriter->endDocument();
            $this->_xmlWriter->flush();

            return true;
        }

        /**
         * send a GET request to the target url and return the response body
         *
         * @param string $url
         * @return string|false the body, or false for an invalid url, missing timeout
         *                      settings, a failed transfer or a status other than 200
         * @access public
         */
        public function sendRequest($url)
        {
            /* check the parameter */
            if( !filter_var($url, FILTER_VALIDATE_URL) ) {
                return false;
            }

            /* loadConfig() yields null for a missing value, so test for a usable number */
            $connectTimeOut = $this->loadConfig('CURLOPT_CONNECTTIMEOUT');
            $timeOut = $this->loadConfig('CURLOPT_TIMEOUT');
            if( !is_numeric($connectTimeOut) || !is_numeric($timeOut) ) {
                return false;
            }

            $handle = curl_init();
            if( $handle === false ) {
                return false;
            }
            curl_setopt($handle, CURLOPT_URL, $url);
            curl_setopt($handle, CURLOPT_HEADER, false);
            curl_setopt($handle, CURLOPT_AUTOREFERER, true);
            curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
            curl_setopt($handle, CURLOPT_CONNECTTIMEOUT, $connectTimeOut);
            curl_setopt($handle, CURLOPT_TIMEOUT, $timeOut);
            curl_setopt($handle, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; MSIE 5.01; Windows NT 5.0)" );
            $headersItem = array(    'Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                                    'Connection: Keep-Alive'     );
            curl_setopt($handle, CURLOPT_HTTPHEADER, $headersItem);
            curl_setopt($handle, CURLOPT_FOLLOWLOCATION, 1);

            /* optional cookie jar, shared by all requests */
            $cookieList = $this->loadConfig('COOKIELIST');
            $isUseCookie = is_array($cookieList) && !empty($cookieList['IsUseCookie']);
            if( $isUseCookie && !empty($cookieList['CookiePath']) ) {
                $cookieFilePath = $cookieList['CookiePath'];
                if( !file_exists($cookieFilePath) ) {
                    touch($cookieFilePath);
                }
                curl_setopt($handle, CURLOPT_COOKIEFILE, $cookieFilePath);
                curl_setopt($handle, CURLOPT_COOKIEJAR, $cookieFilePath);
            }

            $responseData = curl_exec($handle);
            $httpCode = curl_getinfo($handle, CURLINFO_HTTP_CODE);
            if( (int)$httpCode !== 200 ) {
                $responseData = false;
            }
            curl_close($handle);

            /* return response data */
            return $responseData;
        }

        /**
         * fetch the url, record its sitemap entry (url, title, priority, changefreq)
         * and recurse into every not yet visited link found on the page
         *
         * @param string $url
         * @access public
         */
        public function generateSiteMapList($url)
        {
            /* remember the page itself, otherwise a link back to the start url
             * would be crawled and listed a second time */
            if( !in_array($url, $this->webUrlList, true) ) {
                $this->webUrlList[] = $url;
            }

            $content = $this->sendRequest($url);
            if( $content === false ) {
                return;
            }

            $tagsList = $this->_parseContent($content, $url);

            $siteMapItem = array(    'Url' => trim($url),
                                    'Title' => trim($tagsList['Title'])    );
            $siteMapItem['Priority'] = $this->_calculatePriority($siteMapItem['Url']);
            $siteMapItem['ChangeFreq'] = $this->_calculateChangefreq($siteMapItem['Url']);
            $this->siteMapList[] = $siteMapItem;

            /* loop invariant, so read it once */
            $skipUrlList = $this->loadConfig('SKIP_URLLIST');
            if( !is_array($skipUrlList) ) {
                $skipUrlList = array();
            }

            foreach($tagsList['UrlItem'] as $nextUrl) {
                if( in_array($nextUrl, $this->webUrlList, true) ) {
                    continue;
                }
                foreach($skipUrlList as $keyWords) {
                    if( stripos($nextUrl, $keyWords) !== false ) {
                        /* url contains a skip keyword: go on with the next link */
                        continue 2;
                    }
                }
                $this->webUrlList[] = $nextUrl;
                echo $nextUrl,"\r\n";
                $this->generateSiteMapList($nextUrl);
            }
        }

        /**
         * get the sitemap entries collected so far
         *
         * @access public
         * @return array $siteMapList
         */
        public function getSiteMapList()
        {
            return $this->siteMapList;
        }

        /**
         * calculate the priority of the target url from the PRIORITYLIST config value
         *
         * @param string $targetUrl
         * @return mixed value of the first matching keyword, 0.5 when nothing matches
         * @access private
         */
        private function _calculatePriority($targetUrl)
        {
            return $this->_lookupByKeyword($targetUrl, 'PRIORITYLIST', 0.5);
        }

        /**
         * calculate the changefreq of the target url from the CHANGEFREQLIST config value
         *
         * @param string $targetUrl
         * @return string value of the first matching keyword, 'Daily' when nothing matches
         * @access private
         */
        private function _calculateChangefreq($targetUrl)
        {
            return $this->_lookupByKeyword($targetUrl, 'CHANGEFREQLIST', 'Daily');
        }

        /**
         * return the value of the first keyword of a keyword => value config map
         * that occurs (case-insensitively) in the url
         *
         * @param string $targetUrl
         * @param string $configName name of the config map
         * @param mixed $default returned for an invalid url, a missing map or no match
         * @return mixed
         * @access private
         */
        private function _lookupByKeyword($targetUrl, $configName, $default)
        {
            if( !filter_var($targetUrl, FILTER_VALIDATE_URL) ) {
                return $default;
            }

            $keywordMap = $this->loadConfig($configName);
            if( !is_array($keywordMap) ) {
                return $default;
            }

            foreach($keywordMap as $keyword => $value) {
                if( stripos($targetUrl, (string)$keyword) !== false ) {
                    return $value;
                }
            }

            return $default;
        }

        /**
         * turn a raw href value into an absolute url
         *
         * "./" and "../" segments are not resolved.
         *
         * @param string $url raw href value, possibly still wrapped in quotes
         * @param string $originUrl url of the page the link was found on
         * @access private
         * @return string absolute url, or '' when the link cannot be crawled
         */
        private function _formatUrl($url, $originUrl)
        {
            if( empty($url) || empty($originUrl) ) {
                return '';
            }

            /* the regex capture still carries its quotes; they have to go first,
             * otherwise a leading '#' is hidden behind them */
            $formatUrl = trim(trim(trim($url), '\'"'));

            /* a fragment never identifies a different page */
            $fragmentPos = strpos($formatUrl, '#');
            if( $fragmentPos !== false ) {
                $formatUrl = (string)substr($formatUrl, 0, $fragmentPos);
            }

            $badUrlItem = array(    '\\',
                                    '/' ,
                                    'javascript',
                                    'javascript:;',
                                    ''    );
            if( in_array($formatUrl, $badUrlItem, true) ) {
                return '';
            }

            /* already absolute; test the prefix, "http" may also occur inside a path */
            if( preg_match('#^https?://#i', $formatUrl) ) {
                return $formatUrl;
            }

            /* protocol relative: borrow the scheme of the page the link was found on */
            if( strpos($formatUrl, '//') === 0 ) {
                $scheme = parse_url($originUrl, PHP_URL_SCHEME);
                return ( $scheme ? $scheme : 'http' ) . ':' . $formatUrl;
            }

            /* any other scheme (javascript:, mailto:, tel:, ...) is not a page */
            if( preg_match('#^[a-z][a-z0-9+.\-]*:#i', $formatUrl) ) {
                return '';
            }

            /* root relative: append to the configured domain with exactly one slash */
            if( strpos($formatUrl, '/') === 0 ) {
                $domainName = $this->loadConfig('DOMAIN_NAME');
                return rtrim((string)$domainName, '/') . '/' . trim($formatUrl, '/');
            }

            /* document relative: append to the directory of the origin page */
            $basePage = preg_replace('/[?#].*$/s', '', $originUrl);
            $schemePos = strpos($basePage, '://');
            $lastSlash = strrpos($basePage, '/');
            if( $schemePos !== false && $lastSlash <= $schemePos + 2 ) {
                /* origin has no path at all, e.g. http://host */
                $baseDir = $basePage;
            } else {
                $baseDir = substr($basePage, 0, $lastSlash);
            }

            return $baseDir . '/' . $formatUrl;
        }

        /**
         * check whether the url belongs to the configured domain
         *
         * @param string $url
         * @return bool true when the url starts with the DOMAIN_NAME config value
         * @access private
         */
        private function _checkDomain($url)
        {
            if( !$url ) {
                return false;
            }

            $domainName = $this->loadConfig('DOMAIN_NAME');
            if( empty($domainName) ) {
                return false;
            }

            /* must be a prefix: the domain inside a query string of a foreign
             * url does not make that url part of this site */
            return stripos($url, $domainName) === 0;
        }

        /**
         * parse the response content and extract the links and the page title
         *
         * @param string $content
         * @param string $originUrl url the content was fetched from
         * @return array 'UrlItem' => list of absolute same-domain urls, 'Title' => string;
         *               both keys are always present
         * @access public
         */
        public function _parseContent($content, $originUrl)
        {
            /* init return data */
            $tagsList = array(    'UrlItem' => array(),
                                'Title' => ''    );

            if( empty($content) || empty($originUrl) ) {
                return $tagsList;
            }

            /* the u modifier makes every match fail on invalid utf-8, so only
             * use it when the content really is utf-8 */
            $utf8Flag = ( preg_match('//u', $content) === 1 ) ? 'u' : '';

            /* href attribute of <a> tags, wherever it stands inside the tag */
            $regStrForTagA = '#<\s*a\s+(?:[^>]*?\s)?href\s*=\s*(".*?"|\'.*?\')#i' . $utf8Flag;
            if( preg_match_all($regStrForTagA, $content, $matches) ) {
                $urlItem = array();
                foreach(array_unique($matches[1]) as $rawUrl) {
                    $formatUrl = $this->_formatUrl($rawUrl, $originUrl);
                    if( $formatUrl === '' ) {
                        continue;
                    }
                    if( $this->_checkDomain($formatUrl) === false ) {
                        continue;
                    }
                    /* keyed by url so that different spellings of one link collapse */
                    $urlItem[$formatUrl] = $formatUrl;
                }
                $tagsList['UrlItem'] = array_values($urlItem);
            }

            /* content of the title tag; s modifier because it may span several lines */
            $regStrForTitle = '#<\s*title(?:\s[^>]*)?>(.*?)<\s*/\s*title\s*>#is' . $utf8Flag;
            if( preg_match($regStrForTitle, $content, $matches) ) {
                $tagsList['Title'] = $matches[1];
            }

            /* return tagsList */
            return $tagsList;
        }
    }

    /* usage example: crawl the configured domain, then write the sitemap file */

    $startTime = microtime(true);
    $separator = "/***********************************************************************/";
    echo $separator . "\r\n";
    echo "/*                    start to run {$startTime}                        */" . "\r\n";
    echo $separator . "\r\n\r\n";

    /* the crawl starts at the DOMAIN_NAME config value and fills the entry list */
    $siteMap = new SiteMap();
    $siteMap->generateSiteMapList( $siteMap->loadConfig('DOMAIN_NAME') );
    $siteMap->generateSiteMapXml( $siteMap->getSiteMapList() );

    /* elapsed wall clock time in seconds */
    $takeTime = microtime(true) - $startTime;
    echo $separator . "\r\n";
    echo "/*               Had Done, \t it total take {$takeTime}      */" . "\r\n";
    echo $separator . "\r\n";
?> 

4、配置文件代码

<?php
    // cURL connect timeout in seconds (passed to CURLOPT_CONNECTTIMEOUT)
    $CURLOPT_CONNECTTIMEOUT = 5;

    // cURL total request timeout in seconds (passed to CURLOPT_TIMEOUT)
    $CURLOPT_TIMEOUT = 10;

    // domain to crawl; also the start url, and only urls containing it are followed
    $DOMAIN_NAME = 'http://www.example.com/';

    // urls containing one of these keywords are skipped
    $SKIP_URLLIST = array(    'addtocart'    );

    // cookie settings for the crawler requests
    $COOKIELIST = array(    'IsUseCookie' => true,
                            'CookiePath' => '/tmp/sitemapcookie'    );

    // where the generated sitemap file is saved
    $SITEMAPPATH = './sitemap.xml';

    // priority by url keyword; the first keyword found in the url wins
    $PRIORITYLIST = array(    'product' => '0.8',
                            'device' => '0.6',
                            'intelligent' => '0.4',
                            'course' => '0.2'    );

    // changefreq by url keyword; the first keyword found in the url wins.
    // Values are lowercase because the sitemap protocol only defines
    // always, hourly, daily, weekly, monthly, yearly and never.
    $CHANGEFREQLIST = array(    'product' => 'always',
                                'device' => 'hourly',
                                'intelligent' => 'daily',
                                'course' => 'weekly',
                                'login' => 'monthly',
                                'about' => 'yearly'    );

?>

5、获取源码包

单击下载源代码 (提取码:fc1c)


maweibinguo
783 声望36 粉丝

后端开发工程师一枚, keep moving