由于工作的原因,最近需要生成网站的sitemap.xml,谷歌百度了很多地方,没有发现合适可用的代码,三思之后还是决定自己写吧!虽然可能写得有所缺陷,但毕竟是认认真真写的,希望对一些后来者有所帮助......
1、为什么要自己写脚本生成sitemap.xml?
很多人会说,网上有现成的工具,扫一下就可以了,没有必要自己写。是的,的确是这样。但是假设我们的网站经常更新,那么是不是每次都要手动更新sitemap呢?我很懒,那么有没有更好的方案呢?肯定是有的:我可以起一个定时任务,每天晚上更新一次,此时脚本就有用武之地了。
2、文档目录:
配置文件 - config/config.ini.php
sitemap主文件 - SiteMap.class.php
3、主文件代码
<?php
/**
* the script's main function is to help us to generate the target web's sitemap.xml file
*
* @category sitemap
* @author zero<maweibinguo@163.com>
* @version 1.0
*/
namespace Maweibinguo\SiteMap;
class SiteMap
{
    /** XML namespace of the sitemap protocol, written on the <urlset> element. */
    const SCHEMA = 'http://www.sitemaps.org/schemas/sitemap/0.9';

    /**
     * Every URL that has already been scheduled for crawling.
     *
     * @var array
     * @access public
     */
    public $webUrlList = array();

    /**
     * Collected sitemap entries; each one is an array with the keys
     * 'Url', 'Title', 'Priority' and 'ChangeFreq'.
     *
     * @var array
     * @access public
     */
    public $siteMapList = array();

    /**
     * Not read by this class; the cookie switch actually used comes from
     * the COOKIELIST config entry. Kept for backward compatibility.
     *
     * @var bool
     * @access public
     */
    public $isUseCookie = false;

    /**
     * Not read by this class; see $isUseCookie.
     *
     * @var string
     * @access public
     */
    public $cookieFilePath = '';

    /**
     * @var \XMLWriter
     * @access private
     */
    private $_xmlWriter = '';

    /**
     * Configuration values keyed by name, or null while not loaded yet.
     *
     * @var array|null
     * @access private
     */
    private $_config = null;

    /**
     * Check the runtime environment and prepare the XML writer.
     *
     * @param array|null $config optional configuration keyed by config name;
     *                           when omitted, config/config.ini.php is read
     *                           on the first loadConfig() call
     * @access public
     */
    public function __construct($config = null)
    {
        $this->_enviromentTest();
        if (is_array($config)) {
            $this->_config = $config;
        }
        $this->_xmlWriter = new \XMLWriter();
    }

    /**
     * Abort unless the script is started from the command line.
     *
     * @access private
     */
    private function _enviromentTest()
    {
        $sapiType = \php_sapi_name();
        if (strtolower($sapiType) != 'cli') {
            echo ' The Script Must Run In Command Lines ', "\r\n";
            exit();
        }
    }

    /**
     * Return one configuration value by name.
     *
     * The config file is read only once and then cached, because this method
     * is called for every crawled link.
     *
     * @param string $configName name of the variable defined in the config file
     * @return mixed the configured value, or false when it is not defined
     * @access public
     */
    public function loadConfig($configName)
    {
        if ($this->_config === null) {
            $configPath = __DIR__ . '/config/config.ini.php';
            if (!file_exists($configPath)) {
                echo "Can not find config file", "\r\n";
                exit();
            }
            $this->_config = self::_readConfigFile($configPath);
        }

        if (!array_key_exists($configName, $this->_config)) {
            return false;
        }

        return $this->_config[$configName];
    }

    /**
     * Include the config file in an isolated scope and collect its variables.
     *
     * @param string $configPath
     * @return array variables defined by the config file, keyed by name
     * @access private
     */
    private static function _readConfigFile($configPath)
    {
        require $configPath;
        $loaded = get_defined_vars();
        /* the parameter itself is not a config value */
        unset($loaded['configPath']);

        return $loaded;
    }

    /**
     * Write the collected entries to the file named by the SITEMAPPATH config.
     *
     * NOTE(review): <title> is not an element of the sitemaps.org 0.9 <url>
     * schema; it is still written because existing consumers may rely on it.
     *
     * @param array $siteMapList entries as produced by getSiteMapList()
     * @return bool true when the document was written completely
     * @access public
     */
    public function generateSiteMapXml($siteMapList)
    {
        if (!is_array($siteMapList) || count($siteMapList) <= 0) {
            echo 'The SiteMap Cotent Is Empty', "\r\n";
            exit();
        }

        $siteMapPath = $this->loadConfig('SITEMAPPATH');
        $hasPath = is_string($siteMapPath) && $siteMapPath !== '';
        if ($hasPath && !file_exists($siteMapPath)) {
            /* native call instead of a shell command built from config data */
            touch($siteMapPath);
        }
        if (!$hasPath || !is_writable($siteMapPath)) {
            echo 'Is Not Writeable', "\r\n";
            exit();
        }

        $writer = $this->_xmlWriter;
        if (!$writer->openURI($siteMapPath)) {
            return false;
        }
        $writer->startDocument('1.0', 'UTF-8');
        $writer->setIndent(true);
        $writer->startElement('urlset');
        $writer->writeAttribute('xmlns', self::SCHEMA);

        foreach ($siteMapList as $siteMapItem) {
            if (!is_array($siteMapItem) || empty($siteMapItem['Url'])) {
                continue;
            }
            $title = isset($siteMapItem['Title']) ? $siteMapItem['Title'] : '';
            /* the sitemap protocol only defines lowercase changefreq values */
            $changefreq = !empty($siteMapItem['ChangeFreq'])
                ? strtolower($siteMapItem['ChangeFreq'])
                : 'daily';
            /* isset instead of empty so that a priority of 0 is kept */
            $priority = (isset($siteMapItem['Priority']) && $siteMapItem['Priority'] !== '')
                ? $siteMapItem['Priority']
                : 0.5;

            $writer->startElement('url');
            $writer->writeElement('loc', (string) $siteMapItem['Url']);
            $writer->writeElement('title', (string) $title);
            $writer->writeElement('changefreq', (string) $changefreq);
            $writer->writeElement('priority', (string) $priority);
            $writer->endElement();
        }

        $writer->endElement();
        /* close the document and push the buffer to disk explicitly */
        $writer->endDocument();

        return $writer->flush() !== false;
    }

    /**
     * Fetch a URL with curl and return the response body.
     *
     * @param string $url
     * @return mixed the body as string, or false when the URL is invalid, a
     *               timeout setting is missing, or the final HTTP status is not 200
     * @access public
     */
    public function sendRequest($url)
    {
        if (!filter_var($url, FILTER_VALIDATE_URL)) {
            return false;
        }
        $connectTimeOut = $this->loadConfig('CURLOPT_CONNECTTIMEOUT');
        if ($connectTimeOut === false) {
            return false;
        }
        $timeOut = $this->loadConfig('CURLOPT_TIMEOUT');
        if ($timeOut === false) {
            return false;
        }

        $handle = curl_init();
        if ($handle === false) {
            return false;
        }
        curl_setopt($handle, CURLOPT_URL, $url);
        curl_setopt($handle, CURLOPT_HEADER, false);
        curl_setopt($handle, CURLOPT_AUTOREFERER, true);
        curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($handle, CURLOPT_CONNECTTIMEOUT, $connectTimeOut);
        curl_setopt($handle, CURLOPT_TIMEOUT, $timeOut);
        curl_setopt($handle, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; MSIE 5.01; Windows NT 5.0)");
        curl_setopt($handle, CURLOPT_HTTPHEADER, array(
            'Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Connection: Keep-Alive',
        ));
        curl_setopt($handle, CURLOPT_FOLLOWLOCATION, 1);

        $cookieList = $this->loadConfig('COOKIELIST');
        if (is_array($cookieList) && !empty($cookieList['IsUseCookie']) && !empty($cookieList['CookiePath'])) {
            $cookieFilePath = $cookieList['CookiePath'];
            if (!file_exists($cookieFilePath)) {
                /* native call instead of a shell command built from config data */
                touch($cookieFilePath);
            }
            curl_setopt($handle, CURLOPT_COOKIEFILE, $cookieFilePath);
            curl_setopt($handle, CURLOPT_COOKIEJAR, $cookieFilePath);
        }

        $responseData = curl_exec($handle);
        $httpCode = curl_getinfo($handle, CURLINFO_HTTP_CODE);
        if ((int) $httpCode !== 200) {
            $responseData = false;
        }
        curl_close($handle);

        return $responseData;
    }

    /**
     * Crawl the site starting at $url and fill $siteMapList.
     *
     * Pages are visited depth-first in document order. An explicit stack is
     * used instead of recursion so that the call depth does not grow with the
     * length of the longest link chain.
     *
     * @param string $url start URL
     * @access public
     */
    public function generateSiteMapList($url)
    {
        $skipUrlList = $this->loadConfig('SKIP_URLLIST');
        if (!is_array($skipUrlList)) {
            $skipUrlList = array();
        }

        /* mark the start page as known, otherwise the first link pointing
           back to it would be crawled and listed a second time */
        if (!in_array($url, $this->webUrlList, true)) {
            $this->webUrlList[] = $url;
        }

        /* each stack frame holds the links of one page that are still pending */
        $stack = array($this->_crawlPage($url));
        while (!empty($stack)) {
            $top = count($stack) - 1;
            if (empty($stack[$top])) {
                array_pop($stack);
                continue;
            }

            $nextUrl = array_shift($stack[$top]);
            if (in_array($nextUrl, $this->webUrlList, true)) {
                continue;
            }
            foreach ($skipUrlList as $keyWords) {
                if (stripos($nextUrl, $keyWords) !== false) {
                    continue 2;
                }
            }

            $this->webUrlList[] = $nextUrl;
            echo $nextUrl, "\r\n";
            $stack[] = $this->_crawlPage($nextUrl);
        }
    }

    /**
     * Fetch one page, record its sitemap entry and return the links found on it.
     *
     * @param string $url
     * @return array same-site URLs linked from the page; empty when the request failed
     * @access private
     */
    private function _crawlPage($url)
    {
        $content = $this->sendRequest($url);
        if ($content === false) {
            return array();
        }

        $tagsList = $this->_parseContent($content, $url);
        $title = isset($tagsList['Title']) ? $tagsList['Title'] : '';
        $pageUrl = trim($url);

        $this->siteMapList[] = array(
            'Url' => $pageUrl,
            'Title' => trim($title),
            'Priority' => $this->_calculatePriority($pageUrl),
            'ChangeFreq' => $this->_calculateChangefreq($pageUrl),
        );

        if (isset($tagsList['UrlItem']) && is_array($tagsList['UrlItem'])) {
            return array_values($tagsList['UrlItem']);
        }

        return array();
    }

    /**
     * Return the sitemap entries collected so far.
     *
     * @access public
     * @return array $siteMapList
     */
    public function getSiteMapList()
    {
        return $this->siteMapList;
    }

    /**
     * Look up the priority for a URL in the PRIORITYLIST config.
     *
     * @param string $targetUrl
     * @return mixed value of the first matching keyword, 0.5 when nothing matches
     * @access private
     */
    private function _calculatePriority($targetUrl)
    {
        return $this->_matchKeyword($targetUrl, 'PRIORITYLIST', 0.5);
    }

    /**
     * Look up the change frequency for a URL in the CHANGEFREQLIST config.
     *
     * @param string $targetUrl
     * @return string value of the first matching keyword, 'Daily' when nothing matches
     * @access private
     */
    private function _calculateChangefreq($targetUrl)
    {
        return $this->_matchKeyword($targetUrl, 'CHANGEFREQLIST', 'Daily');
    }

    /**
     * Return the value of the first keyword of a keyword => value config map
     * that occurs (case-insensitively) in the URL.
     *
     * @param string $targetUrl
     * @param string $configName name of the config map
     * @param mixed $default returned for invalid URLs, missing maps and no match
     * @return mixed
     * @access private
     */
    private function _matchKeyword($targetUrl, $configName, $default)
    {
        if (!filter_var($targetUrl, FILTER_VALIDATE_URL)) {
            return $default;
        }
        $keywordMap = $this->loadConfig($configName);
        if (!is_array($keywordMap)) {
            return $default;
        }
        foreach ($keywordMap as $keyword => $value) {
            if (stripos($targetUrl, (string) $keyword) !== false) {
                return $value;
            }
        }

        return $default;
    }

    /**
     * Turn a raw href value into an absolute URL.
     *
     * @param string $url raw href value, possibly still wrapped in quotes
     * @param string $originUrl URL of the page the link was found on
     * @access private
     * @return string absolute URL, or '' when the link cannot be crawled
     */
    private function _formatUrl($url, $originUrl)
    {
        if (empty($url) || empty($originUrl)) {
            return '';
        }

        /* strip whitespace and the quotes captured together with the value */
        $formatUrl = trim($url, " \t\n\r\0\x0B'\"");
        /* attribute values are entity-encoded in HTML (e.g. &amp; in queries) */
        $formatUrl = html_entity_decode($formatUrl, ENT_QUOTES, 'UTF-8');
        /* a fragment never reaches the server; keeping it would make the
           same page look like several different URLs */
        $hashPos = strpos($formatUrl, '#');
        if ($hashPos !== false) {
            $formatUrl = (string) substr($formatUrl, 0, $hashPos);
        }
        $formatUrl = trim($formatUrl);

        $badUrlItem = array('\\', '/', 'javascript', '');
        if (in_array($formatUrl, $badUrlItem, true)) {
            return '';
        }

        /* already absolute */
        if (preg_match('#^https?://#i', $formatUrl)) {
            return $formatUrl;
        }

        /* protocol-relative link: reuse the scheme of the current page */
        if (strpos($formatUrl, '//') === 0) {
            $scheme = parse_url($originUrl, PHP_URL_SCHEME);
            return ($scheme ? $scheme : 'http') . ':' . $formatUrl;
        }

        /* any other scheme (javascript:, mailto:, tel:, ...) is not crawlable */
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $formatUrl)) {
            return '';
        }

        /* handle the query separately so that it is not path-normalised */
        $query = '';
        $queryPos = strpos($formatUrl, '?');
        if ($queryPos !== false) {
            $query = substr($formatUrl, $queryPos);
            $formatUrl = (string) substr($formatUrl, 0, $queryPos);
        }

        /* root-relative link */
        if (strpos($formatUrl, '/') === 0) {
            $domainName = (string) $this->loadConfig('DOMAIN_NAME');
            $path = $this->_removeDotSegments('/' . ltrim($formatUrl, '/'));
            return rtrim($domainName, '/') . $path . $query;
        }

        /* document-relative link: resolve against the directory of the page */
        $base = preg_replace('/[?#].*$/s', '', $originUrl);
        if ($formatUrl === '') {
            /* query-only reference such as "?page=2" keeps the page path */
            return $base . $query;
        }
        $schemeEnd = strpos($base, '://');
        $authorityStart = ($schemeEnd === false) ? 0 : $schemeEnd + 3;
        $pathStart = strpos($base, '/', $authorityStart);
        if ($pathStart === false) {
            /* the page URL has no path at all, e.g. http://host */
            $root = $base;
            $directory = '/';
        } else {
            $root = substr($base, 0, $pathStart);
            $directory = substr($base, $pathStart, strrpos($base, '/') - $pathStart + 1);
        }

        return $root . $this->_removeDotSegments($directory . $formatUrl) . $query;
    }

    /**
     * Resolve "." and ".." segments of an absolute path.
     *
     * @param string $path path starting with '/'
     * @return string normalised path
     * @access private
     */
    private function _removeDotSegments($path)
    {
        $segments = explode('/', $path);
        $lastIndex = count($segments) - 1;
        $resolved = array();

        foreach ($segments as $index => $segment) {
            if ($segment === '.' || $segment === '..') {
                /* never pop the leading empty segment that stands for the root */
                if ($segment === '..' && count($resolved) > 1) {
                    array_pop($resolved);
                }
                /* a trailing dot segment refers to a directory */
                if ($index === $lastIndex) {
                    $resolved[] = '';
                }
                continue;
            }
            $resolved[] = $segment;
        }

        return implode('/', $resolved);
    }

    /**
     * Check that a URL lies under the configured DOMAIN_NAME.
     *
     * The domain has to be the beginning of the URL; a URL that merely
     * mentions the domain somewhere (for example in a query parameter) or
     * whose host only starts with it is rejected.
     *
     * @param string $url
     * @return bool
     * @access private
     */
    private function _checkDomain($url)
    {
        if (!$url) {
            return false;
        }

        $prefix = rtrim((string) $this->loadConfig('DOMAIN_NAME'), '/');
        if ($prefix === '' || stripos($url, $prefix) !== 0) {
            return false;
        }

        /* the prefix must end at a URL boundary */
        $next = substr($url, strlen($prefix), 1);

        return $next === '' || $next === false || in_array($next, array('/', '?', '#'), true);
    }

    /**
     * Extract the same-site links and the title from an HTML document.
     *
     * @param string $content HTML source
     * @param string $originUrl URL the content was fetched from
     * @return array array('UrlItem' => list of absolute URLs, 'Title' => string)
     * @access public
     */
    public function _parseContent($content, $originUrl)
    {
        $tagsList = array('UrlItem' => array(), 'Title' => '');
        if (empty($content) || empty($originUrl)) {
            return $tagsList;
        }

        /* href may follow other attributes; the patterns are pure ASCII, so no
           /u flag is used and pages in other encodings can still be scanned */
        $regStrForTagA = '#<\s*a\s+(?:[^>]*?\s)?href\s*=\s*("[^"]*"|\'[^\']*\')#is';
        if (preg_match_all($regStrForTagA, $content, $matches)) {
            $urlItem = array();
            foreach (array_unique($matches[1]) as $url) {
                $formatUrl = $this->_formatUrl($url, $originUrl);
                if (empty($formatUrl) || $this->_checkDomain($formatUrl) === false) {
                    continue;
                }
                $urlItem[] = $formatUrl;
            }
            /* different raw hrefs can resolve to the same absolute URL */
            $tagsList['UrlItem'] = array_values(array_unique($urlItem));
        }

        $regStrForTitle = '#<\s*title\b[^>]*>(.*?)<\s*/\s*title\s*>#is';
        if (preg_match($regStrForTitle, $content, $matches)) {
            $title = html_entity_decode($matches[1], ENT_QUOTES, 'UTF-8');
            /* only keep titles that are valid UTF-8, the sitemap is written as UTF-8 */
            if (preg_match('//u', $title) === 1) {
                $tagsList['Title'] = $title;
            }
        }

        return $tagsList;
    }
}
/* example run: crawl the configured domain and write its sitemap.xml */
$bannerLine = "/***********************************************************************/";

$startedAt = microtime(true);
echo $bannerLine, "\r\n";
echo "/* start to run " . $startedAt . " */", "\r\n";
echo $bannerLine, "\r\n\r\n";

/* crawl from the configured domain root, then dump what was collected */
$generator = new SiteMap();
$generator->generateSiteMapList($generator->loadConfig('DOMAIN_NAME'));
$generator->generateSiteMapXml($generator->getSiteMapList());

$elapsed = microtime(true) - $startedAt;
echo $bannerLine, "\r\n";
echo "/* Had Done, \t it total take " . $elapsed . " */", "\r\n";
echo $bannerLine, "\r\n";
?>
4、配置文件代码
<?php
// curl connect timeout (passed to CURLOPT_CONNECTTIMEOUT)
$CURLOPT_CONNECTTIMEOUT = 5;
// curl total request timeout (passed to CURLOPT_TIMEOUT)
$CURLOPT_TIMEOUT = 10;
// site root; crawling starts here and only links under it are followed
$DOMAIN_NAME = 'http://www.example.com/';
// links containing any of these keywords are skipped
$SKIP_URLLIST = array( 'addtocart' );
// cookie handling for the crawler requests
$COOKIELIST = array( 'IsUseCookie' => true,
'CookiePath' => '/tmp/sitemapcookie' );
// where the generated sitemap file is written
$SITEMAPPATH = './sitemap.xml';
// priority by URL keyword; the first keyword found in the URL wins
$PRIORITYLIST = array( 'product' => '0.8',
'device' => '0.6',
'intelligent' => '0.4',
'course' => '0.2' );
// changefreq by URL keyword; the first keyword found in the URL wins.
// The sitemap protocol only defines the lowercase values
// always, hourly, daily, weekly, monthly, yearly and never.
$CHANGEFREQLIST = array( 'product' => 'always',
'device' => 'hourly',
'intelligent' => 'daily',
'course' => 'weekly',
'login' => 'monthly',
'about' => 'yearly' );
?>
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用
。你还可以使用@
来通知其他用户。