刚学 Node,写了个爬小说的爬虫,爬取一定数量的小说以后就会内存溢出,请帮忙看看是哪里的问题。
爬取的顺序:
book主页url列表组成bookList ->
爬取首页信息和目录列表,插入对应的book对象中 ->
爬取book目录页的章节列表信息和章节的url插入对应的book对象中 ->
遍历bookList & 遍历book的章节列表url爬取章节内容插入对应的章节中,当一本书的内容全部插入完成后存成txt文件;这步用async.allLimit()做了限制,爬取完一本小说存为txt文件后,开始爬取下一本
好像我所有爬到的html内容都一直存在,没有被释放
const fs = require('fs');
const cheerio = require('cheerio');
const request = require('request');
const async = require('async')
const rp = require('request-promise')
// CSS selector for the book links on a list page (used by getBookUrlList).
const bookUrlSelector = '.two_main .main_con li .chap .fs14'
// CSS selectors for the fields read from a book's home page (used by getBookBaseInfo).
const bookImgSelector = '.main .book_cover img'
// The same anchor supplies the book name (text) and the chapter-index URL (href).
const bookName_MenuUrlSelector = '.main .status h1 a'
const bookAuthorSelector = '.main .status .booksub a[href*=userInfo]'
const bookTypeSelector = '.main .status .booksub a[href*=store]'
const bookDescSelector = '.main .status .info_con p'
const bookKeywordsSelector = '.main .status .keyword a'
// CSS selector for the chapter links on the chapter-index page (used by getBookChaps).
const bookChapSelector = '.chapterBean a'
// CSS selectors for the paragraphs and the heading of a chapter page (used by getBookChapContent).
const bookChapContentSelector = '#readerFs p'
const bookContentPartSelector = '.reader_con h3'
// Limits passed to async.allLimit by each stage:
// menuSpeed -> limitBookBaseInfo, chapsSpeed -> limitBookChaps,
// bookSpeed -> limitBookContent, contentSpeed -> limitChapsContent.
let menuSpeed = 10
let chapsSpeed = 10
let bookSpeed = 1
let contentSpeed = 10
// Shared book records; addBookMainUrl pushes `{ mainUrl }` entries and later stages add fields to them.
let bookList = []
// NOTE(review): not referenced anywhere in the visible code.
let doneBookList = []
// First and last list-page numbers (inclusive) used by bookMainUrlList.
let pageStartCount = 1
let pageEndCount = 1
// Progress counters polled by the timer-driven steps further down:
// bookIndex  - incremented once per processed list page (addBookMainUrl)
// menuIndex  - incremented once per book whose base info was stored (addBookBaseInfo)
// chapsIndex - incremented once per book whose chapter list was stored (getBookChaps)
// doneChap   - incremented per parsed chapter (getBookChapContent), reset per book (limitChapsContent)
let bookIndex = 0
let menuIndex = 0
let chapsIndex = 0
let doneChap = 0
/**
 * Send a GET request for `url` and hand the response body to `cb`.
 *
 * Only a failed request is retried, and at most `maxRetries` more times. An
 * exception thrown by `cb` (for example when the page markup does not match
 * the selectors) is reported through `callback` instead of downloading the
 * same page again, which previously repeated without end.
 *
 * @param {string} url - Page to download.
 * @param {Function} cb - Called as `cb(html, item)` with the response body.
 * @param {*} [item=null] - Value forwarded unchanged to `cb`.
 * @param {?Function} [callback=null] - Node-style completion callback:
 *   `(null, 'success!!')` once `cb` has returned, `(err)` when `cb` threw or
 *   the request still fails after the last retry.
 * @param {string} [referer] - Value sent in the `Referer` header.
 * @param {number} [maxRetries=5] - Remaining number of times a failed request is re-sent.
 */
function getHtml(url, cb, item = null, callback = null, referer = 'http://book.zongheng.com', maxRetries = 5) {
  const RETRY_DELAY_MS = 200;
  const config = {
    headers: {
      'Referer': referer,
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    method: 'GET',
    url: url,
  };
  const finish = (err, result) => {
    if (callback) {
      callback(err, result);
    }
  };
  rp(config)
    .then(html => {
      try {
        cb(html, item);
      } catch (err) {
        // A parsing failure will not go away by fetching the page again.
        console.log('err-----------', url, err);
        finish(err);
        return;
      }
      finish(null, 'success!!');
    }, err => {
      console.log('err-----------', url, err);
      if (maxRetries > 0) {
        setTimeout(() => {
          getHtml(url, cb, item, callback, referer, maxRetries - 1);
        }, RETRY_DELAY_MS);
        return;
      }
      finish(err);
    })
    .catch(err => {
      // Reached only when `callback` itself throws; log it so the rejection is not left unhandled.
      console.log('err-----------', url, err);
    });
}
/**
 * Build the URLs of the list pages to crawl, one per page number from
 * `pageStartCount` to `pageEndCount` (both inclusive).
 * @return {Array}: the list-page URLs in ascending page order
 */
function bookMainUrlList() {
  const pageCount = Math.max(pageEndCount - pageStartCount + 1, 0);
  return Array.from({ length: pageCount }, (_, offset) => {
    const page = pageStartCount + offset;
    return `http://book.zongheng.com/quanben/c0/c0/b9/u1/p${page}/v0/s1/t0/ALL.html`;
  });
}
/**
 * Request every list page; each downloaded page is handed to addBookMainUrl.
 */
function initBookList() {
  for (const listPageUrl of bookMainUrlList()) {
    getHtml(listPageUrl, addBookMainUrl);
  }
}
/**
 * Extract the book home-page URLs from a list page.
 * Links without an `href` are skipped instead of raising a TypeError, and
 * links whose URL contains 'baidu' are dropped.
 * @param html: HTML of a list page
 * @return {Array}: book home-page URLs in document order
 */
function getBookUrlList(html) {
  const $ = cheerio.load(html);
  return Array.from($(bookUrlSelector))
    .map(link => link.attribs && link.attribs.href)
    .filter(href => typeof href === 'string' && !href.includes('baidu'));
}
/**
 * Append one `{ mainUrl }` record to bookList for every book URL found on a
 * list page, then count the page as processed by incrementing bookIndex.
 * @param html: HTML of a list page
 */
function addBookMainUrl(html) {
  for (const mainUrl of getBookUrlList(html)) {
    bookList.push({ mainUrl });
  }
  bookIndex += 1;
}
/**
 * Download the home page of every book in bookList, passing `menuSpeed` as
 * the limit to async.allLimit, and let getBookBaseInfo parse each page.
 */
function limitBookBaseInfo() {
  console.log(bookList.length);
  const fetchBaseInfo = (book, callback) => {
    getHtml(book.mainUrl, getBookBaseInfo, book, callback);
  };
  const onAllDone = (error, result) => {
    if (error) {
      console.log('出错啦!!!!!!!!!', error);
    }
    if (result) {
      console.log('添加所有书的基本信息------------done~~~~~~~~^_^');
    }
  };
  async.allLimit(bookList, menuSpeed, fetchBaseInfo, onAllDone);
}
/**
 * Parse a book's home page and store its base information on `book` via
 * addBookBaseInfo.
 *
 * The description is read from the description element; it used to be taken
 * from the type element by mistake.
 *
 * @param html: HTML of the book's home page
 * @param book: the bookList record to fill in
 * @throws {TypeError} when one of the expected elements is missing from the page
 */
function getBookBaseInfo(html, book) {
  const $ = cheerio.load(html);
  // Text of an element's first child node.
  const firstText = node => node.children[0].data;
  // "http://host/path/12345.html" -> "12345"
  const idFromHref = href => href.split('/').pop().split('.')[0];

  const nameLink = $(bookName_MenuUrlSelector)[0];
  const authorLink = $(bookAuthorSelector)[0];
  const descNode = $(bookDescSelector)[0];
  const info = {
    sourceID: idFromHref(nameLink.attribs.href),
    image: $(bookImgSelector)[0].attribs.src,
    name: firstText(nameLink),
    author: {
      authorID: idFromHref(authorLink.attribs.href),
      authorName: firstText(authorLink)
    },
    type: firstText($(bookTypeSelector)[0]),
    // An empty description element yields an empty string.
    desc: descNode.children[0] ? firstText(descNode) : '',
    keywords: Array.from($(bookKeywordsSelector)).map(keyword => firstText(keyword)),
    menuUrl: nameLink.attribs.href
  };
  addBookBaseInfo(info, book);
}
/**
 * Copy the base-information fields from `info` onto `book`, then increment
 * menuIndex and log the running total.
 * @param info: object carrying sourceID, image, name, author, type, desc, keywords and menuUrl
 * @param book: the bookList record that receives the fields
 */
function addBookBaseInfo(info, book) {
  const { sourceID, image, name, author, type, desc, keywords, menuUrl } = info;
  Object.assign(book, { sourceID, image, name, author, type, desc, keywords, menuUrl });
  menuIndex += 1;
  console.log(`${menuIndex}本书的基本信息添加完成`);
}
/**
 * Download the chapter-index page of every book in bookList, passing
 * `chapsSpeed` as the limit to async.allLimit, and let getBookChaps parse
 * each page.
 */
function limitBookChaps() {
  console.log('添加book的chaps', bookList.length);
  const fetchChapterList = (book, callback) => {
    getHtml(book.menuUrl, getBookChaps, book, callback);
  };
  const onAllDone = (error, result) => {
    if (error) {
      console.log('出错啦!!!!!!!!!', error);
    }
    if (result) {
      console.log('添加所有书的chaps------------done~~~~~~~~^_^');
    }
  };
  async.allLimit(bookList, chapsSpeed, fetchChapterList, onAllDone);
}
/**
 * Parse a chapter-index page and store the chapter list on `book.chaps` as
 * `{ chapID, chapTitle, chapUrl }` records, where chapID is the position of
 * the link on the page. Increments chapsIndex once per book.
 * @param html: HTML of the chapter-index page
 * @param book: the bookList record that receives `chaps`
 */
function getBookChaps(html, book) {
  const $ = cheerio.load(html);
  book.chaps = Array.from($(bookChapSelector), (link, position) => ({
    chapID: position,
    chapTitle: link.children[0].data,
    chapUrl: link.attribs.href
  }));
  console.log(`${book.sourceID} ${book.name} 的 ${book.chaps.length} 章添加完成`);
  chapsIndex += 1;
}
/**
 * Drop every book without chapters from bookList and log each removal.
 * A book whose `chaps` was never set (for example because its chapter-index
 * page could not be parsed) is treated as empty instead of raising a TypeError.
 */
function removeEmptyBook() {
  bookList = bookList.filter(book => {
    const hasChapters = Array.isArray(book.chaps) && book.chaps.length > 0;
    if (!hasChapters) {
      console.log(book.name, book.menuUrl, book.chaps);
      console.log(`${book.sourceID} ${book.name} ${book.menuUrl} chaps为${book.chaps}没有内容----删除`);
    }
    return hasChapters;
  });
}
/**
 * Fetch the chapter contents of every book in bookList through
 * limitChapsContent, passing `bookSpeed` as the limit to async.allLimit.
 */
function limitBookContent() {
  console.log('添加chaps的content', bookList.length);
  const fetchBookContent = (book, callback) => {
    limitChapsContent(book, callback);
  };
  const onAllDone = (error, result) => {
    if (error) {
      console.log('出错啦!!!!!!!!!', error);
    }
    if (result) {
      console.log('添加所有书的chaps------------done~~~~~~~~^_^');
    }
  };
  async.allLimit(bookList, bookSpeed, fetchBookContent, onAllDone);
}
/**
 * Download every chapter of `book`, passing `contentSpeed` as the limit to
 * async.allLimit, then write the book to a txt file.
 *
 * `callback` is invoked exactly once in every outcome. Previously it was
 * skipped when the chapter run ended with an error or a falsy result, which
 * left the caller's book queue waiting forever.
 *
 * @param book: bookList record with `chaps`, `name` and `menuUrl`
 * @param {Function} callback - Node-style callback: `(null, 'success')` when
 *   all chapters were fetched, `(err)` otherwise
 */
function limitChapsContent(book, callback) {
  async.allLimit(book.chaps, contentSpeed, (chap, cb) => {
    getHtml(chap.chapUrl, getBookChapContent, chap, cb, book.menuUrl);
  }, (error, result) => {
    if (error || !result) {
      console.log('出错啦!!!!!!!!!', error);
      callback(error || new Error(`book: ${book.name} -- not every chapter could be fetched`));
      return;
    }
    console.log(`book: ${book.name}--已完成,准备创建txt文件`);
    doneChap = 0;
    callback(null, 'success');
    writeToTxt(book);
  });
}
/**
 * Parse a chapter page and store its heading (`chap.part`) and its
 * paragraphs (`chap.content`, one string per paragraph) on `chap`.
 *
 * A missing heading or an empty paragraph yields an empty string instead of
 * raising a TypeError.
 *
 * @param html: HTML of the chapter page
 * @param chap: the chapter record that receives `part` and `content`
 */
function getBookChapContent(html, chap) {
  doneChap++;
  const $ = cheerio.load(html);
  // Text of the element's first child node, or '' when there is none.
  const firstText = node => {
    const first = node && node.children && node.children[0];
    return first && typeof first.data === 'string' ? first.data : '';
  };
  chap.part = firstText($(bookContentPartSelector)[0]);
  chap.content = Array.from($(bookChapContentSelector)).map(paragraph => {
    return ' ' + firstText(paragraph);
  });
  console.log(doneChap);
}
// Kick off the crawl: request every list page. The timer-driven steps below poll
// the shared counters to decide when each later stage may start.
initBookList()
// Poll every 500 ms until the number of processed list pages (bookIndex)
// equals the number of list pages, then report how many book URLs were found.
let step_1 = function () {
  console.log('wait.................................');
  clearTimeout(timer1);
  const listPagesPending = bookMainUrlList().length !== bookIndex;
  if (listPagesPending) {
    timer1 = setTimeout(step_1, 500);
    return;
  }
  clearTimeout(timer1);
  console.log(`${bookList.length}本书的mainUrl爬取完毕`);
};
let timer1 = setTimeout(step_1, 500);
// Poll every 5 s until the base information of every book has been stored
// (menuIndex equals bookList.length), logging progress on each tick.
// The timer handle is not kept: it was never read, and assigning it to the
// undeclared `timer2_2` created an implicit global.
let step_2 = function () {
  console.log('waiting..baseinfo.................................' + bookMainUrlList().length, bookIndex, menuIndex, bookList.length);
  if (menuIndex === bookList.length) {
    console.log(`${bookList.length}本书添加基本信息------------done~~~~~~~~^_^`);
  } else {
    setTimeout(step_2, 5000);
  }
};
// Poll every 2 s until all list pages have been processed, then start the
// base-information stage and its progress reporter (step_2).
let initBookListDone = function () {
  clearTimeout(timer2_1);
  const listPagesPending = bookMainUrlList().length !== bookIndex;
  if (listPagesPending) {
    timer2_1 = setTimeout(initBookListDone, 2000);
    return;
  }
  limitBookBaseInfo();
  setTimeout(step_2, 5000);
};
let timer2_1 = setTimeout(initBookListDone, 2000);
// Poll every 2 s until the chapter list of every book has been stored
// (chapsIndex equals bookList.length), then remove the books without chapters.
// The timer handle is not kept: it was never read, and assigning it to the
// undeclared `timer3_2` created an implicit global.
let step_3 = function () {
  console.log('waiting..bookchaps.................................' + bookList.length, chapsIndex);
  if (chapsIndex !== bookList.length) {
    setTimeout(step_3, 2000);
    return;
  }
  console.log(bookList.length);
  console.log(`${bookList.length} 本书添加章节列表(id,title,url)------------done~~~~~~~~^_^`);
  console.log('删除没有内容的book');
  removeEmptyBook();
  console.log(`删除内容为空的book后,共有书 ${bookList.length} 本`);
};
// Poll every 30 s until the base information of every book has been stored,
// then start the chapter-list stage and its progress reporter (step_3).
let addBookBaseInfoDone = function () {
  clearTimeout(timer3_1);
  const baseInfoPending = menuIndex !== bookList.length;
  if (baseInfoPending) {
    timer3_1 = setTimeout(addBookBaseInfoDone, 30000);
    return;
  }
  console.log('==================================开始添加章节列表');
  limitBookChaps();
  setTimeout(step_3, 2000);
};
let timer3_1 = setTimeout(addBookBaseInfoDone, 30000);
// First check after 60 s, then every 10 s: once the chapter list of every book
// has been stored, start downloading the chapter contents.
//
// The test is `>=` rather than `===` because step_3 calls removeEmptyBook(),
// which shrinks bookList after chapsIndex has already reached its final value;
// with `===` the content stage would then never start. `chapsIndex > 0` keeps
// the stage from starting on the trivial 0 === 0 match before any chapter list
// has been processed.
let addBookChapsDone = function () {
  console.log(`wait........... ${chapsIndex} ${bookList.length}`);
  const chapterListsDone = chapsIndex > 0 && chapsIndex >= bookList.length;
  if (chapterListsDone) {
    limitBookContent();
  } else {
    timer4_1 = setTimeout(addBookChapsDone, 10000);
  }
};
let timer4_1 = setTimeout(addBookChapsDone, 60000);
/**
 * Write a book to `./zhbooks/<book.name>.txt`: each chapter title on its own
 * line, followed by the chapter's paragraphs and a blank line.
 *
 * After a successful write the chapter text is released (`chap.content` is
 * set to null) so that finished books no longer keep their full text in
 * memory for the rest of the crawl. On a failed write the text is kept.
 *
 * @param book: bookList record with `name` and `chaps`
 */
function writeToTxt(book) {
  const chapters = book.chaps || [];
  const pieces = [];
  for (const chap of chapters) {
    pieces.push(chap.chapTitle + '\n');
    if (chap.content) {
      pieces.push(chap.content.join('\n') + '\n' + '\n');
    }
  }
  console.log("准备写入文件");
  fs.writeFile(`./zhbooks/${book.name}.txt`, pieces.join(''), function (err) {
    if (err) {
      return console.error(err);
    }
    // The text is on disk now; drop the in-memory copy.
    for (const chap of chapters) {
      chap.content = null;
    }
    console.log("数据写入成功!");
  });
}
我试了一下,node 进程的内存涨到 900 多我就杀掉了,一直在涨。建议读一点写一点,不要都放在内存中。