PHP可以写网页爬虫吗？原理是什么？

爬虫原理都是一样的，一般用python、node比较方便，下面是观点网流出来的知乎爬虫，你可以看下:
#coding:utf-8
"""
@author:haoning
@create time:2015.8.5
"""
from __future__ import division  # 精确除法
from Queue import Queue
from __builtin__ import False
import json
import os
import re
import platform
import uuid
import urllib
import urllib2
import sys
import time
import MySQLdb as mdb
from bs4 import BeautifulSoup
 
# Python 2 only: reload the sys module and switch the interpreter-wide default
# string encoding to UTF-8, so implicit str <-> unicode conversions of the
# Chinese text handled below use UTF-8.
reload(sys)
sys.setdefaultencoding( "utf-8" )
 
# HTTP headers sent with the POST request issued by get_topis(); they make the
# request look like an XMLHttpRequest coming from the Zhihu topics page.
# NOTE(review): the Cookie value (including the _xsrf token) is a hard-coded
# browser session -- presumably expired by now; it should be supplied from
# configuration rather than committed in source.
headers = {
   'User-Agent' : 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0',
   'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
   'X-Requested-With':'XMLHttpRequest',
   'Referer':'https://www.zhihu.com/topics',
   'Cookie':'__utma=51854390.517069884.1416212035.1416212035.1416212035.1; q_c1=c02bf44d00d240798bfabcfc95baeb56|1455778173000|1416205243000; _za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a; aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C; _xsrf=9d494558f9271340ab24598d85b2a3c8; cap_id="MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=|1455864276|2a4ce8247ebd3c0df5393bb5661713ad9eec01dd"; n_c=1; _alicdn_sec=56c6ba4d556557d27a0f8c876f563d12a285f33a'
}
 
# MySQL connection settings.
# NOTE(review): credentials are hard-coded; consider loading them from
# configuration or the environment.
DB_HOST = '127.0.0.1'
DB_USER = 'root'
DB_PASS = 'root'
 
# Crawl frontier: functions below put and get (topic token, topic name,
# parent name) tuples on this queue.
queue= Queue() # receiving queue
# NOTE(review): nodeSet, keywordSet, stop, offset, level, maxLevel and base are
# not read by any function visible in this file (get_topis uses its own local
# offset) -- possibly leftovers; confirm before removing.
nodeSet=set()
keywordSet=set()
stop=0
offset=-20
level=0
maxLevel=7
# Number of rows getContent() has inserted into the rooms table.
counter=0
base=""
 
# One shared connection/cursor to the 'zhihu' database, opened at import time.
# Autocommit is disabled, so the functions below call conn.commit() themselves.
conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'zhihu', charset='utf8')
conn.autocommit(False)
curr = conn.cursor()
 
def get_html(url):
    """Fetch ``url`` and return the raw response body, or None on failure.

    The request uses a 3-second timeout and is best-effort: any error while
    opening or reading the URL is printed and turned into a None return so
    that callers can simply skip the page.

    Args:
        url: absolute URL to download.

    Returns:
        The undecoded response body, or None if the request failed.
    """
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req, None, 3)  # a proxy should be added here
        try:
            return response.read()
        finally:
            # Release the connection even when read() fails part-way.
            response.close()
    except Exception as e:
        # Exception rather than a bare except, so Ctrl-C and SystemExit can
        # still stop the crawler; the reason is reported instead of hidden.
        print("fetch error %s %s" % (url, e))
    return None
 
def getTopics():
    url = 'https://www.zhihu.com/topics'
    print url
    try:
        req = urllib2.Request(url)
        response = urllib2.urlopen(req) #鍦ㄨ繖閲屽簲璇ュ姞鍏ヤ唬鐞�
        html = response.read().decode('utf-8')
        print html
        soup = BeautifulSoup(html)
        lis = soup.find_all('li', {'class' : 'zm-topic-cat-item'})
         
        for li in lis:
            data_id=li.get('data-id')
            name=li.text
            curr.execute('select id from classify_new where name=%s',(name))
            y= curr.fetchone()
            if not y:
                curr.execute('INSERT INTO classify_new(data_id,name)VALUES(%s,%s)',(data_id,name))
        conn.commit()
    except Exception as e:
        print "get topic error",e
         
 
def get_extension(name):
    """Return the file extension of ``name`` including the leading dot.

    Only the last path component is inspected, so a dot that belongs to a
    host name or a directory (``http://pic.example.com/avatar``) is not
    mistaken for an extension.

    Args:
        name: a file name, path or URL.

    Returns:
        The text from the last '.' of the final path component to the end
        of the string, or None when that component contains no dot.
    """
    # Treat both slash styles as separators; the crawler builds paths with '/'
    # and is also run on Windows.
    last_component = name.replace('\\', '/').rsplit('/', 1)[-1]
    dot = last_component.rfind('.')
    if dot == -1:
        return None
    return last_component[dot:]
 
 
def which_platform():
    """Return the operating-system name reported by ``platform.system()``."""
    return platform.system()
 
def GetDateString():
    """Return today's local date formatted as ``YYYY-MM-DD``."""
    today = time.localtime()
    return time.strftime('%Y-%m-%d', today)
 
def makeDateFolder(par,classify):
    try:
        if os.path.isdir(par):
            newFolderName=par + '//' + GetDateString() + '//'  +str(classify)
            if which_platform()=="Linux":
                newFolderName=par + '/' + GetDateString() + "/" +str(classify)
            if not os.path.isdir( newFolderName ):
                os.makedirs( newFolderName )
            return newFolderName
        else:
            return None
    except Exception,e:
        print "kk",e
    return None
 
def download_img(url,classify):
    try:
        extention=get_extension(url)
        if(extention is None):
            return None
        req = urllib2.Request(url)
        resp = urllib2.urlopen(req,None,3)
        dataimg=resp.read()
        name=str(uuid.uuid1()).replace("-","")+"_www.guandn.com"+extention
        top="E://topic_pic"
        folder=makeDateFolder(top, classify)
        filename=None
        if folder is not None:
            filename  =folder+"//"+name
        try:
            if "e82bab09c_m" in str(url):
                return True
            if not os.path.exists(filename):
                file_object = open(filename,'w+b')
                file_object.write(dataimg)
                file_object.close()
                return '/room/default/'+GetDateString()+'/'+str(classify)+"/"+name
            else:
                print "file exist"
                return None
        except IOError,e1:
            print "e1=",e1
            pass
    except Exception as e:
        print "eee",e
        pass
    return None #如果没有下载下来就利用原来网站的链接
 
def getChildren(node,name):
    global queue,nodeSet
    try:
        url="https://www.zhihu.com/topic/"+str(node)+"/hot"
        html=get_html(url)
        if html is None:
            return
        soup = BeautifulSoup(html)
        p_ch='父话题'
        node_name=soup.find('div', {'id' : 'zh-topic-title'}).find('h1').text
        topic_cla=soup.find('div', {'class' : 'child-topic'})
        if topic_cla is not None:
            try:
                p_ch=str(topic_cla.text)
                aList = soup.find_all('a', {'class' : 'zm-item-tag'}) #获取所有子节点
                if u'子话题' in p_ch:
                    for a in aList:
                        token=a.get('data-token')
                        a=str(a).replace('\n','').replace('\t','').replace('\r','')
                        start=str(a).find('>')
                        end=str(a).rfind('</a>')
                        new_node=str(str(a)[start+1:end])
                        curr.execute('select id from rooms where name=%s',(new_node)) #先保证名字绝不相同
                        y= curr.fetchone()
                        if not y:
                            print "y=",y,"new_node=",new_node,"token=",token
                            queue.put((token,new_node,node_name))
            except Exception as e:
                print "add queue error",e
    except Exception as e:
        print "get html error",e
         
     
 
def getContent(n,name,p,top_id):
    try:
        global counter
        curr.execute('select id from rooms where name=%s',(name)) #先保证名字绝不相同
        y= curr.fetchone()
        print "exist?? ",y,"n=",n
        if not y:
            url="https://www.zhihu.com/topic/"+str(n)+"/hot"
            html=get_html(url)
            if html is None:
                return
            soup = BeautifulSoup(html)
            title=soup.find('div', {'id' : 'zh-topic-title'}).find('h1').text
            pic_path=soup.find('a',{'id':'zh-avartar-edit-form'}).find('img').get('src')
            description=soup.find('div',{'class':'zm-editable-content'})
            if description is not None:
                description=description.text
                 
            if (u"未归类" in title or u"根话题" in title): #允许入库，避免死循环
                description=None
                 
            tag_path=download_img(pic_path,top_id)
            print "tag_path=",tag_path
            if (tag_path is not None) or tag_path==True:
                if tag_path==True:
                    tag_path=None
                father_id=2 #默认为杂谈
                curr.execute('select id from rooms where name=%s',(p))
                results = curr.fetchall()
                for r in results:
                    father_id=r[0]
                name=title
                curr.execute('select id from rooms where name=%s',(name)) #先保证名字绝不相同
                y= curr.fetchone()
                print "store see..",y
                if not y:
                    friends_num=0
                    temp = time.time()
                    x = time.localtime(float(temp))
                    create_time = time.strftime("%Y-%m-%d %H:%M:%S",x) # get time now
                    create_time
                    creater_id=None
                    room_avatar=tag_path
                    is_pass=1
                    has_index=0
                    reason_id=None 
                    #print father_id,name,friends_num,create_time,creater_id,room_avatar,is_pass,has_index,reason_id
                    ######################有资格入库的内容
                    counter=counter+1
                    curr.execute("INSERT INTO rooms(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id))
                    conn.commit() #必须时时进入数据库，不然找不到父节点
                    if counter % 200==0:
                        print "current node",name,"num",counter
    except Exception as e:
        print "get content error",e      
 
def work():
    global queue
    curr.execute('select id,node,parent,name from classify where status=1')
    results = curr.fetchall()
    for r in results:
        top_id=r[0]
        node=r[1]
        parent=r[2]
        name=r[3]
        try:
            queue.put((node,name,parent)) #首先放入队列
            while queue.qsize() >0:
                n,p=queue.get() #顶节点出队
                getContent(n,p,top_id)
                getChildren(n,name) #出队内容的子节点
            conn.commit()
        except Exception as e:
            print "what's wrong",e 
             
def new_work():
    """Crawl every active category listed in ``classify_new_copy``.

    Each row with ``status=1`` is handed to get_topis().  A failure in one
    category is printed and does not stop the remaining categories.
    """
    curr.execute('select id,data_id,name from classify_new_copy where status=1')
    for top_id,data_id,name in curr.fetchall():
        try:
            get_topis(data_id,name,top_id)
        except Exception as e:
            # Exception rather than a bare except, so Ctrl-C still stops the
            # run and a failing category is visible in the output.
            print("get topics error %s %s" % (name, e))
 
 
def get_topis(data_id,name,top_id):
    global queue
    url = 'https://www.zhihu.com/node/TopicsPlazzaListV2'
    isGet = True;
    offset = -20;
    data_id=str(data_id)
    while isGet:
        offset = offset + 20
        values = {'method': 'next', 'params': '{"topic_id":'+data_id+',"offset":'+str(offset)+',"hash_id":""}'}
        try:
            msg=None
            try:
                data = urllib.urlencode(values)
                request = urllib2.Request(url,data,headers)
                response = urllib2.urlopen(request,None,5)
                html=response.read().decode('utf-8')
                json_str = json.loads(html)
                ms=json_str['msg']
                if len(ms) <5:
                    break
                msg=ms[0]
            except Exception as e:
                print "eeeee",e
            #print msg
            if msg is not None:
                soup = BeautifulSoup(str(msg))
                blks = soup.find_all('div', {'class' : 'blk'})
                for blk in blks:
                    page=blk.find('a').get('href')
                    if page is not None:
                        node=page.replace("/topic/","") #将更多的种子入库
                        parent=name
                        ne=blk.find('strong').text
                        try:
                            queue.put((node,ne,parent)) #首先放入队列
                            while queue.qsize() >0:
                                n,name,p=queue.get() #顶节点出队
                                size=queue.qsize()
                                if size > 0:
                                    print size
                                getContent(n,name,p,top_id)
                                getChildren(n,name) #出队内容的子节点
                            conn.commit()
                        except Exception as e:
                            print "what's wrong",e 
        except urllib2.URLError, e:
            print "error is",e
            pass
             
         
if __name__ == '__main__':
    # Run the complete crawl 400 times in a row.
    for attempt in range(400):
        new_work()
PHP可以写网页爬虫吗？原理是什么？

你尚未登录，登录后可以

求java/php大佬帮帮忙？

新人求看下如何用php对接第三方接口?

Hyperf Swagger 如何使用？

Laravel Framework 11.38.2 版本路由调用api接口报错404？

在Windows下开发的gin项目如何不是linux下的宝塔？

在宝塔里这种情况下怎么使用80端口？

如何在lnmp环境下安装pecl_redis扩展并解决编译错误？

PHP可以写网页爬虫吗？原理是什么？

你尚未登录，登录后可以

求java/php大佬帮帮忙？

新人求看下如何用php对接第三方接口?

Hyperf Swagger 如何使用？

Laravel Framework 11.38.2 版本路由调用api接口报错404？

在Windows下开发的gin项目如何不是linux下的宝塔？

在宝塔里这种情况下怎么使用80端口？

如何在lnmp环境下安装pecl_redis扩展并解决编译错误？

PHP可以写网页爬虫吗？原理是什么？

Laravel Framework 11.38.2 版本路由调用api接口报错404？