Automatically crawling Ajax asynchronous request data: a video comment crawler

# Two routes for writing crawlers: low-level development (hand-written requests) and framework development
# scrapy framework
# The JavaScript-heavy, hard-to-crawl parts are complex and inefficient to handle directly; the follow-up cookies and page URLs are handed to scrapy or urllib, because those run efficiently.
# XPath expressions can also be used to extract information; they are more efficient than regular expressions but less powerful, and are generally only suited to extracting data from XML-formatted documents.
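# For illustration only, a minimal XPath sketch with lxml (the HTML snippet and class names below are invented for the example; the scripts in this post stick to regular expressions):
from lxml import etree

sample_html = '<div class="comment"><span class="content">nice video</span></div>'
tree = etree.HTML(sample_html)
# //span[@class="content"]/text() selects the text of every matching <span>
print(tree.xpath('//span[@class="content"]/text()'))  # ['nice video']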
# To crawl asynchronously loaded data, the first problem to solve is packet capture: with Fiddler, scroll the page so that the remaining comments load, and look for the pattern in the URLs of those requests.
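# The endpoint that Fiddler reveals while scrolling can be sketched like this (the parameter names are taken from the scripts below; nothing else about the API is assumed):
from urllib.parse import urlencode

def comment_url(vid, cursor, num):
    # vid: video id, cursor: comment id to continue from, orinum: comments per page
    return "https://video.coral.qq.com/varticle/"+vid+"/comment/v2?"+urlencode({"cursor": cursor, "orinum": num})

print(comment_url("1743283224", "6399981406690050721", "20"))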

#1. Video comment crawler (crawls a single page of comments)
import urllib.request
import re
vid="1743283224"#Video id
cid="6399981406690050721"#Comment id
num="20"
#Construct the URL of the current comment page
url="https://video.coral.qq.com/varticle/"+vid+"/comment/v2?cursor="+cid+"&orinum="+num
headers={
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0",
    "Content-Type":"application/javascript",
}
#Build and install a global opener that carries the custom headers
opener=urllib.request.build_opener()
headall=[]
for key,value in headers.items():
    item=(key,value)
    headall.append(item)
opener.addheaders=headall
urllib.request.install_opener(opener)
#Crawl the current comment page
data=urllib.request.urlopen(url).read().decode("utf-8")
print(data)
#Extract every "content" field from the response with a regex
commentpat='"content":"(.*?)"'
commentall=re.compile(commentpat).findall(data)
for i in range(0,len(commentall)):
    try:
        print("The comments are as follows:"+commentall[i])
        print("_______________")
    except Exception as err:
        print(err)
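
# As a more robust (but hedged) alternative to the regex above, the response can be parsed
# with the json module, assuming the body is plain JSON; the exact nesting of the payload is
# not documented here, so this sketch simply walks the whole structure and collects every
# "content" field, which is what the regex matches.
import json

def collect_contents(node, out):
    # Recursively gather every string stored under a "content" key
    if isinstance(node, dict):
        for key, value in node.items():
            if key == "content" and isinstance(value, str):
                out.append(value)
            else:
                collect_contents(value, out)
    elif isinstance(node, list):
        for item in node:
            collect_contents(item, out)

comments = []
collect_contents(json.loads(data), comments)
for c in comments:
    print("Comment: " + c)
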
#2. Video comment crawler (automatically switches to the next page of comments): get the comments
import urllib.request
import re
vid="1743283224"
cid="0"
num="15"
headers={
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0",
    "Content-Type":"application/javascript",
}
opener=urllib.request.build_opener()
headall=[]
for key,value in headers.items():
    item=(key,value)
    headall.append(item)
opener.addheaders=headall
urllib.request.install_opener(opener)
for j in range(0,100):
    #Crawl the current page
    print("The first"+str(j)+"page")
    thisurl="https://video.coral.qq.com/varticle/"+vid+"/comment/v2?cursor="+cid+"&orinum="+num
    data=urllib.request.urlopen(thisurl).read().decode("utf-8","ignore")
    commentpat='"content":"(.*?)"'
    commentall=re.compile(commentpat).findall(data)
    # The "last" field of the response is the cursor for the next page
    lastpat='"last":"(.*?)"'
    cid=re.compile(lastpat).findall(data)[0]
    for i in range(0,len(commentall)):
        try:
            print("The comments are as follows:"+eval('u"'+commentall[i]+'"'))
            print("________")
        except Exception as err:
            print(err)
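
# The eval() trick above works but is fragile; as a sketch, the \uXXXX escape sequences
# in the captured comment text can also be decoded with the built-in 'unicode_escape' codec:
def unescape_comment(raw):
    # Round-trip through latin-1 so the escapes become bytes, then decode them
    return raw.encode("latin-1", "backslashreplace").decode("unicode_escape")

print(unescape_comment("\\u597d\\u770b"))  # prints the two decoded Chinese characters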
