# Crawling Ajax asynchronous request data: a video-comment crawler
# Two development approaches: low-level (hand-rolled with urllib) vs. framework-based development
# scrapy framework
# The hard parts of crawling (obfuscated JS, cookies, tricky URL construction) are complex but not performance-critical; the actual fetching can be left to scrapy or urllib, which run efficiently.
# XPath expressions can also be used to extract information; they are more efficient than regular expressions but less flexible, and are generally only suitable for extracting data from XML/HTML documents.
# To crawl asynchronously loaded data, first capture the traffic: with a packet-capture tool such as Fiddler, scroll the page to trigger the Ajax requests, record the URLs that appear, and work out the URL pattern from them.
# 1. Video comment crawler (single-page comment crawler).
#
# Fetches one page of comments for a Tencent Video item from the Ajax
# comment endpoint and prints every comment found on that page.
import urllib.request
import re

# Pre-compiled pattern pulling the raw "content" field out of the JSON-ish
# response body.  NOTE(review): the non-greedy match stops at the first '"',
# so comments containing escaped quotes are truncated — same as the original.
_COMMENT_RE = re.compile(r'"content":"(.*?)"')


def extract_comments(data):
    """Return the list of raw comment strings found in the response text *data*."""
    return _COMMENT_RE.findall(data)


def fetch_page(url, headers):
    """Download *url* with the given header dict and return the UTF-8-decoded body.

    Uses a per-request ``Request`` object instead of the original
    ``build_opener``/``install_opener`` pair, which mutated global state.
    """
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request) as response:
        return response.read().decode("utf-8")


if __name__ == "__main__":
    vid = "1743283224"           # Video id
    cid = "6399981406690050721"  # Comment cursor id
    num = "20"                   # Comments per page
    # Construct the URL of the current comment page.
    url = ("https://video.coral.qq.com/varticle/" + vid
           + "/comment/v2?cursor=" + cid + "&orinum=" + num)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0",
        "Content-Type": "application/javascript",
    }
    # Crawl the current comment page.
    data = fetch_page(url, headers)
    print(data)
    for comment in extract_comments(data):
        print("The comments are as follows:" + comment)
        print("_______________")
# 2. Video comment crawler (automatically switches to the next comment page):
# follows the "last" cursor returned in each response to walk through up to
# 100 pages of comments.
import urllib.request
import re

# Patterns for the comment text and the pagination cursor in the response body.
_COMMENT_RE = re.compile(r'"content":"(.*?)"')
_CURSOR_RE = re.compile(r'"last":"(.*?)"')


def decode_comment(raw):
    """Decode a raw comment string that may contain ``\\uXXXX`` escape sequences.

    Replaces the original ``eval('u"' + raw + '"')``: evaluating untrusted
    page content is a code-injection hole, and it crashes on any comment
    containing a quote or a stray backslash.
    """
    # latin-1 + backslashreplace keeps literal non-ASCII characters intact
    # (they round-trip through unicode_escape) while still interpreting the
    # \uXXXX escapes embedded in the JSON text.
    return raw.encode("latin-1", "backslashreplace").decode("unicode_escape")


if __name__ == "__main__":
    vid = "1743283224"  # Video id
    cid = "0"           # Cursor: "0" requests the first page
    num = "15"          # Comments per page
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0",
        "Content-Type": "application/javascript",
    }
    for page in range(0, 100):
        # Crawl the current page.
        print("The first" + str(page) + "page")
        thisurl = ("https://video.coral.qq.com/varticle/" + vid
                   + "/comment/v2?cursor=" + cid + "&orinum=" + num)
        request = urllib.request.Request(thisurl, headers=headers)
        with urllib.request.urlopen(request) as response:
            data = response.read().decode("utf-8", "ignore")
        for raw in _COMMENT_RE.findall(data):
            try:
                print("The comments are as follows:" + decode_comment(raw))
                print("________")
            except Exception as err:
                print(err)
        cursors = _CURSOR_RE.findall(data)
        if not cursors:
            # No "last" cursor in the response: stop paging instead of
            # raising IndexError as the original [0] indexing did.
            break
        cid = cursors[0]