A simple Python crawler for Zhihu

  • It mainly crawls the pictures found in all the answers of a favorites collection
  • Saving the answer text is not implemented yet. You can add it yourself; it is simpler than handling the pictures
  • The code itself contains detailed comments; please read them for the specifics

Project source code:

  1 # -*- coding:utf-8 -*-
  2 
  3 from spider import SpiderHTML
  4 from multiprocessing import Pool
  5 import sys,urllib,http,os,random,re,time
  6 __author__ = 'waiting'
  7 '''
  8 Third party class library used BeautifulSoup4,Please install by yourself
  9 You need the spider.py file
 10 Operating environment: python3.4,windows7
 11 '''
 12 
 13 #Favorite address
 14 url = 'https://www.zhihu.com/collection/30822111'  #page Change parameter to code add
 15 
 16 #Local storage path,It will be created automatically if it does not exist
 17 store_path = 'E:\\zhihu\Favorites\\A world known only to members'
 18 
 19 class zhihuCollectionSpider(SpiderHTML):
 20   def __init__(self,pageStart, pageEnd, url):
 21     self._url = url
 22     self._pageStart = int(pageStart)
 23     self._pageEnd = int(pageEnd)+1
 24     self.downLimit = 0            #Answers below this approval are not included
 25 
 26   def start(self):
 27     for page in range(self._pageStart,self._pageEnd):    #Favorite pages
 28       url = self._url + '?page='+str(page)
 29       content = self.getUrl(url)
 30       questionList = content.find_all('div',class_='zm-item')
 31       for question in questionList:            #Every favorite question
 32         Qtitle = question.find('h2',class_='zm-item-title')
 33         if Qtitle is None:                #Be harmonized
 34           continue
 35 
 36         questionStr = Qtitle.a.string
 37         Qurl = 'https://www.zhihu.com'+Qtitle.a['href']  #Problem title
 38         Qtitle = re.sub(r'[\\/:*?"<>]','#',Qtitle.a.string)      #windows file/Special symbols not supported for directory names
 39         try:
 40           print('-----Getting questions:'+Qtitle+'-----')    #Get the link and title of the problem, and go to grab
 41         except UnicodeEncodeError:
 42           print(r'---The problem contains special characters and cannot be displayed---')
 43         try:
 44           Qcontent = self.getUrl(Qurl)
 45         except:
 46           print('!!!!Get error!!!!!')
 47           pass
 48         answerList = Qcontent.find_all('div',class_='zm-item-answer  zm-item-expanded')
 49         self._processAnswer(answerList,Qtitle)            #Answers to questions
 50         time.sleep(5)
 51 
 52 
 53   def _processAnswer(self,answerList,Qtitle):
 54     j = 0      
 55     for answer in answerList:
 56       j = j + 1
 57       
 58       upvoted = int(answer.find('span',class_='count').string.replace('K','000'))   #Get the number of approval for this answer
 59       if upvoted < self.downLimit:
 60         continue
 61       authorInfo = answer.find('div',class_='zm-item-answer-author-info')        #Get author information
 62       author = {'introduction':'','link':''}
 63       try:
 64         author['name'] = authorInfo.find('a',class_='author-link').string       #Get the author's name
 65         author['introduction'] = str(authorInfo.find('span',class_='bio')['title']) #Get the author's profile
 66         author['link'] = authorInfo.find('a',class_='author-link')['href']      
 67       except AttributeError:
 68         author['name'] = 'Anonymous user'+str(j)
 69       except TypeError:                                  #If the profile is empty
 70         pass                                     #Anonymous user has no link
 71 
 72       file_name = os.path.join(store_path,Qtitle,'info',author['name']+'_info.txt')
 73       if os.path.exists(file_name):              #It's been grabbed
 74         continue
 75   
 76       self.saveText(file_name,'{introduction}\r\n{link}'.format(**author))      #Save author's information
 77       print('Getting users`{name}`The answer'.format(**author))
 78       answerContent = answer.find('div',class_='zm-editable-content clearfix')
 79       if answerContent is None:                #Reported users have no answer content
 80         continue
 81   
 82       imgs = answerContent.find_all('img')
 83       if len(imgs) == 0:                    #The answer is not in the picture above
 84         pass
 85       else:
 86         self._getImgFromAnswer(imgs,Qtitle,**author)
 87 
 88   #Picture included
 89   def _getImgFromAnswer(self,imgs,Qtitle,**author):
 90     i = 0
 91     for img in imgs:
 92       if 'inline-image' in img['class']:          #Don't grab the small picture of Zhihu
 93         continue
 94       i = i + 1
 95       imgUrl = img['src']
 96       extension = os.path.splitext(imgUrl)[1]
 97       path_name = os.path.join(store_path,Qtitle,author['name']+'_'+str(i)+extension)
 98       try:
 99         self.saveImg(imgUrl,path_name)          #Capture all kinds of picture exceptions without interrupting the process
100       except:                  
101         pass
102         
103   #Collected characters
104   def _getTextFromAnswer(self):
105     pass
106 
#Run from the command line, for example: zhihu.py 1 5   Get 1 to 5 pages of data
if __name__ == '__main__':
  #argv[1] is the first page and argv[2] the last page (inclusive).
  #With a single argument only that page is crawled; with none, page 1.
  args = sys.argv[1:]
  page = args[0] if args else 1
  pageEnd = args[1] if len(args) >= 2 else page

  spider = zhihuCollectionSpider(page,pageEnd,url)
  spider.start()

Many beginners are unclear about what Python is, what C and Python can each be used for, which learning path to follow, and which direction to pursue afterwards. To learn more, see the Youdao Cloud Notes link: http://note.youdao.com/noteshare?id=e4fa02e7b56d7909a27674cdb3da08aa

Keywords: Python Windows C

Added by JimmyD on Wed, 25 Dec 2019 16:26:26 +0200