- The script crawls the pictures contained in all the answers saved in a Zhihu collection (favorites list).
- The text of the answers is not captured yet. You can implement that yourself; it is simpler than handling the pictures.
- The code itself contains detailed comments; please read through them.
Project source code:
# -*- coding:utf-8 -*-
"""Crawl the pictures found in every answer of a Zhihu collection (favorites list).

Third-party library used: BeautifulSoup4 (please install it yourself).
The companion ``spider.py`` file is also required.
Original operating environment: python3.4, windows7.
"""

from spider import SpiderHTML
from multiprocessing import Pool
import sys
import urllib
import http
import os
import random
import re
import time

__author__ = 'waiting'

# Collection (favorites) address; the crawler appends "?page=N" to it.
url = 'https://www.zhihu.com/collection/30822111'

# Local storage path. Per the original author's note it is created automatically
# if it does not exist (presumably by the save helpers in spider.py -- confirm).
# Written as a raw string: the original mixed "\\" with a bare "\F", which is an
# invalid escape sequence; the runtime value is unchanged.
store_path = r'E:\zhihu\Favorites\A world known only to members'

# Characters that Windows does not allow in file or directory names.
_INVALID_NAME_CHARS = re.compile(r'[\\/:*?"<>|]')


class zhihuCollectionSpider(SpiderHTML):
    """Crawler that saves the images of every answer in a Zhihu collection.

    Page fetching and file saving go through ``getUrl``, ``saveText`` and
    ``saveImg``, which are not defined in this class (presumably inherited
    from ``SpiderHTML`` -- verify against spider.py).
    """

    def __init__(self, pageStart, pageEnd, url):
        """Store the collection URL and the inclusive page range to crawl.

        Args:
            pageStart: first collection page (int or numeric string).
            pageEnd: last collection page, inclusive (int or numeric string).
            url: collection URL without the ``?page=`` parameter.

        Raises:
            ValueError: if a page bound cannot be converted with ``int()``.
        """
        # NOTE(review): SpiderHTML.__init__ is not called, as in the original;
        # confirm the base class needs no initialisation.
        self._url = url
        self._pageStart = int(pageStart)
        self._pageEnd = int(pageEnd) + 1  # exclusive upper bound for range()
        self.downLimit = 0  # answers with fewer upvotes than this are skipped

    def start(self):
        """Walk every collection page and process each listed question."""
        for page in range(self._pageStart, self._pageEnd):
            pageUrl = self._url + '?page=' + str(page)
            content = self.getUrl(pageUrl)
            for question in content.find_all('div', class_='zm-item'):
                titleTag = question.find('h2', class_='zm-item-title')
                if titleTag is None or titleTag.a is None:
                    # No title block: the entry has been removed/censored.
                    continue

                Qurl = 'https://www.zhihu.com' + titleTag.a['href']
                # The title doubles as a directory name, so strip the
                # characters Windows rejects.
                Qtitle = _INVALID_NAME_CHARS.sub('#', titleTag.a.string)
                try:
                    print('-----Getting questions:' + Qtitle + '-----')
                except UnicodeEncodeError:
                    # The console encoding cannot represent the title.
                    print(r'---The problem contains special characters and cannot be displayed---')

                try:
                    Qcontent = self.getUrl(Qurl)
                except Exception:
                    # Skip this question. The original fell through and used
                    # an unbound (or previous question's) Qcontent.
                    print('!!!!Get error!!!!!')
                    continue

                answerList = Qcontent.find_all(
                    'div', class_='zm-item-answer zm-item-expanded')
                self._processAnswer(answerList, Qtitle)
                time.sleep(5)  # pause before requesting the next question page

    @staticmethod
    def _parseVoteCount(text):
        """Convert a vote label such as '42', '3K' or '1.2K' to an int.

        Returns 0 when the label is missing or not numeric, so that one odd
        answer cannot abort the whole crawl.
        """
        if not text:
            return 0
        label = str(text).strip()
        try:
            if label[-1:] in ('K', 'k'):
                return int(round(float(label[:-1]) * 1000))
            return int(label)
        except ValueError:
            return 0

    @staticmethod
    def _extractAuthor(answer, index):
        """Return a dict with the 'name', 'introduction' and 'link' of an answer's author.

        Anonymous authors (no author link in the markup) are named
        'Anonymous user<index>' and keep an empty introduction and link.
        """
        author = {
            'name': 'Anonymous user' + str(index),
            'introduction': '',
            'link': '',
        }
        authorInfo = answer.find('div', class_='zm-item-answer-author-info')
        if authorInfo is None:
            return author

        linkTag = authorInfo.find('a', class_='author-link')
        if linkTag is None or not linkTag.string:
            return author

        author['name'] = str(linkTag.string)
        # Read the link independently of the bio: the original lost the link
        # whenever the bio was missing.
        author['link'] = linkTag.get('href', '')
        bioTag = authorInfo.find('span', class_='bio')
        if bioTag is not None:
            author['introduction'] = str(bioTag.get('title', ''))
        return author

    def _processAnswer(self, answerList, Qtitle):
        """Save author info and images for each answer of one question.

        Args:
            answerList: iterable of answer tags taken from the question page.
            Qtitle: sanitised question title, used as the directory name.
        """
        for index, answer in enumerate(answerList, 1):
            countTag = answer.find('span', class_='count')
            upvoted = self._parseVoteCount(
                countTag.string if countTag is not None else None)
            if upvoted < self.downLimit:
                continue

            author = self._extractAuthor(answer, index)
            safeName = _INVALID_NAME_CHARS.sub('#', author['name'])
            file_name = os.path.join(
                store_path, Qtitle, 'info', safeName + '_info.txt')
            if os.path.exists(file_name):
                # The info file doubles as an "already grabbed" marker.
                continue

            self.saveText(file_name, '{introduction}\r\n{link}'.format(**author))
            print('Getting users`{name}`The answer'.format(**author))

            answerContent = answer.find(
                'div', class_='zm-editable-content clearfix')
            if answerContent is None:
                # Reported answers have no content block.
                continue

            imgs = answerContent.find_all('img')
            if imgs:
                self._getImgFromAnswer(imgs, Qtitle, **author)

    def _getImgFromAnswer(self, imgs, Qtitle, **author):
        """Download every non-inline image of one answer.

        Files are named '<author>_<n><ext>' inside the question directory,
        where n counts only the images that are actually attempted.
        """
        safeName = _INVALID_NAME_CHARS.sub('#', author['name'])
        counter = 0
        for img in imgs:
            # Skip Zhihu's small inline pictures; tolerate <img> tags that
            # have no class attribute at all.
            if 'inline-image' in img.get('class', []):
                continue
            imgUrl = img.get('src')
            if not imgUrl:
                continue

            counter += 1
            extension = os.path.splitext(imgUrl)[1]
            path_name = os.path.join(
                store_path, Qtitle, safeName + '_' + str(counter) + extension)
            try:
                self.saveImg(imgUrl, path_name)
            except Exception as exc:
                # Best effort: one bad image must not stop the crawl, but
                # report it instead of failing silently.
                print('Failed to save image {0}: {1}'.format(imgUrl, exc))

    def _getTextFromAnswer(self):
        """Placeholder for saving the answer text; not implemented yet."""
        pass


# Run from the command line,
# for example: "zhihu.py 1 5" fetches pages 1 to 5 of the collection.
def parse_page_range(argv):
    """Return ``(first_page, last_page)`` parsed from a ``sys.argv``-style list.

    Args:
        argv: argument list whose first element is the script name.

    Returns:
        A tuple of two ints. With no page arguments it is ``(1, 1)``; with one
        argument that page is both first and last; with two or more, the first
        two arguments are used and the rest are ignored.

    Raises:
        ValueError: if a page argument is not an integer.
    """
    args = argv[1:3]
    if not args:
        return 1, 1
    first = int(args[0])
    last = int(args[1]) if len(args) > 1 else first
    return first, last


if __name__ == '__main__':
    try:
        page, pageEnd = parse_page_range(sys.argv)
    except ValueError:
        # Fail with a usage hint instead of a traceback from the constructor.
        sys.exit('usage: zhihu.py [first_page [last_page]] (page numbers must be integers)')

    spider = zhihuCollectionSpider(page, pageEnd, url)
    spider.start()
Many beginners are unclear about what Python is for. What can C and Python do? Which learning path should you follow, and in which direction should you develop afterwards? To learn more, see the Youdao Cloud Notes link: http://note.youdao.com/noteshare?id=e4fa02e7b56d7909a27674cdb3da08aa