Preface
The text and pictures of the article are from the Internet, only for learning and communication, and do not have any commercial use. The copyright belongs to the original author. If you have any questions, please contact us in time for handling.
Author: tomato ha ha
PS: if you need Python learning materials, you can click the link below to get them by yourself
http://note.youdao.com/noteshare?id=3054cce4add8a909e784ad934f956cef
This series introduces crawling web data with a Python crawler, parsing it, and applying it in practice. I intend to write several articles, starting with the most basic Python crawler syntax, introducing crawlers step by step, and finally building a more complete example.
This series includes:
- Introduction and application of the requests library
- Introduction and application of the BeautifulSoup library
- Regular expression matching and its application
- Example of a dialogue robot
- Building an interface with the tkinter library and packaging a Python program into an executable exe file
This article mainly demonstrates the exe application that I finally made; the main purpose is to raise readers' interest in learning.
Finally, the chat robot can crawl the information in the designated website according to the user's input instructions, parse it in the background, and return the results to the interface.
I put this exe application at the end, which can be run directly on the computer. You can download it to play or send it to your friends to show it.
A screenshot of the running program is shown below:
Users can input instructions such as "tell me a Duan Zi", "tell me a joke", or "find me a good-night sentence on the Internet". For example:
The development tool I use is Spyder. In the following articles, I will gradually explain the principle and implementation method of this example.
The code is attached below. Interested readers can copy it and try running it themselves.
from tkinter import *
import random
import re
import time

import bs4
import requests
from bs4 import BeautifulSoup


def getHTMLText(url):
    """Download *url* and return the response body as text.

    The response encoding is left as detected by requests. On any request
    failure (connection error, timeout, HTTP error status) "faile" is
    printed and an empty string is returned.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        return r.text
    except requests.RequestException:
        print("faile")
        return ""


def fillUnivList(ulist, html):
    """Append one ``[a0, a1, a2]`` entry to *ulist* per <article> in *html*.

    Each entry holds the ``.string`` of the first three <a> tags found
    inside the article. Articles with fewer than three anchors are skipped
    instead of raising IndexError.
    """
    soup = BeautifulSoup(html, "html.parser")
    for article in soup.find_all('article'):
        if not isinstance(article, bs4.element.Tag):
            continue
        anchors = article('a')
        if len(anchors) < 3:
            continue
        ulist.append([anchors[0].string, anchors[1].string, anchors[2].string])


def printUnivList(ulist, k):
    """Return the first field of entry *k* of *ulist*."""
    return ulist[k][0]


def getduanzi():
    """Return a random entry scraped from a random duanziwang.com page.

    A page number in 1..49 is picked at random. If the page could not be
    fetched or yielded no usable entry, an apology string is returned
    rather than raising IndexError.
    """
    uinfo = []
    base = 'http://duanziwang.com/category/%E4%B8%80%E5%8F%A5%E8%AF%9D%E6%AE%B5%E5%AD%90/'
    url = base + str(random.randint(1, 49)) + '/'
    fillUnivList(uinfo, getHTMLText(url))
    if not uinfo:
        return "Sorry, I can't find what you want."
    # Pick among the entries actually scraped; the page is not guaranteed
    # to contain a fixed number of them.
    k = random.randrange(len(uinfo))
    return printUnivList(uinfo, k) or "Sorry, I can't find what you want."


def fill2(ulist, html):
    """Append the plain text of every joke block in *html* to *ulist*.

    A joke block is a <div> with class
    'article block untagged mb15 typs_hot'; its text is taken from the
    <span> tags inside the nested <div class="content">. Blocks without a
    content <div> are skipped.
    """
    soup = BeautifulSoup(html, "html.parser")
    for block in soup.find_all('div', 'article block untagged mb15 typs_hot'):
        if not isinstance(block, bs4.element.Tag):
            continue
        content = block.find('div', 'content')
        if content is None:
            continue
        spans = content('span')
        # str() of the result list looks like "[<span>..</span>, ...]":
        # strip the tags first, then the list brackets and newlines.
        text = re.sub('<[^>]*>', '', str(spans))
        text = re.sub(r'\[|\]|\n', '', text)
        ulist.append(text)


def getjoke():
    """Return a random joke scraped from a random qiushibaike.com page.

    A page number in 1..13 is picked at random. If nothing could be
    scraped, an apology string is returned rather than raising IndexError.
    """
    ulist = []
    url = 'https://www.qiushibaike.com/text/page/' + str(random.randint(1, 13)) + '/'
    fill2(ulist, getHTMLText(url))
    if not ulist:
        return "Sorry, I can't find what you want."
    return str(random.choice(ulist))


def getHTMLText2(url):
    """Download *url* and return its text decoded with the apparent encoding.

    Returns an empty string on any request failure.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ""


def geturl(url, text):
    """Return the href of the first link on *url* whose label occurs in *text*.

    Anchors without a label or without an href are ignored, so an empty
    anchor can no longer match through the literal string 'None'. Returns
    '' when no link matches.
    """
    soup = BeautifulSoup(getHTMLText2(url), "html.parser")
    for anchor in soup.find_all('a'):
        label = anchor.string
        href = anchor.get('href')
        if not label or not label.strip() or not href:
            continue
        if str(label) in text:
            return href
    return ''


def getmoreurl(url):
    """Return the href of a randomly chosen article link found on *url*.

    Article links are the first <a> inside each <li class="articleTitle fl">.
    Returns '' when the page has no such item or the chosen item has no
    usable link.
    """
    soup = BeautifulSoup(getHTMLText2(url), "html.parser")
    items = soup.find_all('li', attrs={'class': 'articleTitle fl'})
    if not items:
        return ''
    anchors = random.choice(items)('a')
    if not anchors:
        return ''
    return anchors[0].get('href', '')


def getsen(url):
    """Return a sentence built from the Chinese text of the first <p> on *url*.

    One line starting at '<p>' is picked at random, its runs of CJK
    characters are extracted, joined with ',' and terminated with '. '.
    An apology string is returned when no paragraph line or no CJK text is
    found.
    """
    soup = BeautifulSoup(getHTMLText2(url), "html.parser")
    lines = re.findall('<p>.*', str(soup.find('p')))
    if not lines:
        return "Sorry, I didn't find what you wanted"
    fragments = re.findall('[\u4e00-\u9fa5]+', random.choice(lines))
    if not fragments:
        return "Sorry, there is a little problem, please try again!"
    # Join every fragment: the earlier text[:-2] slice silently dropped the
    # second-to-last one.
    return ','.join(fragments) + '. '


def getsentance(text):
    """Look up a sentence on siandian.com for a tag mentioned in *text*.

    The tag index page is searched for a link whose label occurs in *text*,
    a random article under that tag is chosen, and a sentence is extracted
    from it. Returns an apology string when any step finds nothing.
    """
    start_url = 'http://www.siandian.com'
    tag_path = geturl('http://www.siandian.com/tags.html', text)
    if tag_path == '':
        return "Sorry, I can't find what you want."
    article_path = getmoreurl(start_url + tag_path)
    if article_path == '':
        return "Sorry, I can't find what you want."
    return getsen(start_url + article_path)


def xiaotang(s):
    """Return the robot's reply to the user message *s*.

    Dispatch is by keyword, first match wins: 'Duan Zi', 'joke',
    'sentence'/'word', two groups of rude words, otherwise a fallback
    reply.
    """
    if 'Duan Zi' in s:
        return getduanzi()
    if 'joke' in s:
        return getjoke()
    if 'sentence' in s or 'word' in s:
        return getsentance(s)
    if 'Fool' in s or 'grass' in s or 'day' in s:
        return "It's a swearing. You can't say it"
    if 'Two' in s or 'garbage' in s or 'Idiot' in s:
        return ('Are you the devil?' + ' !' + '\n') * 10
    return "I don't seem to understand\n"


def main():
    """Build the chat window and run the tkinter event loop."""

    def header(speaker):
        # Speaker prefix followed by the current local time.
        return speaker + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + '\n '

    def line_count(widget):
        # Line part of the END index, e.g. '12.0' -> 12.
        return int(widget.index(END).split(".")[0])

    def pad(widget, other):
        # Add blank lines to *widget* until it is one line longer than
        # *other*, so the two panes stay vertically aligned.
        for _ in range(line_count(other) - line_count(widget) + 1):
            widget.insert(END, '\n')

    def start():
        # Greeting shown in the robot pane at start-up.
        txtget.insert(END, header('Small sugar:'), 'redcolor')
        txtget.insert(END, 'Hello, what can I do for you?')

    def sendMsg():  # send message
        message = txtMsg.get('0.0', END)
        txtMsg.delete('0.0', END)

        # Echo the user's message in the user pane.
        pad(txtMsgList, txtget)
        txtMsgList.insert(END, header('I:'), 'greencolor')
        txtMsgList.insert(END, message)
        txtMsgList.see(END)
        pad(txtget, txtMsgList)
        txtget.see(END)

        # Show the robot's reply in the robot pane.
        reply_header = header('Small sugar:')
        pad(txtget, txtMsgList)
        txtget.insert(END, reply_header, 'redcolor')
        txtget.insert(END, xiaotang(message))
        txtget.see(END)
        pad(txtMsgList, txtget)
        txtMsgList.see(END)

    def cancelMsg():  # cancel message
        txtMsg.delete('0.0', END)

    def sendMsgEvent(event):  # send message event
        sendMsg()
        # Stop the default <Return> handler from inserting a newline into
        # the input box that was just cleared.
        return 'break'

    def scroll_both(*args):
        # Drive both panes from the single scrollbar.
        txtget.yview(*args)
        txtMsgList.yview(*args)

    # Create the window
    root = Tk()
    root.title('Little sugar assistant')

    # Frame containers
    frmLT = Frame(width=500, height=320, bg='#F19C8B')
    frmLC = Frame(width=500, height=150, bg='#F19C8B')
    frmLB = Frame(width=500, height=30, bg='white')
    frmRT = Frame(width=200, height=500, bg='#F19C8B')

    # Controls
    txtMsgList = Text(frmLT, width=40, bd=0)
    txtMsgList.tag_config('greencolor', foreground='#008C00')
    txtMsg = Text(frmLC)
    txtget = Text(frmLT, width=40, bd=0)
    txtget.tag_config('redcolor', foreground='#DC143C')
    start()
    txtMsg.bind('<Return>', sendMsgEvent)
    btnSend = Button(frmLB, text='Send out', width=8, command=sendMsg,
                     bg='#E88384', bd=0)
    btnCancel = Button(frmLB, text='cancel', width=8, command=cancelMsg,
                       bg='#F3ADA0', bd=0)
    scollor = Scrollbar(bg='white')
    scollor.config(command=scroll_both)
    txtget.config(yscrollcommand=scollor.set)
    txtMsgList.config(yscrollcommand=scollor.set)
    try:
        imgInfo = PhotoImage(file="aa.png")
    except TclError:
        # The picture is decorative only; run without it if it is missing.
        lblImage = Label(frmRT, bg='#F19C8B')
    else:
        lblImage = Label(frmRT, image=imgInfo)
        lblImage.image = imgInfo  # keep a reference so the image is not collected

    # Window layout
    frmLT.grid(row=0, column=0, columnspan=2, padx=0, pady=0)
    frmLC.grid(row=1, column=0, columnspan=2, padx=0, pady=0)
    frmLB.grid(row=2, column=0, columnspan=2, padx=0)
    scollor.grid(row=0, column=2, sticky=N+S)
    frmRT.grid(row=0, column=3, rowspan=3, padx=0, pady=0)
    # Fixed size
    frmLT.grid_propagate(0)
    frmLC.grid_propagate(0)
    frmLB.grid_propagate(0)
    frmRT.grid_propagate(0)

    btnSend.grid(row=2, column=0)
    btnCancel.grid(row=2, column=1)
    lblImage.grid()
    txtget.grid(row=0, column=0)
    txtMsgList.grid(row=0, column=1)
    txtMsg.grid()

    # Main event loop
    root.mainloop()


if __name__ == '__main__':
    main()