Crawling the ancient Chinese poetry site and popular Douban book reviews with regular expressions

1, Crawling the ancient Chinese poetry site (gushiwen.org)

import re
import requests


def handler_url(url):
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36"
    }
    # headers must be passed as a keyword argument; passing the dict
    # positionally would send it as query parameters instead
    response = requests.get(url, headers=header)
    text = response.text
    # each poem sits in a <div class="cont"> block: the title is inside <b>,
    # the dynasty and author come from the <a> tags in <p class="source">,
    # and the poem body is the <div class="contson"> block
    titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
    years = re.findall(r'<p\sclass="source"><a.*?>(.*?)</a>', text, re.DOTALL)
    authors = re.findall(r'<p\sclass="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    contents_tmp = re.findall(r'<p\sclass="source">.*?<div\sclass="contson"\s.*?>(.*?)</div>', text, re.DOTALL)
    contents = []
    for c in contents_tmp:
        # strip the inline tags (<br>, <p>, ...) left inside the poem body
        x = re.sub(r'<.*?>', '', c)
        contents.append(x.strip())
    poems = []
    for title, year, author, content in zip(titles, years, authors, contents):
        poems.append({
            "title": title,
            "year": year,
            "author": author,
            "content": content
        })
    for poem in poems:
        print(poem)


def main():
    # crawl the first six listing pages
    base_url = "https://www.gushiwen.org/default_{}.aspx"
    for i in range(1, 7):
        url = base_url.format(i)
        handler_url(url)


if __name__ == '__main__':
    main()
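
A note on the regular expressions above: a toy two-poem snippet (hypothetical, not taken from the real page) is enough to show why re.DOTALL and the non-greedy .*? matter here.

import re

# hypothetical stand-in for the real page HTML, for illustration only
snippet = ('<div class="cont">\n  <b>Poem A</b>\n</div>\n'
           '<div class="cont">\n  <b>Poem B</b>\n</div>')

# re.DOTALL lets "." match the newlines between the tags; without it the
# pattern could never reach the <b> tag on the next line.
print(re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', snippet, re.DOTALL))
# ['Poem A', 'Poem B']

# A greedy ".*" swallows everything up to the LAST <b>, collapsing the two
# poems into a single match:
print(re.findall(r'<div\sclass="cont">.*<b>(.*?)</b>', snippet, re.DOTALL))
# ['Poem B']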


2, Crawling popular book reviews on Douban

import re
import requests

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36"
}


def get_detail_page(url):
    # collect the links to the individual review pages from one listing page
    resp = requests.get(url, headers=HEADERS)
    text = resp.text
    detail_pages = re.findall(r'<div\sclass="main-bd">.*?<a\shref="(.*?)">.*?</a>', text, re.DOTALL)
    return detail_pages


def handle_detail_page(url):
    resp = requests.get(url, headers=HEADERS)
    text = resp.text
    # the review title is the first <span> inside <div class="article">;
    # the review body is the run of <p> tags inside <div id="link-report">
    titles = re.findall(r'<div\sclass="article">.*?<span.*?>(.*?)</span>', text, re.DOTALL)
    articles_tmp = re.findall(r'<div\sid="link-report">.*?<p>.*</p>', text, re.DOTALL)
    articles = []
    for a in articles_tmp:
        # strip all HTML tags, then drop line breaks and spaces
        x = re.sub(r'<.*?>', '', a)
        articles.append(x.replace('\r', '').replace('\n', '').replace(' ', ''))
    book = {
        'title': titles,
        'article': articles
    }
    print(book)


def main():
    urls = []
    base_url = "https://book.douban.com/review/best/?start={}"
    for i in range(0, 41, 20):
        urls.append(base_url.format(i))
    for listing_url in urls:
        detail_urls = get_detail_page(listing_url)
        for detail_url in detail_urls:
            handle_detail_page(detail_url)


if __name__ == '__main__':
    main()
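
As a quick sanity check on the pagination in main(), the snippet below just prints the three listing URLs that range(0, 41, 20) produces; the step of 20 matches the page size used in the loop above.

base_url = "https://book.douban.com/review/best/?start={}"
for i in range(0, 41, 20):
    print(base_url.format(i))
# https://book.douban.com/review/best/?start=0
# https://book.douban.com/review/best/?start=20
# https://book.douban.com/review/best/?start=40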



3, Conclusion

Neither of these two sites applies anti-crawling measures to the pages used here, so fetching the page content is relatively simple; the real work lies in parsing the data. I have been studying regular expressions recently, so I picked these two small projects for practice. Both could also be done with BeautifulSoup or the lxml library, and I will try parsing the data with those methods when I have time~
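
As a rough illustration of the BeautifulSoup route, here is a minimal sketch of the poetry crawl done with bs4 instead of regular expressions. The class names "cont" and "contson" are simply taken from the regex patterns above, and everything else is the standard bs4 API, so treat it as a starting point rather than a tested replacement.

import requests
from bs4 import BeautifulSoup

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36"
}


def parse_with_bs4(url):
    # let BeautifulSoup build a parse tree instead of matching raw HTML
    resp = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(resp.text, "html.parser")
    poems = []
    # class names taken from the regex patterns used earlier
    for cont in soup.find_all("div", class_="cont"):
        title_tag = cont.find("b")
        body_tag = cont.find("div", class_="contson")
        if title_tag is None or body_tag is None:
            continue  # skip blocks that reuse the "cont" class without a poem
        poems.append({
            "title": title_tag.get_text(strip=True),
            # get_text() drops the inner tags that the regex version
            # had to strip with re.sub(r'<.*?>', '', ...)
            "content": body_tag.get_text(strip=True),
        })
    return poems


if __name__ == '__main__':
    for poem in parse_with_bs4("https://www.gushiwen.org/default_1.aspx"):
        print(poem)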
