Data acquisition for building a knowledge graph of ancient Chinese poetry

Data acquisition

Poetry data

Source: the xungushici.com ancient poetry website

The poetry section of the site is used to build the poetry knowledge graph; poems from five dynasties (Tang, Song, Yuan, Ming and Qing) are crawled.

The poems are crawled one dynasty at a time, page by page; the paging URLs follow a regular pattern.

The URL of the first page of Tang Dynasty poems has the following format: https://www.xungushici.com/shicis/cd-tang-p-1
Page 2: https://www.xungushici.com/shicis/cd-tang-p-2

Given this pattern, the pages can be crawled one after another.

The data to be obtained include: Poetry name, poetry writing Dynasty, author, poetry content, poetry classification, translation, annotation, appreciation and creation background

poem.py: obtains the poem title, dynasty, author, poem content, translation, appreciation and creation background

import requests
from bs4 import BeautifulSoup
from lxml import etree

# Browser-like User-Agent sent with every request (listing pages and detail pages).
headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
pom_list = []  # one dict per poem; read by the Excel export that follows
k = 1  # running poem counter, used only for the progress message

# Common XPath prefix of the content column on a poem detail page.
detail_root = '/html/body/div[1]/div/div[1]'


def _join_numbered_paragraphs(selector, section_path, first, last):
    """Concatenate the text of p[first]..p[last] found under section_path.

    Collection stops at the first paragraph index that yields no text nodes.
    Every text node is followed by a newline in the returned string.
    """
    collected = ""
    for index in range(first, last + 1):
        pieces = selector.xpath(section_path + '/p[%d]' % index + '/text()')
        if pieces == []:
            break
        for piece in pieces:
            collected = collected + piece + "\n"
    return collected


for i in range(10001, 10654):
    url = 'https://www.xungushici.com/shicis/cd-qing-p-' + str(i)
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content.decode('utf-8'), 'html.parser')

    hed = soup.find('div', class_='col col-sm-12 col-lg-9')
    # Named "cards" rather than "list" so the builtin is not shadowed.
    cards = hed.find_all('div', class_="card mt-3")

    for card in cards:
        content = {}
        # 1.1 Link to the poem's detail page
        href = card.find('h4', class_='card-title').a['href']
        real_href = 'https://www.xungushici.com' + href
        # 2.1 Fetch the detail page with the same headers and the same
        # explicit UTF-8 decoding as the listing page.
        detail = requests.get(real_href, headers=headers)
        selector = etree.HTML(detail.content.decode('utf-8'))
        poem_card = detail_root + '/div[1]/div'
        # 2.2 Title
        xtitle = selector.xpath(poem_card + '/h3/text()')[0]
        # 2.3 Dynasty
        desty = selector.xpath(poem_card + '/p/a/text()')[0]
        # 2.4 Author: taken from the <span> when present, otherwise from the second link
        author_nodes = selector.xpath(poem_card + '/p/span/text()')
        if len(author_nodes) == 0:
            author = selector.xpath(poem_card + '/p/a[2]/text()')[0]
        else:
            author = author_nodes[0]
        # 2.5 Poem text: <p> children when present, otherwise the div's own text nodes
        body_nodes = selector.xpath(poem_card + '/div[1]/p/text()')
        if len(body_nodes) == 0:
            body_nodes = selector.xpath(poem_card + '/div[1]/text()')
        ans = "".join(
            node.replace("\r", "").replace("\t", "").replace("\n", "")
            for node in body_nodes
        )
        # 2.6 Translation: paragraphs p[2]..p[7] of the second section
        trans = _join_numbered_paragraphs(selector, detail_root + '/div[2]/div[2]', 2, 7)
        # 2.7 Appreciation: paragraphs p[1]..p[18] of the third section
        appear = _join_numbered_paragraphs(selector, detail_root + '/div[3]/div[2]', 1, 18)
        # 2.8 Creation background: every paragraph of the fourth section
        background = selector.xpath(detail_root + '/div[4]/div[2]/p/text()')
        text_back = "".join(line + "\n" for line in background)

        content['title'] = xtitle
        content['desty'] = desty
        content['author'] = author
        content['content'] = ans
        content['trans_content'] = trans
        content['appear'] = appear
        content['background'] = text_back
        pom_list.append(content)
        print("The first" + str(k) + "individual")
        k = k + 1

import xlwt

# Write the scraped poems to a one-sheet workbook: a header row followed by
# one row per entry of pom_list, with the columns in the order listed below.
xl = xlwt.Workbook()
sheet1 = xl.add_sheet('sheet1', cell_overwrite_ok=True)

poem_columns = ("title", 'desty', 'author', 'content', 'trans_content', 'appear', 'background')

for column, field in enumerate(poem_columns):
    sheet1.write(0, column, field)

for row, poem in enumerate(pom_list, start=1):
    for column, field in enumerate(poem_columns):
        sheet1.write(row, column, poem[field])
xl.save("qing3.xlsx")
# print(pom_list)

Classification and annotations are crawled separately at a later stage and added to the data:

import requests
from bs4 import BeautifulSoup
from lxml import etree

# Browser-like User-Agent sent with every request.
headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
pom_list = []  # one {'title', 'zhu'} dict per poem; read by the Excel export that follows
k = 1  # running poem counter, used only for the progress message
for i in range(1, 1000):
    url = 'https://www.xungushici.com/shicis/cd-yuan-p-' + str(i)
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content.decode('utf-8'), 'html.parser')

    hed = soup.find('div', class_='col col-sm-12 col-lg-9')
    # Named "cards" rather than "list" so the builtin is not shadowed.
    cards = hed.find_all('div', class_="card mt-3")

    for card in cards:
        content = {}
        # 1.1 Title and detail-page link of each poem on the listing page
        title_link = card.find('h4', class_='card-title').a
        real_href = 'https://www.xungushici.com' + title_link['href']
        title = title_link.text
        print(title)
        # 2.1 Fetch the poem's detail page
        r2 = requests.get(real_href, headers=headers)
        soup2 = BeautifulSoup(r2.content.decode('utf-8'), 'html.parser')

        zhu = ""
        # find() yields None (never an empty list) when nothing matches, so
        # both lookups are checked against None before being dereferenced.
        card_div = soup2.find('div', class_='card mt-3')
        card_body = None if card_div is None else card_div.find('div', class_='card-body')
        if card_body is not None:
            in_notes = False
            for paragraph in card_body.find_all('p'):
                strong = paragraph.find('strong')
                if strong is not None and strong.text == 'notes':
                    # The heading paragraph itself is skipped; every paragraph
                    # after it is kept as HTML.
                    in_notes = True
                    continue
                if in_notes:
                    zhu = zhu + str(paragraph)
        if len(zhu) == 0:
            zhu = "nothing"
        content['title'] = title
        content['zhu'] = zhu
        pom_list.append(content)
        print("The first" + str(k) + "individual")
        k = k + 1

import xlwt

# Write the titles (column 0) and annotations (column 12) of pom_list to a
# one-sheet workbook, with a header row on top.
xl = xlwt.Workbook()
sheet1 = xl.add_sheet('sheet1', cell_overwrite_ok=True)

annotation_columns = ((0, "title"), (12, 'zhu'))

for column, field in annotation_columns:
    sheet1.write(0, column, field)

for row, entry in enumerate(pom_list, start=1):
    for column, field in annotation_columns:
        sheet1.write(row, column, entry[field])
xl.save("yuan.xlsx")
# print(pom_list)

Acquiring the poetic form, the time of writing, and the Ci tune (cipai) and Qu tune (qupai) names

poem_data.py obtains the time at which each poem was written: it runs CRF segmentation over the creation background and the appreciation text, extracts the time words, and filters them.

#According to the appreciation and background of ancient poetry, the specific creation time of poetry is extracted

import re
from pyhanlp import *
import pandas as pd
# Part-of-speech tag for person names: "nr"
# Part-of-speech tag for place names: "ns"
# Part-of-speech tag for institution names: "nt"

def demo_CRF_lexical_analyzer(text):
    """Return the words of *text* that the CRF segmenter tags as 't' or 'm'.

    A new "crf" segmenter is created on every call. The result is a list of
    plain strings in the order the segmenter produced them.
    """
    segmenter = HanLP.newSegment("crf")
    wanted_natures = ('t', 'm')
    return [
        str(term.word)
        for term in segmenter.seg(text)
        if str(term.nature) in wanted_natures
    ]

from xlrd import open_workbook
from xlutils.copy import copy

#Re write the classification results to the original excel in
def write_to(data,file):
    """Store *data* in column 9 of the first sheet of the workbook *file*.

    The workbook is opened, copied into a writable workbook, given a "data"
    header in row 0 followed by one value per row, and saved back to the same
    path. The number of values is printed first.
    """
    print(len(data))
    workbook = copy(open_workbook(file))
    sheet = workbook.get_sheet(0)

    target_column = 9
    sheet.write(0, target_column, "data")
    for row, value in enumerate(data, start=1):
        sheet.write(row, target_column, value)

    workbook.save(file)

#Gets the folder under the specified folder excel
import os
def get_filename(path,filetype):
    """List the files below *path* whose extension equals *filetype*.

    *filetype* includes the leading dot, for example '.xlsx'. The whole tree
    under *path* is walked; the result holds bare file names (extension
    included, directory part omitted).
    """
    return [
        filename
        for _root, _dirs, filenames in os.walk(path)
        for filename in filenames
        if os.path.splitext(filename)[1] == filetype
    ]

if __name__ == '__main__':
    # For every workbook under data/, pick one writing-time word per poem and
    # store the results back into that workbook through write_to().
    file = 'data/'
    # The file names are iterated directly so the builtin "list" is not rebound.
    for workbook_name in get_filename(file, '.xlsx'):
        newfile = file + workbook_name
        pome_time = []
        print("start" + str(newfile))
        data = pd.read_excel(newfile).fillna("nothing")
        appear = data.appear
        back = data.background
        # At most the first 5000 rows of each workbook are processed.
        maxn = min(len(appear), 5000)
        for i in range(maxn):
            print("The first" + str(i + 1) + "Number:")
            app = appear[i]
            bk = back[i]
            if app == "nothing" and bk == "nothing":
                pome_time.append("nothing")
                print("nothing")
                continue
            app_time = demo_CRF_lexical_analyzer(app)
            bk_time = demo_CRF_lexical_analyzer(bk)

            # Words from the background text take priority over words from the
            # appreciation text; only words containing a digit qualify.
            chosen = "nothing"
            for word in bk_time + app_time:
                if re.search(r'\d', word):
                    chosen = word
                    break
            print(chosen)
            pome_time.append(chosen)

        write_to(pome_time, newfile)

An analysis of the form of poetry and the names of Ci and Qu

Poetic forms: seven-character regulated verse, seven-character quatrain, seven-character verse, five-character regulated verse, five-character quatrain, five-character verse

After crawling the web pages, analyze the poem titles to obtain the corresponding Ci and Qu tune names

Final result:

 

Total: 480122

Tang: 48330; Song: 200000; Yuan: 39240; Ming: 100000; Qing: 92550

Poet data

Source: the poet section of the ancient poetry website

As with the poetry section, the poets are crawled page by page, together with the dynasty each poet belongs to

Poet information to be obtained: name, dynasty, profile, portrait image URL, and the poet's life story

import requests
from bs4 import BeautifulSoup
from lxml import etree
import re

# Browser-like User-Agent sent with every request.
headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
pom_list = []  # one dict per poet; read by the Excel export that follows
k = 1  # running poet counter, used only for the progress message

origin_url = 'https://www.xungushici.com'
# Image URL stored when a poet card carries no portrait link.
default_src = "http://www.huihua8.com/uploads/allimg/20190802kkk01/1531722472-EPucovIBNQ.jpg"
# Tab labels on the poet detail page that lead to the life-story page.
life_tab_labels = ("Character life", "character", "Life")
number_pattern = re.compile(r'\d+')

for i in range(1, 2010):
    url = 'https://www.xungushici.com/authors/p-' + str(i)
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content.decode('utf-8'), 'html.parser')

    hed = soup.find('div', class_='col col-sm-12 col-lg-9')
    # Named "cards" rather than "list" so the builtin is not shadowed.
    cards = hed.find_all('div', class_="card mt-3")

    for card in cards:
        content = {}
        # 1.1 Dynasty (first link) and poet name (second link) of the card title
        title = card.find('h4', class_='card-title')
        title_links = title.find_all('a')
        poemauthor = title_links[1].text
        desty = title_links[0].text
        # Portrait image, or the fallback when the card has none
        portrait_link = card.find('a', class_='ml-2 d-none d-md-block')
        if portrait_link is not None:
            src = portrait_link.img['src']
        else:
            src = default_src
        content['src'] = src
        # Poet detail page
        real_href = origin_url + title_links[1]['href']
        # Profile text; the first number in its link text is stored as 'num'
        summary = card.find('p', class_='card-text')
        text = summary.text
        num = number_pattern.findall(summary.a.text)[0]

        # Open the detail page and look for a life-story tab
        r2 = requests.get(real_href, headers=headers)
        soup2 = BeautifulSoup(r2.content.decode('utf-8'), 'html.parser')
        ul = soup2.find('ul', class_='nav nav-tabs bg-primary')
        if ul is None:
            exp = "nothing"
        else:
            exp = ""
            for tab in ul.find_all('li', class_='nav-item'):
                if tab.a.text in life_tab_labels:
                    r3 = requests.get(origin_url + tab.a['href'], headers=headers)
                    soup3 = BeautifulSoup(r3.content.decode('utf-8'), 'html.parser')
                    sections = soup3.select('body > div.container > div > div.col.col-sm-12.col-lg-9 > div:nth-child(3) > div.card > div')
                    # When several elements match, the last one is kept.
                    for section in sections:
                        exp = section.get_text().replace('\n', '').replace('\t', '').replace('\r', '')
        content['author'] = poemauthor
        content['produce'] = text
        content['num'] = num
        content['desty'] = desty
        content['experience'] = exp
        pom_list.append(content)
        print("The first" + str(k) + "individual")
        k = k + 1

import xlwt

# Write the scraped poets to a one-sheet workbook: a header row followed by
# one row per entry of pom_list, with the columns in the order listed below.
xl = xlwt.Workbook()
sheet1 = xl.add_sheet('sheet1', cell_overwrite_ok=True)

poet_columns = ("author", 'produce', 'num', 'experience', 'src', 'desty')

for column, field in enumerate(poet_columns):
    sheet1.write(0, column, field)

for row, poet in enumerate(pom_list, start=1):
    for column, field in enumerate(poet_columns):
        sheet1.write(row, column, poet[field])
xl.save("author3.xlsx")

Re-crawl the poet's life-story field, this time keeping its HTML markup so that it can be displayed directly in the front end

import requests
from bs4 import BeautifulSoup
from lxml import etree
import re

# Browser-like User-Agent sent with every request.
headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
pom_list = []  # one {'author', 'experience'} dict per poet; read by the Excel export that follows
k = 1  # running poet counter, used only for the progress message

origin_url = 'https://www.xungushici.com'
# Tab labels on the poet detail page that lead to the life-story page.
life_tab_labels = ("Character life", "character", "Life")

for i in range(1, 2010):
    url = 'https://www.xungushici.com/authors/p-' + str(i)
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content.decode('utf-8'), 'html.parser')

    hed = soup.find('div', class_='col col-sm-12 col-lg-9')
    # Named "cards" rather than "list" so the builtin is not shadowed.
    cards = hed.find_all('div', class_="card mt-3")

    for card in cards:
        content = {}
        # 1.1 Poet name and detail-page link (second link of the card title)
        title = card.find('h4', class_='card-title')
        author_link = title.find_all('a')[1]
        poemauthor = author_link.text
        real_href = origin_url + author_link['href']

        # Open the detail page and look for a life-story tab
        r2 = requests.get(real_href, headers=headers)
        soup2 = BeautifulSoup(r2.content.decode('utf-8'), 'html.parser')
        ul = soup2.find('ul', class_='nav nav-tabs bg-primary')
        if ul is None:
            exp = "nothing"
        else:
            exp = ""
            for tab in ul.find_all('li', class_='nav-item'):
                if tab.a.text in life_tab_labels:
                    r3 = requests.get(origin_url + tab.a['href'], headers=headers)
                    soup3 = BeautifulSoup(r3.content.decode('utf-8'), 'html.parser')
                    bodies = soup3.select('body > div.container > div > div.col.col-sm-12.col-lg-9 > div:nth-child(3) > div.card > div.card-body')
                    # Keep the raw HTML of the first match. A page whose layout
                    # does not match the selector no longer aborts the crawl.
                    if bodies:
                        exp = str(bodies[0])
        content['author'] = poemauthor
        content['experience'] = exp
        pom_list.append(content)
        print("The first" + str(k) + "individual")
        k = k + 1

import xlwt

# Write the poet names (column 0) and life-story HTML (column 3) of pom_list
# to a one-sheet workbook, with a header row on top.
xl = xlwt.Workbook()
sheet1 = xl.add_sheet('sheet1', cell_overwrite_ok=True)

life_columns = ((0, "author"), (3, 'experience'))

for column, field in life_columns:
    sheet1.write(0, column, field)

for row, poet in enumerate(pom_list, start=1):
    for column, field in life_columns:
        sheet1.write(row, column, poet[field])
xl.save("author_new.xlsx")

From the poet's profile, extract the birth year, death year, courtesy name (zi) and art name (hao)

#According to the poet's personal profile: born, died, word, number

import pandas as pd
import re
from xlrd import open_workbook
from xlutils.copy import copy

def read_author():
    """Derive birth year, death year, "zi" and "hao" from each poet profile.

    Reads ``data2/author.xlsx``, parses the ``produce`` (profile) column of
    every row and writes the results back into the same workbook as columns
    6-9 (``begin_time``, ``end_time``, ``zi``, ``hao``). A value that cannot
    be extracted is stored as the string "nothing".
    """
    file = "data2/author.xlsx"
    data = pd.read_excel(file).fillna("nothing")

    bg = []   # birth year per poet
    ed = []   # death year per poet
    zi = []
    hao = []

    for i, it in enumerate(data.produce, start=1):
        print("The first" + str(i) + "A poet:")

        # Years: the first number in the profile is taken as the birth year.
        # The death year is the first later number that has at least as many
        # digits and is more than 15 larger.
        datas = re.findall(r"\d+", it)
        if len(datas) >= 2:
            birth = datas[0]
            bg.append(birth + "year")
            death = "nothing"
            for candidate in datas[1:]:
                if len(candidate) >= len(birth) and int(candidate) - int(birth) > 15:
                    death = candidate + "year"
                    break
            ed.append(death)
        else:
            bg.append("nothing")
            ed.append("nothing")

        # "zi": stored as plain text, like "hao" below, instead of the raw
        # list returned by findall.
        ztext = ", ".join(re.findall(r".*word(.*?)[,|. ]", it))
        zi.append(ztext if ztext else "nothing")

        # "hao": the repr of the match list with brackets and quotes removed.
        htext = str(re.findall(r".*number(.*?)[,|. ]", it)).replace('"','').replace('"','').replace('[','').replace(']','').replace('\'','')
        hao.append(htext if htext else "nothing")

    # Copy the workbook, add the four columns and save over the original file.
    xl = open_workbook(file)
    excel = copy(xl)
    sheet1 = excel.get_sheet(0)

    sheet1.write(0, 6, "begin_time")
    sheet1.write(0, 7, "end_time")
    sheet1.write(0, 8, "zi")
    sheet1.write(0, 9, "hao")
    for row, values in enumerate(zip(bg, ed, zi, hao), start=1):
        for offset, value in enumerate(values):
            sheet1.write(row, 6 + offset, value)

    excel.save(file)


if __name__ == '__main__':
    read_author()

Poet friend

Get the poet's name of the corresponding Dynasty from the poet's personal experience

#According to the poet's personal profile: born, died, word, number

import pandas as pd
import re
from xlrd import open_workbook
from xlutils.copy import copy

def read_author():
    """Derive birth year, death year, "zi" and "hao" from each poet profile.

    Reads ``data2/author.xlsx``, parses the ``produce`` (profile) column of
    every row and writes the results back into the same workbook as columns
    6-9 (``begin_time``, ``end_time``, ``zi``, ``hao``). A value that cannot
    be extracted is stored as the string "nothing".
    """
    file = "data2/author.xlsx"
    data = pd.read_excel(file).fillna("nothing")

    bg = []   # birth year per poet
    ed = []   # death year per poet
    zi = []
    hao = []

    for i, it in enumerate(data.produce, start=1):
        print("The first" + str(i) + "A poet:")

        # Years: the first number in the profile is taken as the birth year.
        # The death year is the first later number that has at least as many
        # digits and is more than 15 larger.
        datas = re.findall(r"\d+", it)
        if len(datas) >= 2:
            birth = datas[0]
            bg.append(birth + "year")
            death = "nothing"
            for candidate in datas[1:]:
                if len(candidate) >= len(birth) and int(candidate) - int(birth) > 15:
                    death = candidate + "year"
                    break
            ed.append(death)
        else:
            bg.append("nothing")
            ed.append("nothing")

        # "zi": stored as plain text, like "hao" below, instead of the raw
        # list returned by findall.
        ztext = ", ".join(re.findall(r".*word(.*?)[,|. ]", it))
        zi.append(ztext if ztext else "nothing")

        # "hao": the repr of the match list with brackets and quotes removed.
        htext = str(re.findall(r".*number(.*?)[,|. ]", it)).replace('"','').replace('"','').replace('[','').replace(']','').replace('\'','')
        hao.append(htext if htext else "nothing")

    # Copy the workbook, add the four columns and save over the original file.
    xl = open_workbook(file)
    excel = copy(xl)
    sheet1 = excel.get_sheet(0)

    sheet1.write(0, 6, "begin_time")
    sheet1.write(0, 7, "end_time")
    sheet1.write(0, 8, "zi")
    sheet1.write(0, 9, "hao")
    for row, values in enumerate(zip(bg, ed, zi, hao), start=1):
        for offset, value in enumerate(values):
            sheet1.write(row, 6 + offset, value)

    excel.save(file)


if __name__ == '__main__':
    read_author()

The results obtained by poet friends are:

 

 

Poet information

 

 

Poets

 

 

Collect all the names

import requests
from bs4 import BeautifulSoup
from lxml import etree

# Browser-like User-Agent sent with every request.
headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}

hc = []   # common (group) names in page order; read by the export that follows
dic = {}  # common name -> comma-separated poet names

url = 'https://www.xungushici.com/authors'
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.content.decode('utf-8'), 'html.parser')
orign_href = 'https://www.xungushici.com'

hecheng = soup.find('div', id='divHeCheng')
# Named "badges" rather than "list" so the builtin is not shadowed.
badges = hecheng.find_all('li', class_="m-1 badge badge-light")

# The first badge is skipped, as in the original index range.
for badge in badges[1:]:
    href = orign_href + badge.a['href']
    hecehng = badge.a.text
    hc.append(hecehng)

    r2 = requests.get(href, headers=headers)
    soup2 = BeautifulSoup(r2.content.decode('utf-8'), 'html.parser')
    pomdiv = soup2.find('div', class_='col col-sm-12 col-lg-9')
    cards = pomdiv.find_all('div', class_='card mt-3')

    # The poet name is the second link of each card title.
    author_list = [
        card.find('h4', class_='card-title').find_all('a')[1].text
        for card in cards
    ]
    dic[hecehng] = ",".join(author_list)

import xlwt

# Write one row per common name (column 0) with its poets (column 1), save the
# workbook, then print the same pairs.
xl = xlwt.Workbook()
sheet1 = xl.add_sheet('sheet1', cell_overwrite_ok=True)

sheet1.write(0, 0, "hc")
sheet1.write(0, 1, 'author')
for row, group_name in enumerate(hc, start=1):
    sheet1.write(row, 0, group_name)
    sheet1.write(row, 1, dic[group_name])

xl.save("common_name.xlsx")


for group_name in hc:
    print(group_name + ": " + dic[group_name])

Afterwards, this table is read and a common-name column is added to the poet table:

 

 

Poet track information

Source of the trajectory information: the poet's life story, from which time, place, people and events are extracted

Add a custom dictionary (author names and dynasty era names) so that names and time words are segmented correctly

By tagging the part of speech of each sentence, find the time words, person names, place names and actions

So as to realize the analysis of the poet's individual

# coding:utf-8
#Code content
#Draw the poet's personal experience: time, place, characters and events, and draw it from these four aspects
import re
from pyhanlp import *
import pandas as pd
# Part-of-speech tag for person names: "nr"
# Part-of-speech tag for place names: "ns"
# Part-of-speech tag for institution names: "nt"

import os
#Get all file names under the folder
def get_filename(path,filetype):
    """Collect the names of all files under *path* with extension *filetype*.

    *filetype* is compared against the extension including its dot (for
    example '.xlsx'). Sub-directories are searched as well; only the file
    name, without its directory, is returned for each match.
    """
    matches = []
    for _current_dir, _subdirs, filenames in os.walk(path):
        matches.extend(
            candidate
            for candidate in filenames
            if os.path.splitext(candidate)[1] == filetype
        )
    return matches

#Add custom corpus: author name, Dynasty and year
def add_user_dict():
    """Register author names and era terms in HanLP's custom dictionary.

    Author names come from the ``author`` column of
    ``./data2/author_new.xlsx`` and are added with nature "nr". Time terms
    come from the ``year_hao`` column of every ``.xlsx`` file found under
    ``data3/`` and are added with nature "t".
    """
    CustomDictionary = JClass("com.hankcs.hanlp.dictionary.CustomDictionary")

    # Author names
    for author_name in pd.read_excel('./data2/author_new.xlsx').author:
        CustomDictionary.add(author_name, "nr")

    # Time words: gather them from all workbooks first, then register them
    folder = 'data3/'
    era_terms = []
    for workbook_name in get_filename(folder, '.xlsx'):
        era_terms.extend(pd.read_excel(folder + workbook_name).year_hao)
    for term in era_terms:
        CustomDictionary.add(term, "t")

#Process the author's key information: time, person, place, event
def key_print(lists,text,new_author):
    """Record one (time, people, places, events) entry for a tagged sentence.

    *lists* holds "word/nature" tokens, *text* is the sentence they came from
    and *new_author* is the name used when the sentence mentions no person.
    An entry is recorded only when the sentence has at least one time word
    plus at least one verb or place, and a time can be determined: either a
    time word containing a digit and "year" (the last such word wins) or, if
    there is none, the time of the previously recorded entry.

    One value is appended to each of the module-level lists ``data_list``,
    ``author_list``, ``where_list`` and ``things_list``; nothing is returned.
    """
    time = []
    where = []
    author = []
    move = []
    for it in lists:
        # Split on the last "/" so a word that itself contains "/" keeps its tag.
        word, _, nature = it.rpartition('/')
        if nature == 't':
            time.append(word)
        elif nature == 'nr':
            author.append(word)
        elif nature == 'ns':
            where.append(word)
        elif nature == 'v':
            move.append(word)

    if len(time) == 0 or (len(move) == 0 and len(where) == 0):
        return

    # Time
    newtime = ""
    for it in time:
        if bool(re.search(r'\d',it)) and it.find('year')!=-1:
            newtime = it
    if newtime != "":
        data_list.append(newtime)
    elif len(data_list) != 0:
        # No dated time in this sentence: reuse the most recent recorded time.
        data_list.append(data_list[len(data_list)-1])
    else:
        # A time must be available before anything else is recorded.
        return

    # People: duplicates removed while keeping first-seen order, so the stored
    # text is the same on every run.
    if len(author) != 0:
        author_list.append(",".join(dict.fromkeys(author)))
    else:
        author_list.append(new_author)

    # Places
    if len(where) != 0:
        where_list.append(",".join(where))
    else:
        where_list.append("nothing")

    # Events: every fragment of the sentence that contains one of the verbs
    if len(move) != 0:
        thing_list = re.split('[,. ;]+',text)
        things = [
            fragment
            for v in move
            for fragment in thing_list
            if fragment.find(v) != -1
        ]
        things_list.append(",".join(dict.fromkeys(things)))
    else:
        things_list.append("nothing")


#use crf Model extraction part of speech
def demo_CRF_lexical_analyzer(text,new_author):
    """Tag one sentence with the CRF segmenter and hand the result to key_print.

    Only terms whose nature is one of n, nr, v, nz, ns or t are kept, as
    "word/nature" strings. If the sentence has a time word containing a digit,
    that word is remembered in the global ``bg_time`` and the tokens are
    passed on. If it has no such time word but does have a place name, the
    remembered ``bg_time`` is appended as a time token first. Otherwise
    nothing is passed on.

    NOTE(review): ``bg_time`` is not initialised in this function; where it
    first receives a value is not visible here.
    """
    global bg_time
    segmenter = HanLP.newSegment("crf")
    terms = segmenter.enableCustomDictionaryForcing(True).seg(text)

    kept_natures = ['n','nr','v','nz','ns','t']
    tagged = []
    has_dated_time = False
    has_place = False
    for term in terms:
        nature = str(term.nature)
        if nature not in kept_natures:
            continue
        tagged.append(str(term.word)+"/"+nature)
        if nature == 't':
            if bool(re.search(r'\d',term.word)):
                # Remembered so that later sentences without a dated time can reuse it
                bg_time = str(term.word)
                has_dated_time = True
        elif nature == 'ns':
            has_place = True

    if not has_dated_time:
        if not has_place:
            return
        tagged.append(bg_time+"/t")
    key_print(tagged,text,new_author)
import xlwt

#Analyze the poet's personal life
def author_identity(text,author):
    """Analyse a poet's life text sentence by sentence and save the entries.

    *text* is split on ". " and every piece is passed to
    demo_CRF_lexical_analyzer together with *author*. If ``data_list`` is
    non-empty afterwards, the module-level lists ``data_list``,
    ``author_list``, ``where_list`` and ``things_list`` are written row by
    row to ``./author2/<author>.xlsx``.

    NOTE(review): the four lists are not reset in this function.
    """
    for sentence in text.split(". "):
        demo_CRF_lexical_analyzer(sentence,author)

    if len(data_list) == 0:
        return

    xl = xlwt.Workbook()
    sheet1 = xl.add_sheet('sheet1', cell_overwrite_ok=True)

    for column, header in enumerate(("data", "author", "wheres", "things")):
        sheet1.write(0, column, header)

    for index in range(len(data_list)):
        row = index + 1
        sheet1.write(row, 0, data_list[index])
        sheet1.write(row, 1, author_list[index])
        sheet1.write(row, 2, where_list[index])
        sheet1.write(row, 3, things_list[index])

    xl.save("./author2/"+author+".xlsx")

#Clean the author, as long as it contains the author information of personal life
def read_author():
    """Collect the poets whose life text is present.

    Reads ``./data2/author_new.xlsx`` with empty cells replaced by 'nothing'.
    For every row whose ``experience`` is not 'nothing', the name is appended
    to the module-level ``new_author_name`` and the text to
    ``new_author_experience``.
    """
    frame = pd.read_excel('./data2/author_new.xlsx').fillna('nothing')
    for name, experience in zip(frame.author, frame.experience):
        if experience == 'nothing':
            continue
        new_author_name.append(name)
        new_author_experience.append(experience)

#test
def read_one():
    """Manual test: run ``author_identity`` on a hard-coded biography of Han Shizhong."""
    # The sample spans several physical lines and contains apostrophes and
    # double quotes, so it has to be a triple-quoted literal; a plain
    # single-quoted literal ends at the first apostrophe / line break and
    # makes the whole module unparsable.  The text itself is kept verbatim.
    text="""Han Shizhong, a young general, came from an ordinary peasant family. Since childhood, I like to practice martial arts and study hard and seriously. He had great strength in his youth. He is honest and kind-hearted and likes to uphold justice. Do not admire fame. When Han Shizhong was 16 or 7 years old, he was tall, energetic and brave. Someone in his hometown told him that he should serve his country as a soldier with such good Kung Fu. So at the age of 17, he joined the army and became a soldier. Han Shizhong's troops were stationed in the northwest and often clashed with the Xixia army. Han Shizhong took part in the battle soon after he joined the army. Because of his bravery in the battle, he was promoted from a soldier to a small captain, managing only a dozen people. Although his official position is small, Han Shizhong is still active and responsible. The soldiers he leads are older than him, but Han Shizhong is brave, fair and decent, and speaks with integrity, so everyone listens to him. Once, the song army attacked a city in Xixia. Han Shizhong's eyes turned red. A man climbed the wall and rushed in, killed the enemy leader guarding the city and threw his head out of the city. Inspired, the song army rushed up and attacked the city. Soon, the son-in-law, the supervisor of the Xixia king, led the Xia army to fight back against the song army, which was timid. Han Shizhong asked about the identity and role of the son-in-law, and then led several soldiers who dared to die into the enemy array. The sudden impact made the enemy panic. Han Shizhong rushed into the enemy array and went straight to the Marshal's account. Before the Xixia soldiers understood, he cut off the head of the son-in-law of the Xixia supervisor. Xixia soldiers were in chaos and scrambled to flee. Song generals praised Han Shizhong's bravery, saying that although he was young, he was a rare general. 
Therefore, the economic envoy reported to the court and asked for an exceptional promotion of Han Shizhong. However, Tong Guan, who presided over the border affairs at that time, doubted the authenticity of the report and only agreed to give Han Shizhong a promotion. In 1121 (the third year of Xuanhe), the troops sent by the song regime fought with the Jin soldiers in the south of Yanshan. Several soldiers and horses were defeated by the golden soldiers. Han Shizhong led more than 50 people to patrol on the Hutuo River. Unfortunately, he met the Jinbing brigade. The golden soldier is the main force of a cavalry of 2000 people. Han Shizhong was calm and decisive. He told his soldiers, "panic is death. Don't move. I'll arrange everything." He asked a team leader named sug to lead some people to seize the high slope, line up on it and watch without moving. More than ten knights were sent to organize the scattered song army who were preparing to cross the river, and hundreds of them were asked to line up to beat drums and shout. Then, he led several dare to die knights and rushed straight into the golden army formation. He cut down the golden soldiers who beat the flag. After killing several, the rest who raised the flag put down the flag one after another. The sergeant song by the river beat drums and shouted, "the golden soldiers are defeated! The golden soldiers are defeated!" At that moment, the golden soldiers were in chaos. Su Ge led the cavalry occupying the highland to kill them from top to bottom. The golden soldiers left hundreds of bodies and fled north one after another. Han Shizhong chased another way before he stopped the mount. In October 1126 (the first year of Jingkang), Han Shizhong, who was defending the Hutuo River, was chased by tens of thousands of Jin soldiers and retreated into Zhaozhou city. The enemy soldiers besieged the city. There are few soldiers in the city, lack of food and unstable morale. 
Some people advocate abandoning the city and fleeing. Han Shizhong sends an order. Those who dare to abandon the city will be beheaded. That night, it was snowing heavily. Han Shizhong chose 300 elite soldiers and left the city quietly. He sneaked into the camp of the commander of the Jinbing siege and killed the commander. Then he sneaked into the Jinbing camp and provoked the wrong attack and killing inside the Jinbing. During the one night war, more than half of the Jin soldiers were killed and wounded. When they learned that the Lord general was killed, they saw the bodies of their brothers everywhere. The blood dyed the snow red. The Jin soldiers had no intention to fight again and scattered and retreated. Han Shizhong has persisted in the anti Jin struggle in Hebei for several years. His rank is not high, and he leads few troops, but he is invincible and invincible. Therefore, his prestige intimidates the Jin soldiers. After rescuing Gaozong Jingkang, Kaifeng fell. Song Huizong and song qinzong were captured by Jin Bing. Zhao Gou, the ninth son of Emperor Huizong, became emperor in Nanjing (Shangqiu) for Emperor Gaozong of Song Dynasty. This was the first emperor of the small court of the Southern Song Dynasty. Zhao Gou doesn't want to make a difference. He just wants to live in peace. After taking office in Shangqiu, he was chased by Jinbing all the way, running from Shangqiu to Yangzhou, from Yangzhou to Hangzhou, and finally to the sea to avoid Jinbing. Under the resistance of Yue Fei and other generals, Jin Bing withdrew from the south of the Yangtze River, and Zhao Gou returned to Hangzhou from the sea. In order to calm public opinion, he dismissed Wang Boyan and Huang Qianshan, prime ministers of the capitulationists, Ren Zhu SHENGFEI as prime minister, Wang Yuan in charge of the Privy Council, and LV Yihao as an envoy to appease Jiangdong. 
General Miao Fu and Liu Zhengyan, dissatisfied with the imperial court, launched a mutiny, killed Wang Yuan and eunuch Kang Lu, and forced Gaozong to give way to his three-year-old son. LV Yihao asked Han Shizhong, Zhang Jun and other generals to quell the rebellion and rescue Gaozong. Han Shizhong didn't have many soldiers around him. He collected scattered soldiers around Yancheng, organized dozens of troops and came to Changshu from the sea. Zhang Jun and others entered Xiuzhou, and then pretended to suspend the army and stopped moving forward. Actually secretly prepare for the siege. Knowing that Han Shizhong was coming, Miao Fu and Liu Zhengyan captured Han Shizhong's wife Liang Hongyu as hostages. Prime Minister Zhu SHENGFEI has pretended to yield to Miao and Liu and told them that instead of forcing Han Shizhong to fight, he would not send Liang Hongyu to comfort Han Shizhong. As long as Han Shizhong can fall, great things can be accomplished! Miao and Liu sure enough asked messengers to follow Liang Hongyu to see Han Shizhong. Liang returned to her husband. After the messenger arrived, Han Shizhong burned the imperial edict, cut down the messenger and ordered to attack Hangzhou. Han Shizhong defeated the rebel defense forces in Beiguan, Hangzhou. Miao and Liu were frightened and led 2000 main forces to escape. Han Shizhong rescues Gaozong Zhao Gou. Gaozong tells him that Wu Zhan, Miao and Liu, the Chinese army in the palace, are a gang. If the thief is not eliminated, the palace will be disturbed. Just at this time, Wu Zhan led the troops to the former Song Dynasty to meet Han Shizhong. Han Shizhong held hands with Han Shizhong. Han Shizhong had great strength. He pinched Wu Zhan's finger, ordered him to take it, and killed him in the city Cao together with other traitors. The rebellion of Miao and Liu in Suiping stabilized the small court of the Southern Song Dynasty. Han Shizhong made the greatest contribution. 
From then on, he became a confidant of Gaozong and was appointed as the festival envoy of wusheng army and the control of Yuying left army. The pacification established Han Shizhong's reputation and status among the generals of the Southern Song Dynasty. In 1129 (the third year of Jianyan), Jin Bing went south again, broke through the natural danger of the Yangtze River, broke through important towns such as Jiankang (now Nanjing), and song Gaozong Zhao Gou hiding in Hangzhou had to escape. Han Shizhong met Gao Zong and said generously, "the country has lost Hebei, Hedong and Shandong. Where else can we go if we lose Jianghuai again?"Zhao Gou couldn't listen at all. All he wanted was to save his life. Zhao Gou appointed Han Shizhong as the institutional envoy of Western Zhejiang to defend Zhenjiang, while Zhao Gou fled to the sea with the forces of surrender. Zhenjiang was already behind the enemy at that time. Han Shizhong took orders and led only 8000 people to rush to Zhenjiang.Jin Bing retreated one after another after plundering in the south of the Yangtze River for a while. Han Shizhong stationed in Songjiang, Jiangwan and Haikou. Hearing the news of Jin Bing's retreat, Han Shizhong immediately dispatched troops to guard the important places and prepared to take the opportunity to kill Jin Bing. The song soldiers in ambush almost caught Marshal Wu Shu alive. Wu Shu is a belligerent general. He gave Han Shizhong a Book of war and made an appointment to fight. Han Shizhong agreed a date with the enemy and fought in the river. Because Jin Bing didn't learn water warfare, Han Shizhong took advantage of the enemy's weakness to block the Yangtze River. He defeated Jin Bing several times and captured Wu Shu's son-in-law King Longhu alive. Wu Shu did not dare to fight again. He led 100000 troops and horses back to Huang tiandang in an attempt to escape across the north of the river from here. Huang tiandang is a broken port in the river. It has long been abandoned. 
There is only a way in, not a way out. Han Shizhong saw that Jin Bing had strayed into the wrong way, so he seized this rare opportunity. After Jin Bing entered, he immediately led his soldiers to block the exit. Wu Shu and Jin Bing were trapped in Huang tiandang and had no way to retreat. Seeing that 100000 soldiers were about to starve to death, Wu Shu sent envoys to make peace with Han Shizhong. They were willing to return all the looted property and offer a BMW to Han Shizhong. On this condition, Han Shizhong refused to change his way back. Wu Shu had to offer a large reward. Wu Shu bought a good strategy from a traitor. There is an old stork River in huangtiandang, which leads directly to Jiankang Qinhuai River. It is silted up because it has not been used for a long time. Send someone to dig it through and escape from the waterway. Wu Shu sent people to dig through the river overnight in an attempt to enter Jiankang from the waterway. Passing through Niutou mountain, Yue Fei, who had just recovered Jiankang, stationed an army here. Seeing the enemy coming out from here, he immediately mobilized a large army to attack, and Wu Shu had to return to Huang tiandang. Han Shizhong is ready to kill the enemy. He sends people to make iron ropes and hooks. Once he meets the enemy ship, he will destroy it. Seeing that the enemy had nothing to do but wait to die, another traitor offered advice to Jin Bing and taught them to concentrate rockets on firing sails and burning song warships when they sailed by the song army, so that they could escape from Huang tiandang. Wu Shu was overjoyed and acted according to the plan. As expected, many song army ships were burned. Jin Bing took the opportunity to rush out of Huang tiandang, escape north across the Yangtze River and withdraw to the area north of the Yellow River. Han Shizhong used only 8000 troops, trapped 100000 troops and horses in Huang tiandang, fought for 48 days and annihilated more than 10000 enemies. 
The war was of great significance, aroused the anti gold sentiment of the Jianghuai people, and made the people see that the gold soldiers were not terrible. Han Shizhong defeated the enemy skillfully because of the Huang tiandang campaign, and his majestic posture and general style spread all over the Jianghuai region. In 1134 (the fourth year of Shaoxing), Han Shizhong, Ren Jiankang, Zhenjiang and huaidong Xuanfu envoys were stationed in Zhenjiang. After Yue Fei recovered Xiangyang (now Xiangfan, Hubei Province) and other six counties, the puppet Qi Lord Liu Yu sent people to Jin for help. Jin Taizong Wan Yan Sheng ordered Marshal Zuo Jianjun, and WAN Yan zongbi led 50000 troops to join forces with the puppet Qi army from Huaiyang(Southwest of Pixian County, Jiangsu Province)In other places, the troops divided into two routes and attacked the Song Dynasty in the south. He tried to go to Chuzhou (now Anhui) with cavalry, Chengzhou (now Gaoyou, Jiangsu) with infantry, and then cross the river to attack Lin'an (now Hangzhou, Zhejiang). On September 26, the Jin army attacked Chuzhou (now Huai'an, Jiangsu). Han Shizhong, the Xuanfu envoy of huaidong in Song Dynasty, retreated from Chengzhou to Zhenjiang (now Jiangsu). The Song court dispatched Wei Liangchen, the Minister of the Ministry of works, to the Jin army to beg for peace, and ordered Han Shizhong to go to Yangzhou from the north of the town river to prevent the Jin army from crossing the river. On the fourth day of October, after Han Shizhong led his troops to Yangzhou, he immediately ordered the Ministry of Jieyuan to defend Chengzhou and invite Jin army infantry; He led the cavalry to Dayi town (now the northwest of Yangzhou, Jiangsu) to resist the golden cavalry. On the 12th, Wei Liangchen and his party passed Yangzhou. Han Shizhong deliberately showed the instruction to avoid the enemy and guard the river and pretended to return to Zhenjiang. 
After Wei Liangchen left, Han Shizhong immediately led Jing to Dayi Town, divided the troops into five formations in a swamp area, ambushed more than 20 places and prepared to attack the Jin army. The next day, Jin learned from Wei Liangchen that Han Shizhong had retreated to Zhenjiang, so he ordered the Ministry to ride hundreds of horses, such as talbo, to Jiangkou near Yangzhou and to the east of Dayi town. Han Shizhong personally rode lightly to challenge and lure the Jin army into the ambush area. Song ambushed everywhere. The Jin army was caught off guard and did nothing with his bow and knife. Han Shizhong ordered Jingqi to encircle and attack, and ordered the Weibei army to hold a long axe, chop people's chest up and horse's feet down. The Jin army was trapped in the mud, with heavy casualties. More than 200 people, including Jin general TA Bo, were captured, and most of the rest were annihilated. The good news spread to Hangzhou, and the ministers congratulated him. Gaozong ordered Han Shizhong and his ministries to reward them for their achievements. The victory of Dayi town was one of the 13 meritorious deeds of the Southern Song Dynasty. At that time, some commentators believed that it was the first martial arts of ZTE in the Southern Song Dynasty. Denouncing Qin Hui, there was always a struggle between resistance and surrender in the Southern Song Dynasty. The main war factions represented by generals such as Yue Fei and Han Shizhong refused to compromise and surrender and opposed peace talks with Kim; The literati forces led by Qin Hui and other literary ministers tried to settle in a corner. Therefore, they opposed the war of resistance, advocated proper agreement and peace, and finally went to the road of knee bending surrender. Han Shizhong, no matter how many soldiers he leads, never fears the golden soldiers. No matter where he is, he moves when he hears the police and fights when he sees the enemy. 
He insisted on crossing the Yangtze River and the Yellow River to recover all lost land. In 1140, under the situation of Jin Bing's wanton invasion to the south, Han Shizhong led a few troops to surround Huaiyang occupied by Jin Bing, and then defeated Jin Bing. The main force of Jin Bing was Fukou town. During this period, the Anti Japanese War faction had a slight advantage. Han Shizhong was granted the title of Taibao, the Duke of England and the envoy of Henan and Hebei provinces. While Han Shizhong was recruiting troops and expanding his team, the situation turned sharply downward. The capitulationists won the support of song Gaozong, because the anti Jin army led by Yue Fei has gained great momentum in the Central Plains. Song Gaozong was worried that once he defeated Jin Bing, he would welcome back his father Emperor (Huizong) and brother (qinzong). Therefore, with his support, Qin Hui took over the military power of three anti Jin generals, Han Shizhong, Yue Fei and Zhang Jun. Qin Hui sent 12 gold medals in a day and ordered Yue Fei, who was at the forefront of the anti gold war, to go back to Lin'an. Because Han Shizhong was kind to song Gaozong, he was promoted as a secret envoy, which was actually depriving him of his military power. Yue Fei and his son were arrested and sent to prison. Qin Hui dominated the government, and no one dared to speak. However, Han Shizhong ignored this. He saw Qin Hui face to face and scolded him face to face: "what is the crime of Yue Fei and his son? Why are they detained?" Qin Hui replied, "although Fei Ziyun and Zhang Xianshu are unknown, the matter is unnecessary.". Han Shizhong denounced:“'Unnecessary'Can three words convince the world? " His good friend advised him that if he offended Qin Hui, he could not escape revenge in the future, but Han Shizhong said, "now I am attached to the traitor for myself. Won't I be punished by the iron staff of Taizu after death?" 
When Han Shizhong saw that Yue Fei and his son were executed, the excellent anti gold situation was lost in vain, and there was nothing he could do, he resolutely resigned from the post of privy envoy and drank away his worries all day. In his old age, he likes to release and grow old, and he calls himself a cool and cool recluse. Finally, he died of worry and anger and died in 1151 (the 21st year of Shaoxing)."""
    author='Han Shizhong'
    author_identity(text, author)

#Start processing
if __name__ == '__main__':
    add_user_dict()
    # Filled by read_author(): only authors that have a biography entry.
    new_author_name = []
    new_author_experience = []
    read_author()
    # Report how many authors will be processed.
    print("altogether:"+str(len(new_author_name)))
    for i, (author, text) in enumerate(zip(new_author_name, new_author_experience)):
        # Forget the previous author's last seen time before starting a new one.
        bg_time = ""
        print("The first"+str(i)+"individual")
        # Fresh result columns per author; author_identity() writes them out.
        data_list = []
        author_list = []
        where_list = []
        things_list = []
        author_identity(text,author)

The results are as follows: the analysis results of Li Bai's life

 

Cipai (ci tune name) data

Get source

 

Crawling collection

 

import requests
from bs4 import BeautifulSoup
from lxml import etree

headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}#Create header information
# Names collected from the anchors on the cipai listing pages.
cipai=[]

# The listing is paged: /cipais/p1 ... /cipais/p6.
for i in range(1,7):
    url='https://www.xungushici.com/cipais/p'+str(i)
    # A timeout keeps one stalled connection from hanging the whole crawl.
    r=requests.get(url,headers=headers,timeout=30)
    content=r.content.decode('utf-8')
    soup = BeautifulSoup(content, 'html.parser')

    hed=soup.find('ul',class_='list-unstyled d-flex flex-row flex-wrap align-items-center w-100')
    if hed is None:
        # The expected <ul> is missing (layout change, error page, ...):
        # report it and move on instead of failing on None.find_all.
        print("no cipai list found on "+url)
        continue

    # Not named `list`: rebinding the builtin at module level would break
    # every later list(...) call in the same module.
    items=hed.find_all('li',class_="m-1 badge badge-light")

    for it in items:
        # Some <li> entries carry no link; only linked entries are kept.
        if it.a is not None:
            cipai.append(it.a.text)

import xlwt

# Dump the collected names into a single-column sheet.
xl = xlwt.Workbook()
sheet1 = xl.add_sheet('sheet1', cell_overwrite_ok=True)

# Row 0 is the column header; the names follow from row 1 downwards.
for row, value in enumerate(["title"] + cipai):
    sheet1.write(row, 0, value)

xl.save("cipai_name.xlsx")

Display effect

 

Qupai name data

Crawling collection

import pandas as pd
import xlwt



#read yuan Poetry of modern times
def read(file):
    """Extract the distinct qupai names from a poem workbook and save them.

    A title of the form ``<qupai>·<poem name>`` contributes the part before
    the first '·'.  The distinct names are written, sorted, to the single
    column ``qu_name`` of ``qupai_name.xlsx``.

    :param file: path of an Excel workbook that has a ``title`` column
    """
    data=pd.read_excel(file)
    qu_names=set()
    for it in data.title:
        # A blank title cell is read as NaN (a float), which has no string
        # methods; such rows carry no qupai name and are skipped.
        if isinstance(it, str) and '·' in it:
            qu_names.add(it.split('·')[0])
    # Sorted so that the output rows come out in the same order on every run
    # (plain set iteration order changes between interpreter runs).
    new_qu=sorted(qu_names)

    #Save the qupai names
    xl = xlwt.Workbook()
    sheet1 = xl.add_sheet('sheet1', cell_overwrite_ok=True)
    sheet1.write(0, 0, "qu_name")

    for row, name in enumerate(new_qu, start=1):
        sheet1.write(row, 0, name)

    xl.save("qupai_name.xlsx")

Display effect:

 

Flying flower order

source

 

 

Crawling collection

import requests
from bs4 import BeautifulSoup
from lxml import etree

headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}#Create header information

# NOTE(review): `hc` is never used in the visible code -- confirm before removing.
hc=[]

url='https://www.xungushici.com/feihualings'
# A timeout keeps a stalled connection from hanging the script forever.
r=requests.get(url,headers=headers,timeout=30)
content=r.content.decode('utf-8')
soup = BeautifulSoup(content, 'html.parser')
ul=soup.find('ul',class_='list-unstyled d-flex flex-row flex-wrap align-items-center w-100')
if ul is None:
    # The expected <ul> is missing (layout change, error page, ...):
    # report it instead of failing on None.find_all.
    print("no keyword list found on "+url)
    li_list=[]
else:
    li_list=ul.find_all('li',class_='m-1 badge badge-light')
# Keyword text taken from the anchor of every list entry.
word=[]
for it in li_list:
    # Entries without a link have no keyword text to read.
    if it.a is not None:
        word.append(it.a.text)

import xlwt

# Dump the collected keywords into a single-column sheet.
xl = xlwt.Workbook()
sheet1 = xl.add_sheet('sheet1', cell_overwrite_ok=True)

# Row 0 is the column header; the keywords follow from row 1 downwards.
for row, value in enumerate(["word"] + word):
    sheet1.write(row, 0, value)

xl.save("word.xlsx")

Display effect

 

Poem - flying flower order

Traverse the 500,000 crawled ancient poems and check whether each sentence contains any of the flying flower order keywords. If it does, store the sentence, its author, the poem title and the matched keywords.

Difficulty: xlwt can store at most 65,536 rows per sheet, whereas openpyxl can store about 1 million rows (1,048,576). Because our verse data is so large, we need to use openpyxl for storage.

import pandas as pd
import xlwt
import openpyxl

#Read flying order
def read_word():
    """Return the ``word`` column of ``data2/word.xlsx`` (the flying flower order keywords)."""
    return pd.read_excel('data2/word.xlsx').word

#Traversal verse
def read(file,words,write_file):
    """Find the verse sentences that contain keywords and save them to Excel.

    Every poem's content has its newlines removed and is cut on '. '.  A
    sentence that contains at least one keyword is stored together with the
    poem's author, the poem's title and the comma-joined matching keywords.

    :param file: path of an Excel workbook with ``title``, ``content`` and
        ``author`` columns
    :param words: iterable of keywords to look for
    :param write_file: path of the .xlsx file to create
    """
    data=pd.read_excel(file)

    # Filter once, outside the loops: a blank cell is read as NaN (a float)
    # and cannot be searched for, and an empty string would match everything.
    keywords = [k for k in words if isinstance(k, str) and k]

    # One tuple per hit, already in output column order.
    rows = []
    for i, (tit, cont, aut) in enumerate(zip(data.title, data.content, data.author)):
        print("The first"+str(i)+"individual")
        # A poem whose content cell is blank comes back as NaN; nothing to scan.
        if not isinstance(cont, str):
            continue
        #Cut out a single sentence
        for it in cont.replace('\n','').split('. '):
            key_list = [k for k in keywords if k in it]
            if key_list:
                rows.append((it, aut, tit, ",".join(key_list)))

    #Store the corresponding sentence, author, title, keys
    xl = openpyxl.Workbook()
    # NOTE(review): create_sheet(index=0) inserts a new first sheet; the
    # workbook's default sheet stays behind it, empty -- confirm that is intended.
    sheet1 = xl.create_sheet(index=0)
    for col, header in enumerate(("sentens", "author", "title", "keys"), start=1):
        sheet1.cell(1, col, header)

    # openpyxl rows and columns are 1-based; row 1 is the header.
    for row_no, row in enumerate(rows, start=2):
        for col, value in enumerate(row, start=1):
            sheet1.cell(row_no, col, value)
    xl.save(write_file)
    print("Saved successfully to-"+write_file)

#Get the Excel files under the specified folder
import os
def get_filename(path,filetype):  # Enter path, file type, for example'.xlsx'
    """Return the sorted names of all files under ``path`` with a given suffix.

    The whole tree below ``path`` is walked.  Only the bare file name (with
    its suffix) is returned, without the directory it was found in.

    :param path: directory to search
    :param filetype: suffix to keep, including the dot, e.g. ``'.xlsx'``
    :return: list of matching file names in ascending order
    """
    name = []
    for root,dirs,files in os.walk(path):
        name.extend(f for f in files if os.path.splitext(f)[1]==filetype)
    # The order in which the OS lists directory entries is arbitrary; sorting
    # makes the result (and anything numbered from it) reproducible.
    name.sort()
    return name            # Outputs a list of file names with suffixes

if __name__ == '__main__':
    # Folder holding the crawled poem workbooks.
    file='data/'
    words=read_word()
    # Not named `list`: rebinding the builtin at module level would break
    # every later list(...) call in the same module.
    file_names = get_filename(file, '.xlsx')
    # Create the output folder up front so the save at the end of a long
    # scan cannot fail just because the folder is missing.
    os.makedirs("sentences", exist_ok=True)
    # Output files are numbered from 1 in the order the inputs are listed.
    for i, name in enumerate(file_names, start=1):
        new_file=file+name
        print(new_file)
        sentences_file = "sentences/sentence" + str(i) + ".xlsx"
        read(new_file,words,sentences_file)

Display effect

 

 

 

Summary

The data above is the basis for constructing the knowledge graph of ancient Chinese poetry. From this data, entity nodes can be created next, followed by the relationships between entities, and both can be stored in the neo4j database. Of course, the data can also be imported into a MySQL database first.

To make the later front-end display easier, the neo4j database is well suited to showing the relationships between entities and to building a question-answering system.

Added by straycat on Sat, 01 Jan 2022 22:10:41 +0200