Crawling and analyzing epidemic data with Python

Crawl the latest epidemic data with Python

Because the data for this epidemic changes constantly, this post explains how to crawl real-time epidemic data, analyze it, and visualize the results.

Summary of the report:

Analysis of epidemic situation in China

1.1 Display the detailed epidemic situation in each province (including the cities in each province)

1.2 find out the cities with new epidemic situation in China

1.3 show the total number of people infected by the epidemic in each province

1.4 draw the cumulative infection distribution map of epidemic situation in China

Analysis of the cumulative trend of the epidemic in China

2.1 draw local cumulative diagnosis trend chart

2.2 Plot the national trend of new infections (in recent months)

Epidemic analysis of other countries in the world

3.1 draw the distribution map of the total number of infected people in various countries around the world

Check how the epidemic has developed in individual countries

Step 1: epidemic data interface acquisition

Tencent News collects and organizes COVID-19 tracking data from the websites of the WHO and Johns Hopkins University. The API endpoint URLs are:

# API endpoints used throughout the article.
url1 = "https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5"
url2 = "https://view.inews.qq.com/g2/getOnsInfo?name=disease_other"
url3 = "https://view.inews.qq.com/g2/getOnsInfo?name=disease_foreign"
# Per-country daily list; the country name is the `country` query parameter.
# The space in the example value is percent-encoded so the literal is a valid URL.
url4 = "https://api.inews.qq.com/newsqa/v1/automation/foreign/daily/list?country=United%20States"

Step 2: import the necessary Libraries

import numpy as np  #Import the necessary library functions
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import requests
import json
from pyecharts.charts import Map, Geo
from PIL import Image

1. Analyze the local epidemic situation in China

Process: 1. First, fetch China's domestic epidemic data from the API.

2. The 'data' field of the JSON response is itself a JSON string, so convert it into a dictionary to make it easier to analyze.

# Fetch the China-wide summary feed and build a two-row table holding the
# cumulative totals ('chinaTotal') and the latest increments ('chinaAdd').
url1 = "https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5"
resp = requests.get(url1)
listdata = resp.json()
# The 'data' field is itself a JSON-encoded string, so it is decoded a second time.
listdata1 = json.loads(listdata['data'])
listtime = listdata1['lastUpdateTime']

# Build both rows with a single constructor call. DataFrame.append (used
# previously) was removed in pandas 2.0, so it must not be relied on.
summary_fields = ['confirm', 'heal', 'dead', 'suspect', 'nowConfirm',
                  'nowSevere', 'importedCase', 'noInfect']
pd_china = pd.DataFrame(
    [listdata1['chinaTotal'], listdata1['chinaAdd']],
    index=['chinaTotal', 'chinaAdd'],
    columns=summary_fields,
)
pd_china['lastUpdateTime'] = listtime

# Replace the API field names and row labels with readable ones.
pd_china = pd_china.rename(columns={
    "confirm": "Cumulative diagnosis",
    "heal": "cure",
    "dead": "Cumulative death",
    "suspect": "Suspected patient",
    "nowConfirm": "Existing patients",
    "importedCase": "Overseas input",
    "noInfect": "asymptomatic patient",
    "lastUpdateTime": "Last updated",
    "nowSevere": "patient in severe condition",
})
pd_china = pd_china.rename(index={"chinaTotal": "China cumulative", "chinaAdd": "New in China"})
pd_china  # notebook-style display of the result

1.1 Clean and analyze the data of domestic provinces and cities

# Flatten the province -> city tree into one record per city.
areaTree = listdata1['areaTree']
china_data = areaTree[0]['children']  # children of the first tree node: one entry per province
china_data
china_list = []
for province_node in china_data:
    # Each province node carries its cities under 'children'.
    for city_node in province_node['children']:
        china_list.append({
            'province': province_node['name'],
            'city': city_node['name'],
            'total': city_node['total'],   # nested record, unpacked later
            'today': city_node['today'],   # nested record, unpacked later
        })
china_data = pd.DataFrame(china_list)
china_data['Last updated'] = listtime
print('Epidemic situation in all provinces')
china_data

# Define data processing functions
def confirm(x):
    """Return the 'confirm' value of a 'total' or 'today' record.

    Args:
        x: The record, either as a dict or as the string form of a dict
            literal.

    Returns:
        The value stored under the 'confirm' key.

    Raises:
        KeyError: If the record has no 'confirm' key.
        ValueError, SyntaxError: If a non-dict ``x`` is not a valid literal.
    """
    if isinstance(x, dict):
        # Already parsed: index it directly instead of round-tripping via eval().
        return x['confirm']
    # Local import keeps this function self-contained.
    import ast
    # literal_eval only accepts Python literals, so it cannot execute arbitrary
    # code the way eval() would on data that came from the network.
    return ast.literal_eval(str(x))['confirm']
def suspect(x):
    """Return the 'suspect' value of a 'total' or 'today' record.

    Args:
        x: The record, either as a dict or as the string form of a dict
            literal.

    Returns:
        The value stored under the 'suspect' key.

    Raises:
        KeyError: If the record has no 'suspect' key.
        ValueError, SyntaxError: If a non-dict ``x`` is not a valid literal.
    """
    if isinstance(x, dict):
        # Already parsed: index it directly instead of round-tripping via eval().
        return x['suspect']
    # Local import keeps this function self-contained.
    import ast
    # literal_eval only accepts Python literals, so it cannot execute arbitrary
    # code the way eval() would on data that came from the network.
    return ast.literal_eval(str(x))['suspect']
def dead(x):
    """Return the 'dead' value of a 'total' or 'today' record.

    Args:
        x: The record, either as a dict or as the string form of a dict
            literal.

    Returns:
        The value stored under the 'dead' key.

    Raises:
        KeyError: If the record has no 'dead' key.
        ValueError, SyntaxError: If a non-dict ``x`` is not a valid literal.
    """
    if isinstance(x, dict):
        # Already parsed: index it directly instead of round-tripping via eval().
        return x['dead']
    # Local import keeps this function self-contained.
    import ast
    # literal_eval only accepts Python literals, so it cannot execute arbitrary
    # code the way eval() would on data that came from the network.
    return ast.literal_eval(str(x))['dead']
def heal(x):
    """Return the 'heal' value of a 'total' or 'today' record.

    Args:
        x: The record, either as a dict or as the string form of a dict
            literal.

    Returns:
        The value stored under the 'heal' key.

    Raises:
        KeyError: If the record has no 'heal' key.
        ValueError, SyntaxError: If a non-dict ``x`` is not a valid literal.
    """
    if isinstance(x, dict):
        # Already parsed: index it directly instead of round-tripping via eval().
        return x['heal']
    # Local import keeps this function self-contained.
    import ast
    # literal_eval only accepts Python literals, so it cannot execute arbitrary
    # code the way eval() would on data that came from the network.
    return ast.literal_eval(str(x))['heal']
# Unpack the nested 'total' / 'today' records into scalar columns, then keep
# only the readable columns under their display names.
derived_columns = {
    'confirm': ('total', confirm),
    'suspect': ('total', suspect),
    'dead': ('total', dead),
    'heal': ('total', heal),
    'addconfirm': ('today', confirm),   # today's 'confirm' = new infections
}
for target, (source, extractor) in derived_columns.items():
    china_data[target] = china_data[source].map(extractor)

kept_columns = ["province", "city", "confirm", "suspect", "dead", "heal", "addconfirm"]
china_data = china_data[kept_columns].rename(columns={
    "province": "province",
    "city": "city",
    "confirm": "Cumulative infection",
    "suspect": "Suspected",
    "dead": "death",
    "heal": "cure",
    "addconfirm": "New infection",
})
print('Specific epidemic situation in all provinces')
china_data['Last updated'] = listtime
china_data

1.2 find out the cities with new epidemic situation in China

# Keep only the cities that report at least one new infection.
has_new_infection = china_data['New infection'] >= 1
china_data1 = china_data.loc[has_new_infection]
china_data1

1.3 Calculate the total number of infected people in each province and draw a bar chart

# Sum the city-level cumulative infections per province and draw them as a
# horizontal bar chart.
area_data = china_data.groupby("province")["Cumulative infection"].sum().reset_index()
area_data.columns = ["province", "Cumulative infection"]
print('\n Comparison of the total number of infected people in various provinces\n')
print(area_data)

# SimHei is set as the sans-serif font so that Chinese province names can be displayed.
matplotlib.rcParams['font.sans-serif'] = ['SimHei']

plt.figure(figsize=(10, 8), dpi=100)  # figure size in inches and resolution
# barh is the documented API for horizontal bars; it replaces calling
# plt.bar with the undocumented orientation='horizontal' keyword.
plt.barh(
    y=area_data['province'],
    width=area_data['Cumulative infection'],
    height=0.5,
    label='Number of people',
    color='red',
    alpha=0.5,
)

# Write each value just past the end of its bar; va controls the vertical
# alignment of the text relative to the bar.
for count, province_name in zip(area_data['Cumulative infection'], area_data['province']):
    plt.text(count + 1, province_name, str(count), va='center', fontsize=10, rotation=0)

plt.title("Total number of infected persons in each province")
plt.xlabel("Number of infected persons")
plt.ylabel("province")
plt.legend(loc="upper right")
plt.show()
plt.close()

1.4 Draw the cumulative infection distribution map of the epidemic in China

from pyecharts.charts import Map
import pyecharts.options as opts
from pyecharts import options
from pyecharts.globals import ChartType

# Pair every province with its cumulative infection count. The counts are
# converted to plain Python ints, as the original per-row int() call did.
provinces = area_data['province'].tolist()
counts = [int(value) for value in area_data['Cumulative infection']]
x = list(zip(provinces, counts))

area_map = Map()
area_map.add("Distribution of epidemic infection in China", x, "china", is_map_symbol_show=False)
# Piecewise colour buckets. The ranges must not overlap and each label must
# describe its own range; the earlier top two buckets (min 1500 / 500-15000)
# overlapped and were labelled '>10000' / '500-1000'.
# NOTE(review): the 10000 boundary is taken from the original '>10000' label — confirm.
area_map.set_global_opts(
    title_opts=opts.TitleOpts(title="Distribution map of cumulative number of infections in China"),
    visualmap_opts=opts.VisualMapOpts(
        is_piecewise=True,
        pieces=[
            {"min": 10000, "label": '>=10000 people', "color": "#6F171F"},
            {"min": 500, "max": 9999, "label": '500-9999 people', "color": "#C92C34"},
            {"min": 100, "max": 499, "label": '100-499 people', "color": "#E35B52"},
            {"min": 10, "max": 99, "label": '10-99 people', "color": "#F39E86"},
            {"min": 1, "max": 9, "label": '1-9 people', "color": "#FDEBD0"},
        ],
    ),
)
area_map.render_notebook()

2. Analyze the cumulative trend of the epidemic in China

# Fetch the day-by-day national series and load it into a DataFrame.
url2 = "https://view.inews.qq.com/g2/getOnsInfo?name=disease_other"
resp = requests.get(url2)
listdata = resp.json()
# The 'data' field is itself a JSON-encoded string, so it is decoded a second time.
listdata2 = json.loads(listdata['data'])
chinaDayList = listdata2['chinaDayList']  # list with one dict per recorded day
i = len(chinaDayList)  # number of recorded days; reused later for tick spacing

# Build the frame in one call: rows are indexed 0..i-1 exactly as before, but
# without DataFrame.append (removed in pandas 2.0) and without re-copying the
# whole frame for every row.
china_date = pd.DataFrame(
    chinaDayList,
    columns=['confirm', 'dead', 'heal', 'nowConfirm', 'nowSevere', 'healRate', 'date'],
)
# NOTE(review): 'nowSevere' is displayed as "Local new" — confirm this is the
# intended source field for the new-infection series.
china_date = china_date.rename(columns={
    "confirm": "Cumulative diagnosis",
    "dead": "Cumulative death",
    "heal": "Cumulative cure",
    "nowConfirm": "Existing diagnosis",
    "nowSevere": "Local new",
    "healRate": "cure rate",
    "date": "date",
})
print('\n Cumulative national epidemic situation (show the last five)')
china_date.tail()

2.1 draw local cumulative diagnosis trend chart

# Plot the cumulative confirmed cases against the date.
# SimHei is set as the sans-serif font (the article uses it to display Chinese text).
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
plt.figure(figsize=(10,4),dpi=90)    # 10x4 inch figure at 90 DPI
plt.xticks(rotation=70)      # Rotate the x tick labels by 70 degrees
x=np.array(china_date['date'])
y=np.array(china_date['Cumulative diagnosis'])
# Place a tick at every 4th position; `i` is the number of daily records
# (len(chinaDayList)) computed when the data was loaded.
plt.xticks(range(0,i,4))
plt.plot(x,y)
plt.title('Cumulative trend of national epidemic situation')
plt.xlabel("Date")
plt.ylabel("Number of infected persons")
plt.show()

2.2 Plot the national trend of new infections (in recent months)

# Plot the 'Local new' column against the date.
# NOTE(review): 'Local new' is the renamed 'nowSevere' field of the daily
# records — confirm that this is the series intended for "new infections".
# SimHei is set as the sans-serif font (the article uses it to display Chinese text).
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
plt.figure(figsize=(10,4),dpi=90)    # 10x4 inch figure at 90 DPI
plt.xticks(rotation=70)      # Rotate the x tick labels by 70 degrees
x=np.array(china_date['date'])
y=np.array(china_date['Local new'])
# Place a tick at every 4th position; `i` is the number of daily records
# (len(chinaDayList)) computed when the data was loaded.
plt.xticks(range(0,i,4))
plt.plot(x,y)
plt.title('National trend of new infections')
plt.xlabel("Date")
plt.ylabel("Number of infected persons")
plt.show()

3. Analysis of epidemic situation in other countries around the world

# Fetch the per-country totals and load them into a DataFrame.
url3 = "https://view.inews.qq.com/g2/getOnsInfo?name=disease_foreign"
resp = requests.get(url3)
listdata = resp.json()
# The 'data' field is itself a JSON-encoded string, so it is decoded a second time.
listdata3 = json.loads(listdata['data'])

areaTree = listdata3['foreignList']  # list with one dict per country
# Build the frame in one call: rows are indexed 0..n-1 exactly as before, but
# without DataFrame.append (removed in pandas 2.0). Only the listed fields are
# kept; any other keys in the records are ignored.
country_list = pd.DataFrame(
    areaTree,
    columns=['name', 'continent', 'y', 'date', 'confirmAdd', 'confirm', 'dead', 'heal', 'nowConfirm'],
)
country_list = country_list.rename(columns={
    "name": "name",
    "continent": "Continent",
    "y": "year",
    "date": "date",
    "confirmAdd": "newly added",
    "confirm": "Infected",
    "dead": "death",
    "heal": "cure",
    "nowConfirm": "Number of existing infections",
})
print('\n Show the first 10 records')
country_list.head(10)

3.1 draw the distribution map of the total number of infected people in various countries around the world

# Create an empty list to hold relevant data for each country
# Collect (country name, confirmed count) pairs for the world map, plus the
# names and counts as two parallel lists.
# The loop variables are deliberately NOT called `name` / `confirm`: binding
# `confirm` at module level would replace the confirm() extractor function
# defined earlier in this script.
info_list = [(country['name'], int(country['confirm'])) for country in areaTree]
name2 = [country_name for country_name, _ in info_list]
confirm2 = [confirmed for _, confirmed in info_list]
# Name mapping passed to the world map as `name_map`: each key is an English
# region name and each value is the name it is mapped to.
# Values that contain an apostrophe are written with double quotes so the
# literal parses.
# NOTE(review): several values differ from their keys only in letter case or
# carry a trailing space; they are kept byte-for-byte here — verify them
# against the country names actually present in the data feed.
nameMap = {
    'Singapore Rep.': 'Singapore',
    'Dominican Rep.': 'Dominican',
    'Palestine': 'Palestine',
    'Bahamas': 'Bahamas',
    'Timor-Leste': 'Timor-Leste',
    'Afghanistan': 'Afghanistan',
    'Guinea-Bissau': 'Guinea-Bissau',
    "Côte d'Ivoire": "Cote d'Ivoire",
    'Siachen Glacier': 'Siachin glacier',
    "Br. Indian Ocean Ter.": 'British Indian Ocean Territory',
    'Angola': 'Angola',
    'Albania': 'Albania',
    'United Arab Emirates': 'The United Arab Emirates',
    'Argentina': 'Argentina',
    'Armenia': 'Armenia',
    'French Southern and Antarctic Lands': 'French Southern Hemisphere and Antarctic Territory',
    'Australia': 'Australia',
    'Austria': 'Austria',
    'Azerbaijan': 'Azerbaijan',
    'Burundi': 'burundi',
    'Belgium': 'Belgium',
    'Benin': 'Benin',
    'Burkina Faso': 'burkina faso ',
    'Bangladesh': "The People's Republic of Bangladesh",
    'Bulgaria': 'Bulgaria',
    'The Bahamas': 'Bahamas',
    'Bosnia and Herz.': 'Bosnia and Herzegovina',
    'Belarus': 'Belarus',
    'Belize': 'Belize ',
    'Bermuda': 'Bermuda',
    'Bolivia': 'bolivia',
    'Brazil': 'Brazil',
    'Brunei': 'Brunei',
    'Bhutan': 'Bhutan',
    'Botswana': 'botswana',
    'Central African Rep.': 'Central African',
    'Canada': 'Canada',
    'Switzerland': 'Switzerland',
    'Chile': 'Chile',
    'China': 'China',
    'Ivory Coast': 'Ivory Coast',
    'Cameroon': 'Cameroon',
    'Dem. Rep. Congo': 'Democratic Republic of the Congo',
    'Congo': 'Congo',
    'Colombia': 'Columbia',
    'Costa Rica': 'Costa Rica',
    'Cuba': 'Cuba',
    'N. Cyprus': 'northern cyprus ',
    'Cyprus': 'Cyprus',
    'Czech Rep.': 'Czech Republic',
    'Germany': 'Germany',
    'Djibouti': 'Djibouti',
    'Denmark': 'Denmark',
    'Algeria': 'Algeria',
    'Ecuador': 'Ecuador',
    'Egypt': 'Egypt',
    'Eritrea': 'Eritrea',
    'Spain': 'Spain',
    'Estonia': 'Estonia',
    'Ethiopia': 'Ethiopia',
    'Finland': 'Finland',
    'Fiji': 'Fei',
    'Falkland Islands': 'Falkland Islands',
    'France': 'France',
    'Gabon': 'Gabon',
    'United Kingdom': 'britain',
    'Georgia': 'Georgia',
    'Ghana': 'Ghana',
    'Guinea': 'Guinea',
    'Gambia': 'Gambia',
    'Guinea Bissau': 'Guinea-Bissau',
    'Eq. Guinea': 'Equatorial Guinea',
    'Greece': 'Greece',
    'Greenland': 'Greenland',
    'Guatemala': 'Guatemala',
    'French Guiana': 'French Guiana',
    'Guyana': 'Guyana',
    'Honduras': 'Honduras',
    'Croatia': 'Croatia',
    'Haiti': 'Haiti',
    'Hungary': 'Hungary',
    'Indonesia': 'Indonesia',
    'India': 'India',
    'Ireland': 'Ireland',
    'Iran': 'Iran',
    'Iraq': 'Iraq',
    'Iceland': 'Iceland',
    'Israel': 'Israel',
    'Italy': 'Italy',
    'Jamaica': 'Jamaica',
    'Jordan': 'Jordan',
    # 'Japan' used to be listed twice ('Japan', then 'Native Japan'); in a dict
    # literal the last duplicate wins, so only that effective entry is kept.
    'Japan': 'Native Japan',
    'Kazakhstan': 'Kazakhstan',
    'Kenya': 'Kenya',
    'Kyrgyzstan': 'Kyrgyzstan',
    'Cambodia': 'Cambodia',
    'Korea': 'the republic of korea',
    'Kosovo': 'Kosovo ',
    'Kuwait': 'Kuwait',
    'Lao PDR': 'Laos',
    'Lebanon': 'Lebanon',
    'Liberia': 'Liberia',
    'Libya': 'Libya',
    'Sri Lanka': 'Sri Lanka',
    'Lesotho': 'Lesotho',
    'Lithuania': 'Lithuania',
    'Luxembourg': 'Luxembourg',
    'Latvia': 'Latvia',
    'Morocco': 'Morocco',
    'Moldova': 'Moldova',
    'Madagascar': 'Madagascar',
    'Mexico': 'Mexico',
    'Macedonia': 'Macedonia',
    'Mali': 'Mali',
    'Myanmar': 'Myanmar',
    'Montenegro': 'Montenegro ',
    'Mongolia': 'Mongolia',
    'Mozambique': 'Mozambique',
    'Mauritania': 'Mauritania',
    'Malawi': 'Malawi',
    'Malaysia': 'Malaysia',
    'Namibia': 'Namibia',
    'New Caledonia': 'New Caledonia ',
    'Niger': 'Niger',
    'Nigeria': 'Nigeria',
    'Nicaragua': 'Nicaragua',
    'Netherlands': 'Netherlands',
    'Norway': 'Norway',
    'Nepal': 'Nepal',
    'New Zealand': 'New Zealand',
    'Oman': 'Oman',
    'Pakistan': 'Pakistan',
    'Panama': 'Panama',
    'Peru': 'Peru',
    'Philippines': 'the Philippines',
    'Papua New Guinea': 'papua new guinea',
    'Poland': 'poland',
    'Puerto Rico': 'Puerto Rico',
    'Dem. Rep. Korea': 'Korea',
    'Portugal': 'Portugal',
    'Paraguay': 'Paraguay',
    'Qatar': 'Qatar',
    'Romania': 'Romania',
    'Russia': 'Russia',
    'Rwanda': 'Rwanda',
    'W. Sahara': 'Western Sahara',
    'Saudi Arabia': 'Saudi Arabia',
    'Sudan': 'Sudan',
    'S. Sudan': 'South Sudan',
    'Senegal': 'Senegal',
    'Solomon Is.': 'Solomon Islands',
    'Sierra Leone': 'sierra leone',
    'El Salvador': 'El Salvador',
    'Somaliland': 'Somaliland',
    'Somalia': 'Somalia',
    'Serbia': 'Serbia',
    'Suriname': 'Suriname',
    'Slovakia': 'Slovakia',
    'Slovenia': 'Slovenia',
    'Sweden': 'Sweden',
    'Swaziland': 'Eswatini',
    'Syria': 'Syria',
    'Chad': 'Chad',
    'Togo': 'Togo',
    'Thailand': 'Thailand',
    'Tajikistan': 'Tajikistan',
    'Turkmenistan': 'Turkmenistan',
    'East Timor': 'Timor-Leste',
    'Trinidad and Tobago': 'Trinidad and Tobago ',
    'Tunisia': 'Tunisia',
    'Turkey': 'turkey',
    'Tanzania': 'Tanzania',
    'Uganda': 'Uganda',
    'Ukraine': 'Ukraine',
    'Uruguay': 'Uruguay',
    'United States': 'U.S.A',
    'Uzbekistan': 'Uzbekistan',
    'Venezuela': 'Venezuela',
    'Vietnam': 'Vietnam',
    'Vanuatu': 'Vanuatu',
    'West Bank': 'west bank',
    'Yemen': 'Yemen',
    'South Africa': 'South Africa',
    'Zambia': 'Zambia',
    'Zimbabwe': 'zimbabwe',
}
# Colour buckets for the number of infected people (largest first).
during = [
    {"min": 10000000},
    {"min": 1000000, "max": 9999999},
    {"min": 100000, "max": 999999},
    {"min": 10000, "max": 99999},
    {"min": 1000, "max": 9999},
    {"min": 0, "max": 999},
]

# The chart is bound to `world_map` rather than `map`, so the builtin map()
# is no longer shadowed for the rest of the script.
world_map = Map(opts.InitOpts(width='1000px', height='600px'))
world_map.add(
    series_name="",
    data_pair=info_list,       # (country, confirmed count) pairs to display
    maptype="world",           # draw on the world map
    name_map=nameMap,          # custom region-name mapping
    is_map_symbol_show=False,  # hide the marker symbols
)
# Hide the per-region text labels.
world_map.set_series_opts(label_opts=opts.LabelOpts(is_show=False))

# is_piecewise: show the visual map as discrete segments; pieces: the range of
# each segment; max_: upper bound of the visual map.
world_map.set_global_opts(
    title_opts=opts.TitleOpts(
        pos_left="40px",
        title="World epidemic distribution map (excluding China)",
        subtitle_textstyle_opts=opts.TextStyleOpts(font_size=20),
    ),
    legend_opts=opts.LegendOpts(selected_mode='multiple'),
    visualmap_opts=opts.VisualMapOpts(max_=10000000, is_piecewise=True, pieces=during),
)
world_map.render_notebook()

4. Check the epidemic development of various countries

# Ask for a country and download its daily epidemic history.
name = input('Countries to view: ')
# The per-country endpoint takes the country name as the `country` query value.
url = 'https://api.inews.qq.com/newsqa/v1/automation/foreign/daily/list?country='
url4 = url + str(name)
resp = requests.get(url4)
listdata = resp.json()
listForeign = pd.DataFrame(listdata['data'])
listForeign = listForeign.rename(columns={
    "y": "year",
    "date": "date",
    "confirm_add": "New infections",
    "confirm": "Total number of infected persons",
    "heal": "cure",
    "dead": "Total deaths",
})

# Convert the "MM.DD" dates to "MM-DD" and prefix the year, giving one 'time'
# column. This is done column-wise; the previous row-by-row loop collected the
# values in a variable named `list`, which shadowed the builtin.
month_day = listForeign['date'].str.replace('.', '-', regex=False)
listForeign['time'] = listForeign['year'].astype(str) + '-' + month_day
listForeign = listForeign.drop(columns=['date', 'year'])

# Keep only the days that reported new infections.
listForeign = listForeign[listForeign['New infections'] > 0]
listForeign

# Plot the selected country's daily new infections against time.
# SimHei is set as the sans-serif font (the article uses it to display Chinese text).
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
plt.figure(figsize=(12, 4), dpi=90)  # 12x4 inch figure at 90 DPI
plt.xticks(rotation=70)              # rotate the x tick labels by 70 degrees
i = len(listForeign)
x = np.array(listForeign['time'])
y = np.array(listForeign['New infections'])
plt.xticks(range(0, i, 20))          # one tick every 20 records
plt.plot(x, y, color='red', label='New infections')
# A separating space keeps the country name from running into the title text.
plt.title(name + ' New trend chart of infection')
plt.xlabel("Date")
plt.ylabel("New infections")
# The line is given a label, so draw the legend that displays it.
plt.legend()
plt.show()

Keywords: Python Back-end Data Analysis

Added by kritikia on Thu, 06 Jan 2022 16:29:38 +0200