Beautiful Soup 4 converts a complex HTML document into a tree structure in which every node is a Python object. All of these objects fall into four types:
- Tag
- NavigableString
- BeautifulSoup
- Comment
Test file: baidu.html (HTML taken from baidu.com)
Save the file content below as baidu.html in the root directory of the project
File content
<!DOCTYPE html>
<html>
<head>
<meta content="text/html;charset=utf-8" http-equiv="content-type" />
<meta content="IE=Edge" http-equiv="X-UA-Compatible" />
<meta content="always" name="referrer" />
<link href="https://ss1.bdstatic.com/5eN1bjq8AAUVYm2zgY3K/r/ww/cache/bdorz/baidu.min.css" rel="stylesheet" type="text/css" />
<title>use Baidu Search,You know</title>
</head>
<body link="#0000cc">
<div id="wrapper">
<div id="head">
<div class="head_wrapper">
<div id="u1">
<a class="mnav" href="http://news.baidu.com" name="tj_trnews"><!--news--></a>
<a class="mnav" href="http://news.baidu.com" name="tj_trnews">News</a>
<a class="mnav" href="https://www.hao123.com" name="tj_trhao123" ">hao123</a><a class="mnav" href="http://map.baidu.com" name="tj_trmap">Map</a>
<a class="mnav" href="http://v.baidu.com" name="tj_trvideo">Video</a>
<a class="mnav" href="http://tieba.baidu,com" name="tj_trtieba">Post Bar</a>
<a class="bri" href="//www.baidu.com/more/" name="tj_briicon" style="...">More products</a>
</div>
</div>
</div>
</div>
</body>
</html>
Screenshot of file open in browser:
Convert html file into tree structure bs
1. Tag: a tag together with its content. Accessing a tag by name (for example bs.title) returns only the first matching tag it finds
from bs4 import BeautifulSoup

# Open the document in binary mode; the with-block closes the file after reading.
with open("./baidu.html", "rb") as file:
    html = file.read()

# Parse the document with the html.parser parser.
bs = BeautifulSoup(html, "html.parser")

# Accessing a tag by name returns the first matching tag.
print(bs.title)
<title>use Baidu Search,You know</title>
from bs4 import BeautifulSoup

# Open the document in binary mode; the with-block closes the file after reading.
with open("./baidu.html", "rb") as file:
    html = file.read()

# Parse the document with the html.parser parser.
bs = BeautifulSoup(html, "html.parser")

# Only the first <a> tag in the document is returned.
print(bs.a)
<a class="mnav" href="http://news.baidu.com" name="tj_trnews"><!--news--></a>
from bs4 import BeautifulSoup

# Open the document in binary mode; the with-block closes the file after reading.
with open("./baidu.html", "rb") as file:
    html = file.read()

# Parse the document with the html.parser parser.
bs = BeautifulSoup(html, "html.parser")

# Prints the <head> tag together with everything inside it.
print(bs.head)
<head>
<meta content="text/html;charset=utf-8" http-equiv="content-type"/>
<meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
<meta content="always" name="referrer"/>
<link href="https://ss1.bdstatic.com/5eN1bjq8AAUVYm2zgY3K/r/ww/cache/bdorz/baidu.min.css" rel="stylesheet" type="text/css"/>
<title>use Baidu Search,You know</title>
</head>
type
from bs4 import BeautifulSoup

# Open the document in binary mode; the with-block closes the file after reading.
with open("./baidu.html", "rb") as file:
    html = file.read()

# Parse the document with the html.parser parser.
bs = BeautifulSoup(html, "html.parser")

# Show the Python type of a tag object.
print(type(bs.head))
<class 'bs4.element.Tag'>
2. NavigableString: the text content inside a tag (a string)
from bs4 import BeautifulSoup

# Open the document in binary mode; the with-block closes the file after reading.
with open("./baidu.html", "rb") as file:
    html = file.read()

# Parse the document with the html.parser parser.
bs = BeautifulSoup(html, "html.parser")

print(bs.title)                # the whole <title> tag
print(bs.title.string)         # only the text inside the tag
print(type(bs.title.string))   # the type of that text object
<title>use Baidu Search,You know</title>
use Baidu Search,You know
<class 'bs4.element.NavigableString'>
Get all the attributes in a tag (Dictionary)
from bs4 import BeautifulSoup

# Open the document in binary mode; the with-block closes the file after reading.
with open("./baidu.html", "rb") as file:
    html = file.read()

# Parse the document with the html.parser parser.
bs = BeautifulSoup(html, "html.parser")

# .attrs holds all attributes of the first <a> tag as a dictionary.
print(bs.a.attrs)
{'class': ['mnav'], 'href': 'http://news.baidu.com', 'name': 'tj_trnews'}
3. BeautifulSoup: represents the whole document
type
from bs4 import BeautifulSoup

# Open the document in binary mode; the with-block closes the file after reading.
with open("./baidu.html", "rb") as file:
    html = file.read()

# Parse the document with the html.parser parser.
bs = BeautifulSoup(html, "html.parser")

# Show the Python type of the object that represents the whole document.
print(type(bs))
<class 'bs4.BeautifulSoup'>
name
from bs4 import BeautifulSoup

# Open the document in binary mode; the with-block closes the file after reading.
with open("./baidu.html", "rb") as file:
    html = file.read()

# Parse the document with the html.parser parser.
bs = BeautifulSoup(html, "html.parser")

# Print the .name of the object that represents the whole document.
print(bs.name)
[document]
Tree document
from bs4 import BeautifulSoup

# Open the document in binary mode; the with-block closes the file after reading.
with open("./baidu.html", "rb") as file:
    html = file.read()

# Parse the document with the html.parser parser.
bs = BeautifulSoup(html, "html.parser")

# Printing the soup object outputs the entire parsed document.
print(bs)
<!DOCTYPE html>
<html>
<head>
<meta content="text/html;charset=utf-8" http-equiv="content-type"/>
<meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
<meta content="always" name="referrer"/>
<link href="https://ss1.bdstatic.com/5eN1bjq8AAUVYm2zgY3K/r/ww/cache/bdorz/baidu.min.css" rel="stylesheet" type="text/css"/>
<title>use Baidu Search,You know</title>
</head>
<body link="#0000cc">
<div id="wrapper">
<div id="head">
<div class="head_wrapper">
<div id="u1">
<a class="mnav" href="http://news.baidu.com" name="tj_trnews"><!--news--></a>
<a class="mnav" href="http://news.baidu.com" name="tj_trnews">News</a>
<a "="" class="mnav" href="https://www.hao123.com" name="tj_trhao123">hao123</a><a class="mnav" href="http://map.baidu.com" name="tj_trmap">Map</a>
<a class="mnav" href="http://v.baidu.com" name="tj_trvideo">Video</a>
<a class="mnav" href="http://tieba.baidu,com" name="tj_trtieba">Post Bar</a>
<a class="bri" href="//www.baidu.com/more/" name="tj_briicon" style="...">More products</a>
</div>
</div>
</div>
</div>
</body>
</html>
4. Comment is a special NavigableString. Its output does not include the comment markers (<!-- and -->)
from bs4 import BeautifulSoup

# Open the document in binary mode; the with-block closes the file after reading.
with open("./baidu.html", "rb") as file:
    html = file.read()

# Parse the document with the html.parser parser.
bs = BeautifulSoup(html, "html.parser")

# The first <a> tag contains only an HTML comment.
print(bs.a.string)         # the comment text, without the comment markers
print(type(bs.a.string))   # the type of that object
news
<class 'bs4.element.Comment'>
Document traversal
from bs4 import BeautifulSoup

# Open the document in binary mode and decode the bytes as UTF-8 text;
# the with-block closes the file after reading.
with open("./baidu.html", "rb") as file:
    html = file.read().decode("utf-8")

# Parse the document with the html.parser parser.
bs = BeautifulSoup(html, "html.parser")

# .contents returns the child nodes of <head> as a list.
print(bs.head.contents)
['\n', <meta content="text/html;charset=utf-8" http-equiv="content-type"/>, '\n', <meta content="IE=Edge" http-equiv="X-UA-Compatible"/>, '\n', <meta content="always" name="referrer"/>, '\n', <link href="https://ss1.bdstatic.com/5eN1bjq8AAUVYm2zgY3K/r/ww/cache/bdorz/baidu.min.css" rel="stylesheet" type="text/css"/>, '\n', <title>use Baidu Search,You know</title>, '\n']
Get the list, and you can access the relevant elements with subscripts
from bs4 import BeautifulSoup

# Open the document in binary mode and decode the bytes as UTF-8 text;
# the with-block closes the file after reading.
with open("./baidu.html", "rb") as file:
    html = file.read().decode("utf-8")

# Parse the document with the html.parser parser.
bs = BeautifulSoup(html, "html.parser")

# .contents is a list, so a single child can be picked with an index.
print(bs.head.contents[1])
<meta content="text/html;charset=utf-8" http-equiv="content-type"/>
Traverse the document tree
- 5.1 .contents: get all child nodes of Tag and return a list
# A Tag's .contents attribute returns the tag's child nodes as a list.
# (bs is the BeautifulSoup object built in the earlier examples.)
print(bs.head.contents)
# Use a list index to get one of its elements.
print(bs.head.contents[1])
- 5.2 .children: get all child nodes of Tag and return a generator
# Walk over the child nodes of <body> one at a time and print each.
# (bs is the BeautifulSoup object built in the earlier examples.)
for node in bs.body.children:
    print(node)
- 5.3 .descendants: get all descendant nodes of a Tag
- 5.4 .strings: if a Tag contains more than one string (that is, its descendant nodes contain text), use this to get them all and then iterate over them
- 5.5 .stripped_strings: used the same way as .strings, but it strips the redundant whitespace
- 5.6 .parent: get the parent node of a Tag
- 5.7 .parents: walk up through all ancestors of the element; returns a generator
- 5.8 .previous_sibling: get the previous sibling node of the current Tag. It is usually a string or whitespace, because the real result is the punctuation and line break between the current Tag and the previous Tag
- 5.9 .next_sibling: get the next sibling node of the current Tag. It is usually a string or whitespace, because the real result is the punctuation and line break between the current Tag and the next Tag
- 5.10 .previous_siblings: get all sibling nodes before the current Tag; returns a generator
- 5.11 .next_siblings: get all sibling nodes after the current Tag; returns a generator
- 5.12 .previous_element: get the object (string or tag) that was parsed immediately before this one; it may be the same as .previous_sibling, but usually it is not
- 5.13 .next_element: get the object (string or tag) that was parsed immediately after this one; it may be the same as .next_sibling, but usually it is not
- 5.14 .previous_elements: returns a generator over the content that was parsed before the current node
- 5.15 .next_elements: returns a generator over the content that was parsed after the current node
- 5.16 .has_attr(): check whether a Tag has the given attribute
Document search
(1) find_all() with a string filter: finds the tags whose name exactly matches the string
from bs4 import BeautifulSoup

# Open the document in binary mode and decode the bytes as UTF-8 text;
# the with-block closes the file after reading.
with open("./baidu.html", "rb") as file:
    html = file.read().decode("utf-8")

# Parse the document with the html.parser parser.
bs = BeautifulSoup(html, "html.parser")

# Document search: collect every <a> tag into a list.
t_list = bs.find_all("a")
print(t_list)
[<a class="mnav" href="http://news.baidu.com" name="tj_trnews"><!--news--></a>, <a class="mnav" href="http://news.baidu.com" name="tj_trnews">News</a>, <a "="" class="mnav" href="https://www.hao123.com" name="tj_trhao123">hao123</a>, <a class="mnav" href="http://map.baidu.com" name="tj_trmap">Map</a>, <a class="mnav" href="http://v.baidu.com" name="tj_trvideo">Video</a>, <a class="mnav" href="http://tieba.baidu,com" name="tj_trtieba">Post Bar</a>, <a class="bri" href="//www.baidu.com/more/" name="tj_briicon" style="...">More products</a>]
Regular expression filter: each tag name is tested with the pattern's search() method, so every tag whose name contains a match is returned
import re

from bs4 import BeautifulSoup

# Open the document in binary mode and decode the bytes as UTF-8 text;
# the with-block closes the file after reading.
with open("./baidu.html", "rb") as file:
    html = file.read().decode("utf-8")

# Parse the document with the html.parser parser.
bs = BeautifulSoup(html, "html.parser")

# Document search: returns every tag whose name contains the letter "a"
# (for this file: head, meta and a).
t_list = bs.find_all(re.compile("a"))
print(t_list)
[<head>
<meta content="text/html;charset=utf-8" http-equiv="content-type"/>
<meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
<meta content="always" name="referrer"/>
<link href="https://ss1.bdstatic.com/5eN1bjq8AAUVYm2zgY3K/r/ww/cache/bdorz/baidu.min.css" rel="stylesheet" type="text/css"/>
<title>use Baidu Search,You know</title>
</head>, <meta content="text/html;charset=utf-8" http-equiv="content-type"/>, <meta content="IE=Edge" http-equiv="X-UA-Compatible"/>, <meta content="always" name="referrer"/>, <a class="mnav" href="http://news.baidu.com" name="tj_trnews"><!--news--></a>, <a class="mnav" href="http://news.baidu.com" name="tj_trnews">News</a>, <a "="" class="mnav" href="https://www.hao123.com" name="tj_trhao123">hao123</a>, <a class="mnav" href="http://map.baidu.com" name="tj_trmap">Map</a>, <a class="mnav" href="http://v.baidu.com" name="tj_trvideo">Video</a>, <a class="mnav" href="http://tieba.baidu,com" name="tj_trtieba">Post Bar</a>, <a class="bri" href="//www.baidu.com/more/" name="tj_briicon" style="...">More products</a>]
Function filter: pass in a function and find_all() searches according to that function's condition (for reference only)
from bs4 import BeautifulSoup

# Open the document in binary mode and decode the bytes as UTF-8 text;
# the with-block closes the file after reading.
with open("./baidu.html", "rb") as file:
    html = file.read().decode("utf-8")

# Parse the document with the html.parser parser.
bs = BeautifulSoup(html, "html.parser")


def name_is_exists(tag):
    """Return True when the tag has a ``name`` attribute."""
    return tag.has_attr("name")


# Document search: the function is used as the filter for find_all().
t_list = bs.find_all(name_is_exists)
print(t_list)
[<meta content="always" name="referrer"/>, <a class="mnav" href="http://news.baidu.com" name="tj_trnews"><!--news--></a>, <a class="mnav" href="http://news.baidu.com" name="tj_trnews">News</a>, <a "="" class="mnav" href="https://www.hao123.com" name="tj_trhao123">hao123</a>, <a class="mnav" href="http://map.baidu.com" name="tj_trmap">Map</a>, <a class="mnav" href="http://v.baidu.com" name="tj_trvideo">Video</a>, <a class="mnav" href="http://tieba.baidu,com" name="tj_trtieba">Post Bar</a>, <a class="bri" href="//www.baidu.com/more/" name="tj_briicon" style="...">More products</a>]
How to print a list
from bs4 import BeautifulSoup

# Open the document in binary mode and decode the bytes as UTF-8 text;
# the with-block closes the file after reading.
with open("./baidu.html", "rb") as file:
    html = file.read().decode("utf-8")

# Parse the document with the html.parser parser.
bs = BeautifulSoup(html, "html.parser")


def name_is_exists(tag):
    """Return True when the tag has a ``name`` attribute."""
    return tag.has_attr("name")


# Document search: the function is used as the filter for find_all().
t_list = bs.find_all(name_is_exists)

# Print one result per line instead of printing the whole list at once.
for item in t_list:
    print(item)
<meta content="always" name="referrer"/>
<a class="mnav" href="http://news.baidu.com" name="tj_trnews"><!--news--></a>
<a class="mnav" href="http://news.baidu.com" name="tj_trnews">News</a>
<a "="" class="mnav" href="https://www.hao123.com" name="tj_trhao123">hao123</a>
<a class="mnav" href="http://map.baidu.com" name="tj_trmap">Map</a>
<a class="mnav" href="http://v.baidu.com" name="tj_trvideo">Video</a>
<a class="mnav" href="http://tieba.baidu,com" name="tj_trtieba">Post Bar</a>
<a class="bri" href="//www.baidu.com/more/" name="tj_briicon" style="...">More products</a>
(2) kwargs parameter
I want to find the content of id="head"
from bs4 import BeautifulSoup

# Open the document in binary mode and decode the bytes as UTF-8 text;
# the with-block closes the file after reading.
with open("./baidu.html", "rb") as file:
    html = file.read().decode("utf-8")

# Parse the document with the html.parser parser.
bs = BeautifulSoup(html, "html.parser")

# Document search: keyword arguments filter on attributes, here id="head".
t_list = bs.find_all(id="head")
for item in t_list:
    print(item)
<div id="head">
<div class="head_wrapper">
<div id="u1">
<a class="mnav" href="http://news.baidu.com" name="tj_trnews"><!--news--></a>
<a class="mnav" href="http://news.baidu.com" name="tj_trnews">News</a>
<a "="" class="mnav" href="https://www.hao123.com" name="tj_trhao123">hao123</a><a class="mnav" href="http://map.baidu.com" name="tj_trmap">Map</a>
<a class="mnav" href="http://v.baidu.com" name="tj_trvideo">Video</a>
<a class="mnav" href="http://tieba.baidu,com" name="tj_trtieba">Post Bar</a>
<a class="bri" href="//www.baidu.com/more/" name="tj_briicon" style="...">More products</a>
</div>
</div>
</div>
I want to find every tag that has a class attribute
from bs4 import BeautifulSoup

# Open the document in binary mode and decode the bytes as UTF-8 text;
# the with-block closes the file after reading.
with open("./baidu.html", "rb") as file:
    html = file.read().decode("utf-8")

# Parse the document with the html.parser parser.
bs = BeautifulSoup(html, "html.parser")

# Document search: class_=True selects tags that have a class attribute.
# The trailing underscore is needed because "class" is a Python keyword.
t_list = bs.find_all(class_=True)
for item in t_list:
    print(item)
<div class="head_wrapper">
<div id="u1">
<a class="mnav" href="http://news.baidu.com" name="tj_trnews"><!--news--></a>
<a class="mnav" href="http://news.baidu.com" name="tj_trnews">News</a>
<a "="" class="mnav" href="https://www.hao123.com" name="tj_trhao123">hao123</a><a class="mnav" href="http://map.baidu.com" name="tj_trmap">Map</a>
<a class="mnav" href="http://v.baidu.com" name="tj_trvideo">Video</a>
<a class="mnav" href="http://tieba.baidu,com" name="tj_trtieba">Post Bar</a>
<a class="bri" href="//www.baidu.com/more/" name="tj_briicon" style="...">More products</a>
</div>
</div>
<a class="mnav" href="http://news.baidu.com" name="tj_trnews"><!--news--></a>
<a class="mnav" href="http://news.baidu.com" name="tj_trnews">News</a>
<a "="" class="mnav" href="https://www.hao123.com" name="tj_trhao123">hao123</a>
<a class="mnav" href="http://map.baidu.com" name="tj_trmap">Map</a>
<a class="mnav" href="http://v.baidu.com" name="tj_trvideo">Video</a>
<a class="mnav" href="http://tieba.baidu,com" name="tj_trtieba">Post Bar</a>
<a class="bri" href="//www.baidu.com/more/" name="tj_briicon" style="...">More products</a>
I want to find the tags whose href="http://news.baidu.com"
from bs4 import BeautifulSoup

# Open the document in binary mode and decode the bytes as UTF-8 text;
# the with-block closes the file after reading.
with open("./baidu.html", "rb") as file:
    html = file.read().decode("utf-8")

# Parse the document with the html.parser parser.
bs = BeautifulSoup(html, "html.parser")

# Document search: select tags by the value of their href attribute.
t_list = bs.find_all(href="http://news.baidu.com")
for item in t_list:
    print(item)
<a class="mnav" href="http://news.baidu.com" name="tj_trnews"><!--news--></a>
<a class="mnav" href="http://news.baidu.com" name="tj_trnews">News</a>
(3) text parameter (named string since Beautiful Soup 4.4.0; text is the older name for the same argument)
from bs4 import BeautifulSoup

# Open the document in binary mode and decode the bytes as UTF-8 text;
# the with-block closes the file after reading.
with open("./baidu.html", "rb") as file:
    html = file.read().decode("utf-8")

# Parse the document with the html.parser parser.
bs = BeautifulSoup(html, "html.parser")

# Document search: find the strings that are exactly "hao123".
# string= is the current name of the argument formerly called text=.
t_list = bs.find_all(string="hao123")
for item in t_list:
    print(item)
hao123
from bs4 import BeautifulSoup

# Open the document in binary mode and decode the bytes as UTF-8 text;
# the with-block closes the file after reading.
with open("./baidu.html", "rb") as file:
    html = file.read().decode("utf-8")

# Parse the document with the html.parser parser.
bs = BeautifulSoup(html, "html.parser")

# Document search: a list matches strings equal to any of its items.
# string= is the current name of the argument formerly called text=.
t_list = bs.find_all(string=["hao123", "Map", "Post Bar"])
for item in t_list:
    print(item)
hao123
Map
Post Bar
Apply regular expressions to find content that contains specific text (strings in tags)
import re

from bs4 import BeautifulSoup

# Open the document in binary mode and decode the bytes as UTF-8 text;
# the with-block closes the file after reading.
with open("./baidu.html", "rb") as file:
    html = file.read().decode("utf-8")

# Parse the document with the html.parser parser.
bs = BeautifulSoup(html, "html.parser")

# Apply a regular expression to find the strings inside tags that contain
# specific text. \d stands for a digit; the raw string keeps the backslash
# from being read as a Python escape sequence.
# string= is the current name of the argument formerly called text=.
t_list = bs.find_all(string=re.compile(r"\d"))
for item in t_list:
    print(item)
hao123
Limit the number of lookups
from bs4 import BeautifulSoup

# Open the document in binary mode and decode the bytes as UTF-8 text;
# the with-block closes the file after reading.
with open("./baidu.html", "rb") as file:
    html = file.read().decode("utf-8")

# Parse the document with the html.parser parser.
bs = BeautifulSoup(html, "html.parser")

# limit=3 caps the number of <a> tags returned at three.
t_list = bs.find_all("a", limit=3)
for item in t_list:
    print(item)
<a class="mnav" href="http://news.baidu.com" name="tj_trnews"><!--news--></a>
<a class="mnav" href="http://news.baidu.com" name="tj_trnews">News</a>
<a "="" class="mnav" href="https://www.hao123.com" name="tj_trhao123">hao123</a>
CSS selectors
Find by tag name
from bs4 import BeautifulSoup

# Open the document in binary mode and decode the bytes as UTF-8 text;
# the with-block closes the file after reading.
with open("./baidu.html", "rb") as file:
    html = file.read().decode("utf-8")

# Parse the document with the html.parser parser.
bs = BeautifulSoup(html, "html.parser")

# Find by tag name.
t_list = bs.select("title")
for item in t_list:
    print(item)
<title>use Baidu Search,You know</title>
Find by class name
from bs4 import BeautifulSoup

# Open the document in binary mode and decode the bytes as UTF-8 text;
# the with-block closes the file after reading.
with open("./baidu.html", "rb") as file:
    html = file.read().decode("utf-8")

# Parse the document with the html.parser parser.
bs = BeautifulSoup(html, "html.parser")

# Find by class name: a leading dot selects on the class attribute.
t_list = bs.select(".mnav")
for item in t_list:
    print(item)
<a class="mnav" href="http://news.baidu.com" name="tj_trnews"><!--news--></a>
<a class="mnav" href="http://news.baidu.com" name="tj_trnews">News</a>
<a "="" class="mnav" href="https://www.hao123.com" name="tj_trhao123">hao123</a>
<a class="mnav" href="http://map.baidu.com" name="tj_trmap">Map</a>
<a class="mnav" href="http://v.baidu.com" name="tj_trvideo">Video</a>
<a class="mnav" href="http://tieba.baidu,com" name="tj_trtieba">Post Bar</a>
Find by id
from bs4 import BeautifulSoup

# Open the document in binary mode and decode the bytes as UTF-8 text;
# the with-block closes the file after reading.
with open("./baidu.html", "rb") as file:
    html = file.read().decode("utf-8")

# Parse the document with the html.parser parser.
bs = BeautifulSoup(html, "html.parser")

# Find by id: a leading "#" selects on the id attribute.
t_list = bs.select("#u1")
for item in t_list:
    print(item)
<div id="u1">
<a class="mnav" href="http://news.baidu.com" name="tj_trnews"><!--news--></a>
<a class="mnav" href="http://news.baidu.com" name="tj_trnews">News</a>
<a "="" class="mnav" href="https://www.hao123.com" name="tj_trhao123">hao123</a><a class="mnav" href="http://map.baidu.com" name="tj_trmap">Map</a>
<a class="mnav" href="http://v.baidu.com" name="tj_trvideo">Video</a>
<a class="mnav" href="http://tieba.baidu,com" name="tj_trtieba">Post Bar</a>
<a class="bri" href="//www.baidu.com/more/" name="tj_briicon" style="...">More products</a>
</div>
Find by attributes
from bs4 import BeautifulSoup

# Open the document in binary mode and decode the bytes as UTF-8 text;
# the with-block closes the file after reading.
with open("./baidu.html", "rb") as file:
    html = file.read().decode("utf-8")

# Parse the document with the html.parser parser.
bs = BeautifulSoup(html, "html.parser")

# Find by attribute: <a> tags whose class attribute equals "bri".
t_list = bs.select("a[class='bri']")
for item in t_list:
    print(item)
<a class="bri" href="//www.baidu.com/more/" name="tj_briicon" style="...">More products</a>
Find by child tag
from bs4 import BeautifulSoup

# Open the document in binary mode and decode the bytes as UTF-8 text;
# the with-block closes the file after reading.
with open("./baidu.html", "rb") as file:
    html = file.read().decode("utf-8")

# Parse the document with the html.parser parser.
bs = BeautifulSoup(html, "html.parser")

# Find by child tag: <title> tags that are direct children of <head>.
t_list = bs.select("head>title")
for item in t_list:
    print(item)
<title>use Baidu Search,You know</title>
Find by sibling tag
from bs4 import BeautifulSoup

# Open the document in binary mode and decode the bytes as UTF-8 text;
# the with-block closes the file after reading.
with open("./baidu.html", "rb") as file:
    html = file.read().decode("utf-8")

# Parse the document with the html.parser parser.
bs = BeautifulSoup(html, "html.parser")

# Find by sibling tag: .bri elements that follow a .mnav sibling.
t_list = bs.select(".mnav ~ .bri")

# Print only the text of the first match.
print(t_list[0].get_text())
More products