2018-05-28 17:31:32 +08:00
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Demonstrate common BeautifulSoup navigation and search calls.

    Parses a small inline HTML document with the ``lxml`` parser and prints
    the result of each lookup: attribute-style navigation, child/descendant
    traversal, ``find_all`` with regex / attribute / callable filters, and
    CSS selection with ``select``.
    """
    html = """
    <!DOCTYPE html>
    <html lang="en">
        <head>
            <meta charset="UTF-8">
            <title>首页</title>
        </head>
        <body>
            <h1>Hello, world!</h1>
            <p>这是一个<em>神奇</em>的网站!</p>
            <hr>
            <div>
                <h2>这是一个例子程序</h2>
                <p>静夜思</p>
                <p class="foo">床前明月光</p>
                <p id="bar">疑似地上霜</p>
                <p class="foo">举头望明月</p>
                <div><a href="http://www.baidu.com"><p>低头思故乡</p></a></div>
            </div>
            <a class="foo" href="http://www.qq.com">腾讯网</a>
            <img src="./img/pretty-girl.png" alt="美女">
            <img src="./img/hellokitty.png" alt="凯蒂猫">
            <img src="/static/img/pretty-girl.png" alt="美女">
            <table>
                <tr>
                    <th>姓名</th>
                    <th>上场时间</th>
                    <th>得分</th>
                    <th>篮板</th>
                    <th>助攻</th>
                </tr>
            </table>
        </body>
    </html>
    """
    soup = BeautifulSoup(html, 'lxml')

    # --- Attribute-style navigation -------------------------------------
    # JavaScript - document.title
    print(soup.title)
    # JavaScript - document.body.h1
    print(soup.body.h1)
    # Attribute access returns the first matching tag.
    print(soup.p)
    print(soup.body.p.text)
    print(soup.body.p.contents)
    for p_child in soup.body.p.children:
        print(p_child)

    # --- Counting direct children vs. all descendants --------------------
    print(len(list(soup.body.children)))
    print(len(list(soup.body.descendants)))

    # --- Searching by tag name with regular expressions ------------------
    # find_all is the current spelling; findAll is a deprecated alias.
    print(soup.find_all(re.compile(r'^h[1-6]')))
    # A plain string is matched as an exact tag name, so the pattern has to
    # be compiled for '^h' to act as a regular expression.
    print(soup.body.find_all(re.compile(r'^h')))
    print(soup.body.div.find_all(re.compile(r'^h')))
    print(soup.find_all(re.compile(r'r$')))

    # --- Searching by attribute value ------------------------------------
    # The dot before "png" is escaped so it only matches a literal '.'.
    print(soup.find_all('img', {'src': re.compile(r'\./img/\w+\.png')}))

    # --- Searching with a callable filter (inline and named) -------------
    print(soup.find_all(lambda x: len(x.attrs) == 2))
    print(soup.find_all(foo))

    # --- Searching by tag name plus class, then by CSS selector ----------
    print(soup.find_all('p', {'class': 'foo'}))
    for elem in soup.select('a[href]'):
        print(elem.attrs['href'])
|
|
|
|
|
|
|
|
|
|
|
2018-05-29 16:58:46 +08:00
|
|
|
|
def foo(elem):
    """Return True when *elem* has exactly two attributes.

    Used as a filter callable for ``find_all``. *elem* only needs to expose
    an ``attrs`` collection that supports ``len()``.
    """
    return len(elem.attrs) == 2
|
|
|
|
|
|
|
|
|
|
|
2018-05-28 17:31:32 +08:00
|
|
|
|
# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|