from urllib.error import URLError
from urllib.request import urlopen

import re
import pymysql
import ssl

from pymysql import Error


# Decode a page with the given charsets (not every site sets its charset to utf-8)
def decode_page(page_bytes, charsets=('utf-8',)):
    page_html = None
    for charset in charsets:
        try:
            page_html = page_bytes.decode(charset)
            break  # stop at the first charset that decodes successfully
        except UnicodeDecodeError:
            pass
            # logging.error('Decode:', error)
    return page_html
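
# A quick illustration of the charset fallback (hypothetical bytes, not used by the crawler below):
#   decode_page('比赛'.encode('gbk'), charsets=('utf-8', 'gbk'))  # utf-8 fails, gbk yields '比赛'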


# Fetch the HTML of a page (retry a given number of times via recursion)
def get_page_html(seed_url, *, retry_times=3, charsets=('utf-8',)):
    page_html = None
    try:
        page_html = decode_page(urlopen(seed_url).read(), charsets)
    except URLError:
        # logging.error('URL:', error)
        if retry_times > 0:
            return get_page_html(seed_url, retry_times=retry_times - 1,
                                 charsets=charsets)
    return page_html


# Extract the needed parts from a page (usually links, specified via a regular expression)
def get_matched_parts(page_html, pattern_str, pattern_ignore_case=re.I):
    pattern_regex = re.compile(pattern_str, pattern_ignore_case)
    return pattern_regex.findall(page_html) if page_html else []
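
# A quick sanity check of the helper (hypothetical snippet, illustration only):
#   get_matched_parts('<h1>Kobe<span>24</span></h1>', r'<h1>(.*)<span')  # -> ['Kobe']
#   get_matched_parts(None, r'<h1>(.*)<span')                            # -> []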


# Start the crawler and persist the extracted data
def start_crawl(seed_url, match_pattern, *, max_depth=-1):
    conn = pymysql.connect(host='localhost', port=3306,
                           database='crawler', user='root',
                           password='123456', charset='utf8')
    try:
        with conn.cursor() as cursor:
            url_list = [seed_url]
            # maps each known URL to the depth at which it was discovered
            visited_url_list = {seed_url: 0}
            while url_list:
                current_url = url_list.pop(0)
                depth = visited_url_list[current_url]
                if depth != max_depth:
                    page_html = get_page_html(current_url, charsets=('utf-8', 'gbk', 'gb2312'))
                    links_list = get_matched_parts(page_html, match_pattern)
                    param_list = []
                    for link in links_list:
                        if link not in visited_url_list:
                            visited_url_list[link] = depth + 1
                            # each discovered link is fetched directly here rather than queued into url_list
                            page_html = get_page_html(link, charsets=('utf-8', 'gbk', 'gb2312'))
                            headings = get_matched_parts(page_html, r'<h1>(.*)<span')
                            if headings:
                                param_list.append((headings[0], link))
                    cursor.executemany('insert into tb_result values (default, %s, %s)',
                                       param_list)
                    conn.commit()
    except Error:
        pass
        # logging.error('SQL:', error)
    finally:
        conn.close()
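
# Note: the INSERT above assumes an existing tb_result table with three columns
# (auto-increment id, title, url). A minimal sketch of such a table -- hypothetical
# DDL, column names and lengths are assumptions, adjust to the actual schema:
#
#   create table tb_result
#   (
#       id    int unsigned auto_increment primary key,
#       title varchar(200) not null,
#       url   varchar(1000) not null
#   ) engine=innodb default charset=utf8;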


def main():
    # disable HTTPS certificate verification globally so urlopen also works on sites with certificate issues
    ssl._create_default_https_context = ssl._create_unverified_context
    start_crawl('http://sports.sohu.com/nba_a.shtml',
                r'<a[^>]+test=a\s[^>]*href=["\'](.*?)["\']',
                max_depth=2)


if __name__ == '__main__':
    main()