diff --git a/Day66-75/Scrapy的应用01.md b/Day66-75/Scrapy爬虫框架的应用.md
similarity index 73%
rename from Day66-75/Scrapy的应用01.md
rename to Day66-75/Scrapy爬虫框架的应用.md
index 644d460..d566d63 100644
--- a/Day66-75/Scrapy的应用01.md
+++ b/Day66-75/Scrapy爬虫框架的应用.md
@@ -1,4 +1,4 @@
-## Scrapy的应用(01)
+## Scrapy爬虫框架的应用
 
 ### Scrapy概述
 
@@ -101,6 +101,11 @@ $
 2. 在spiders文件夹中编写自己的爬虫。
 
+   ```Shell
+
+   (venv) $ scrapy genspider movie movie.douban.com --template=crawl
+   ```
+
    ```Python
    # -*- coding: utf-8 -*-
@@ -287,5 +292,77 @@
 HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
 ```
 
-
+### Supplementary notes
+
+#### XPath syntax
+
+1. XPath path expressions: XPath uses path expressions to select nodes or node sets in an XML document.
+
+2. XPath node types: element, attribute, text, namespace, processing instruction, comment, and root node.
+
+3. XPath syntax. (Note: the examples below come from the [XPath tutorial](http://www.runoob.com/xpath/xpath-syntax.html) on the [菜鸟教程](http://www.runoob.com/) site.)
+
+   Sample XML file.
+
+   ```XML
+   <?xml version="1.0" encoding="UTF-8"?>
+   <bookstore>
+
+     <book>
+       <title lang="eng">Harry Potter</title>
+       <price>29.99</price>
+     </book>
+
+     <book>
+       <title lang="eng">Learning XML</title>
+       <price>39.95</price>
+     </book>
+
+   </bookstore>
+   ```
+
+   XPath syntax.
+
+   | Path expression | Result |
+   | --------------- | ------------------------------------------------------------ |
+   | bookstore | Selects all child nodes of the bookstore element. |
+   | /bookstore | Selects the root element bookstore. Note: if a path starts with a slash ( / ), it always represents an absolute path to an element! |
+   | bookstore/book | Selects all book elements that are children of bookstore. |
+   | //book | Selects all book elements, no matter where they are in the document. |
+   | bookstore//book | Selects all book elements that are descendants of the bookstore element, no matter where they sit below bookstore. |
+   | //@lang | Selects all attributes named lang. |
+
+   XPath predicates.
+
+   | Path expression | Result |
+   | ---------------------------------- | ------------------------------------------------------------ |
+   | /bookstore/book[1] | Selects the first book element that is a child of bookstore. |
+   | /bookstore/book[last()] | Selects the last book element that is a child of bookstore. |
+   | /bookstore/book[last()-1] | Selects the last but one book element that is a child of bookstore. |
+   | /bookstore/book[position()<3] | Selects the first two book elements that are children of bookstore. |
+   | //title[@lang] | Selects all title elements that have an attribute named lang. |
+   | //title[@lang='eng'] | Selects all title elements whose lang attribute has the value eng. |
+   | /bookstore/book[price>35.00] | Selects all book elements of bookstore whose price element has a value greater than 35.00. |
+   | /bookstore/book[price>35.00]/title | Selects all title elements of the book elements of bookstore whose price element has a value greater than 35.00. |
+
+   Wildcards.
+
+   | Path expression | Result |
+   | ------------ | --------------------------------- |
+   | /bookstore/* | Selects all child elements of the bookstore element. |
+   | //* | Selects all elements in the document. |
+   | //title[@*] | Selects all title elements that have at least one attribute. |
+
+   Selecting several paths.
+
+   | Path expression | Result |
+   | -------------------------------- | ------------------------------------------------------------ |
+   | //book/title \| //book/price | Selects all title and price elements of all book elements. |
+   | //title \| //price | Selects all title and price elements in the document. |
+   | /bookstore/book/title \| //price | Selects all title elements of the book elements of bookstore, and all price elements in the document. |
+
+#### Viewing an element's XPath in the Chrome browser
+
+![](./res/douban-xpath.png)
diff --git a/Day66-75/Scrapy的应用03.md b/Day66-75/Scrapy的应用03.md
deleted file mode 100644
index e69de29..0000000
diff --git a/Day66-75/Scrapy的应用02.md b/Day66-75/code/douban/douban/__init__.py
similarity index 100%
rename from Day66-75/Scrapy的应用02.md
rename to Day66-75/code/douban/douban/__init__.py
diff --git a/Day66-75/code/douban/douban/items.py b/Day66-75/code/douban/douban/items.py
new file mode 100644
index 0000000..e7ce38c
--- /dev/null
+++ b/Day66-75/code/douban/douban/items.py
@@ -0,0 +1,18 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class DoubanItem(scrapy.Item):
+
+    name = scrapy.Field()
+    year = scrapy.Field()
+    score = scrapy.Field()
+    director = scrapy.Field()
+    classification = scrapy.Field()
+    actor = 
scrapy.Field() diff --git a/Day66-75/code/douban/douban/middlewares.py b/Day66-75/code/douban/douban/middlewares.py new file mode 100644 index 0000000..55ec8b7 --- /dev/null +++ b/Day66-75/code/douban/douban/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class DoubanSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Response, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class DoubanDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. 
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
diff --git a/Day66-75/code/douban/douban/pipelines.py b/Day66-75/code/douban/douban/pipelines.py
new file mode 100644
index 0000000..209e395
--- /dev/null
+++ b/Day66-75/code/douban/douban/pipelines.py
@@ -0,0 +1,41 @@
+# -*- coding: utf-8 -*-

+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import pymongo
+
+from scrapy.exceptions import DropItem
+
+
+class DoubanPipeline(object):
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Build the pipeline from the crawler so the project settings are available
+        return cls(crawler.settings)
+
+    def __init__(self, settings):
+        client = pymongo.MongoClient(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
+        self.db_name = settings['MONGODB_DB']
+        self.collection = client[self.db_name][settings['MONGODB_COLLECTION']]
+
+    def process_item(self, item, spider):
+        # Drop items that have an empty field
+        for field, value in item.items():
+            if not value:
+                raise DropItem("Missing %s of movie from %s" % (field, item.get('name')))
+        # Insert the movie into the MongoDB collection
+        new_movie = {
+            'name': item['name'][0],
+            'year': item['year'][0],
+            'score': item['score'],
+            'director': item['director'],
+            'classification': item['classification'],
+            'actor': item['actor']
+        }
+        self.collection.insert_one(new_movie)
+        spider.logger.debug('Movie item written to MongoDB database %s', self.db_name)
+        return item
+
diff --git a/Day66-75/code/douban/douban/settings.py b/Day66-75/code/douban/douban/settings.py
new file mode 100644
index 0000000..98ff629
--- /dev/null
+++ b/Day66-75/code/douban/douban/settings.py
@@ -0,0 +1,98 @@
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for douban project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. 
You can find more settings consulting the documentation: +# +# https://doc.scrapy.org/en/latest/topics/settings.html +# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'douban' + +SPIDER_MODULES = ['douban.spiders'] +NEWSPIDER_MODULE = 'douban.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +DOWNLOAD_DELAY = 3 +RANDOMIZE_DOWNLOAD_DELAY = True +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +COOKIES_ENABLED = True + +MONGODB_SERVER = '120.77.222.217' +MONGODB_PORT = 27017 +MONGODB_DB = 'douban' +MONGODB_COLLECTION = 'movie' + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'douban.middlewares.DoubanSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'douban.middlewares.DoubanDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://doc.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'douban.pipelines.DoubanPipeline': 400, +} + +LOG_LEVEL = 'DEBUG' + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/Day66-75/code/douban/douban/spiders/__init__.py b/Day66-75/code/douban/douban/spiders/__init__.py new file mode 100644 index 0000000..ebd689a --- /dev/null +++ b/Day66-75/code/douban/douban/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the 
documentation for information on how to create and manage +# your spiders. diff --git a/Day66-75/code/douban/douban/spiders/movie.py b/Day66-75/code/douban/douban/spiders/movie.py new file mode 100644 index 0000000..4db744d --- /dev/null +++ b/Day66-75/code/douban/douban/spiders/movie.py @@ -0,0 +1,32 @@ +# -*- coding: utf-8 -*- +import scrapy +from scrapy.selector import Selector +from scrapy.linkextractors import LinkExtractor +from scrapy.spiders import CrawlSpider, Rule + +from douban.items import DoubanItem + + +class MovieSpider(CrawlSpider): + name = 'movie' + allowed_domains = ['movie.douban.com'] + start_urls = ['https://movie.douban.com/top250'] + rules = ( + Rule(LinkExtractor(allow=(r'https://movie.douban.com/top250\?start=\d+.*'))), + Rule(LinkExtractor(allow=(r'https://movie.douban.com/subject/\d+')), callback='parse_item'), + ) + + def parse_item(self, response): + sel = Selector(response) + item = DoubanItem() + item['name']=sel.xpath('//*[@id="content"]/h1/span[1]/text()').extract() + item['year']=sel.xpath('//*[@id="content"]/h1/span[2]/text()').re(r'\((\d+)\)') + item['score']=sel.xpath('//*[@id="interest_sectl"]/div/p[1]/strong/text()').extract() + item['director']=sel.xpath('//*[@id="info"]/span[1]/a/text()').extract() + item['classification']= sel.xpath('//span[@property="v:genre"]/text()').extract() + item['actor']= sel.xpath('//*[@id="info"]/span[3]/a[1]/text()').extract() + #i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract() + #i['name'] = response.xpath('//div[@id="name"]').extract() + #i['description'] = response.xpath('//div[@id="description"]').extract() + return item + diff --git a/Day66-75/code/douban/scrapy.cfg b/Day66-75/code/douban/scrapy.cfg new file mode 100644 index 0000000..f4cf5a2 --- /dev/null +++ b/Day66-75/code/douban/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = douban.settings + +[deploy] +#url = http://localhost:6800/ +project = douban diff --git a/Day66-75/code/example07.py b/Day66-75/code/example07.py new file mode 100644 index 0000000..ee7d9cc --- /dev/null +++ b/Day66-75/code/example07.py @@ -0,0 +1,37 @@ +import pymongo + + +# BSON - Binary JSON - dict +def main(): + # client = pymongo.MongoClient('mongodb://120.77.222.217:27017') + client = pymongo.MongoClient(host='120.77.222.217', port=27017) + db = client.zhihu + pages_cache = db.webpages + """ + pages_cache.insert_many([ + {'_id': 1, 'url': 'http://www.baidu.com', 'content': 'shit'}, + {'_id': 2, 'url': 'http://www.qq.com', 'content': 'another shit'}, + {'_id': 3, 'url': 'http://www.qfedu.com', 'content': 'biggest shit'} + ]) + + print(pages_cache.update({'_id': 5}, {'$set': {'content': 'hello, world!'}}, upsert=True)) + # page_id = pages_cache.insert_one({'url': 'http://www.baidu.com', 'content': ''}) + # print(page_id.inserted_id) + # print(pages_cache.remove({'url': 'http://www.baidu.com'})) + print(pages_cache.find().count()) + for doc in pages_cache.find().sort('_id'): + print(doc) + """ + pages_cache.insert_one({ + 'url': 'http://www.baidu.com', + 'content': 'bull shit!', + 'owner': { + 'name': 'Lee Yanhong', + 'age': 50, + 'idcard': '110220196804091203' + } + }) + + +if __name__ == '__main__': + main() diff --git a/Day66-75/code/example08.py b/Day66-75/code/example08.py new file mode 100644 index 0000000..d8a665d --- /dev/null +++ b/Day66-75/code/example08.py @@ -0,0 +1,28 @@ +import 
requests +from bs4 import BeautifulSoup + + +def main(): + resp = requests.get('https://github.com/login') + if resp.status_code != 200: + return + cookies = resp.cookies.get_dict() + print(cookies) + soup = BeautifulSoup(resp.text, 'lxml') + utf8_value = \ + soup.select_one('form input[name=utf8]').attrs['value'] + authenticity_token_value = \ + soup.select_one('form input[name=authenticity_token]').attrs['value'] + data = { + 'utf8': utf8_value, + 'authenticity_token': authenticity_token_value, + 'login': 'jackfrued@gmail.com', + 'password': 'yourpassword' + } + resp = requests.post('https://github.com/session', + data=data, cookies=cookies) + print(resp.text) + + +if __name__ == '__main__': + main() diff --git a/Day66-75/code/example09.py b/Day66-75/code/example09.py new file mode 100644 index 0000000..a852c46 --- /dev/null +++ b/Day66-75/code/example09.py @@ -0,0 +1,16 @@ +import robobrowser + + +def main(): + b = robobrowser.RoboBrowser(parser='lxml') + b.open('https://github.com/login') + f = b.get_form(action='/session') + f['login'].value = 'jackfrued@gmail.com' + f['password'].value = 'yourpassword' + b.submit_form(f) + for a_tag in b.select('a[href]'): + print(a_tag.attrs['href']) + + +if __name__ == '__main__': + main() diff --git a/Day66-75/code/example10.py b/Day66-75/code/example10.py index 3204c72..9e76321 100644 --- a/Day66-75/code/example10.py +++ b/Day66-75/code/example10.py @@ -1,33 +1,12 @@ -import requests -from bs4 import BeautifulSoup -# selenium是一个自动化测试工具 -# 通过它可以模拟浏览器的行为来访问Web页面 -from selenium import webdriver +import robobrowser def main(): - # 先下载chromedriver并且将可执行程序放到PATH环境变量路径下 - # 创建谷歌Chrome浏览器内核 - driver = webdriver.Chrome() - # 通过浏览器内核加载页面(可以加载动态生成的内容) - driver.get('https://www.taobao.com/markets/mm/mm2017') - # driver.page_source获得的页面包含了JavaScript动态创建的内容 - soup = BeautifulSoup(driver.page_source, 'lxml') - all_images = soup.select('img[src]') - for image in all_images: - url = image.get('src') - try: - if not str(url).startswith('http'): - url = 'http:' + url - filename = url[url.rfind('/') + 1:] - print(filename) - resp = requests.get(url) - with open('c:/images/' + filename, 'wb') as f: - f.write(resp.content) - except OSError: - print(filename + '下载失败!') - print('图片下载完成!') + b = robobrowser.RoboBrowser(parser='lxml') + b.open('https://v.taobao.com/v/content/live?catetype=704&from=taonvlang') + for img_tag in b.select('img[src]'): + print(img_tag.attrs['src']) if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/Day66-75/code/example11.py b/Day66-75/code/example11.py new file mode 100644 index 0000000..b3d783f --- /dev/null +++ b/Day66-75/code/example11.py @@ -0,0 +1,18 @@ +from bs4 import BeautifulSoup +from selenium import webdriver +from selenium.webdriver.common.keys import Keys + + +def main(): + driver = webdriver.Chrome() + driver.get('https://v.taobao.com/v/content/live?catetype=704&from=taonvlang') + elem = driver.find_element_by_css_selector('input[placeholder=输入关键词搜索]') + elem.send_keys('运动') + elem.send_keys(Keys.ENTER) + soup = BeautifulSoup(driver.page_source, 'lxml') + for img_tag in soup.body.select('img[src]'): + print(img_tag.attrs['src']) + + +if __name__ == '__main__': + main() diff --git a/Day66-75/code/example12.py b/Day66-75/code/example12.py new file mode 100644 index 0000000..12024b8 --- /dev/null +++ b/Day66-75/code/example12.py @@ -0,0 +1,29 @@ +import base64 + +from PIL import Image, ImageFilter +from pytesseract import image_to_string + +import requests +from io import BytesIO + + +def main(): 
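+    # The three steps below: blur guido.jpg with a Gaussian filter, binarize tesseract.png before OCR, then download and OCR a CAPTCHA image from Aliyun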
+ guido_img = Image.open(open('guido.jpg', 'rb')) + guido2_img = guido_img.filter(ImageFilter.GaussianBlur) + guido2_img.save(open('guido2.jpg', 'wb')) + + img1 = Image.open(open('tesseract.png', 'rb')) + img2 = img1.point(lambda x: 0 if x < 128 else 255) + img2.save(open('tesseract2.png', 'wb')) + + print(image_to_string(img2)) + + resp = requests.get('https://pin2.aliyun.com/get_img?type=150_40&identity=mailsso.mxhichina.com&sessionid=k0xHyBxU3K3dGXb59mP9cdeTXxL9gLHSTKhRZCryHxpOoyk4lAVuJhgw==') + img3 = Image.open(BytesIO(resp.content)) + img3.save('captcha.jpg') + print(image_to_string(img3)) + print(base64.b64encode(resp.content)) + + +if __name__ == '__main__': + main() diff --git a/Day66-75/code/guido.jpg b/Day66-75/code/guido.jpg new file mode 100644 index 0000000..78f71ae Binary files /dev/null and b/Day66-75/code/guido.jpg differ diff --git a/Day66-75/code/tesseract.png b/Day66-75/code/tesseract.png new file mode 100644 index 0000000..315cf94 Binary files /dev/null and b/Day66-75/code/tesseract.png differ diff --git a/Day66-75/res/douban-xpath.png b/Day66-75/res/douban-xpath.png new file mode 100644 index 0000000..3ecb737 Binary files /dev/null and b/Day66-75/res/douban-xpath.png differ
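The XPath expressions listed in the supplementary notes above can be checked with a few lines of Python before they go into a spider. The sketch below is only illustrative: it runs a handful of the table's expressions against the sample bookstore document with lxml (the same parser the example scripts already use for BeautifulSoup and RoboBrowser); the `XML_DOC` name and the chosen expressions are just examples.

```Python
from lxml import etree

XML_DOC = """<bookstore>
  <book>
    <title lang="eng">Harry Potter</title>
    <price>29.99</price>
  </book>
  <book>
    <title lang="eng">Learning XML</title>
    <price>39.95</price>
  </book>
</bookstore>"""


def main():
    root = etree.fromstring(XML_DOC)
    # //book/title - all book titles, wherever they appear in the document
    print(root.xpath('//book/title/text()'))
    # /bookstore/book[last()] - the last book that is a child of bookstore
    print(root.xpath('/bookstore/book[last()]/title/text()'))
    # /bookstore/book[price>35.00] - books whose price is greater than 35.00
    print(root.xpath('/bookstore/book[price>35.00]/title/text()'))
    # //@lang - every lang attribute in the document
    print(root.xpath('//@lang'))


if __name__ == '__main__':
    main()
```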
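The XPath expressions that movie.py uses in parse_item can also be tried out interactively before running a full crawl. A rough sketch with scrapy shell follows; the subject URL is just a placeholder for any Douban movie detail page, and the results depend on the page layout at the time.

```Shell
(venv) $ scrapy shell "https://movie.douban.com/subject/1292052/"
...
>>> response.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
>>> response.xpath('//span[@property="v:genre"]/text()').extract()
>>> response.xpath('//*[@id="info"]/span[1]/a/text()').extract()
```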