diff --git a/biyingSpider.py b/biyingSpider.py index 86bf645..b5fae1e 100644 --- a/biyingSpider.py +++ b/biyingSpider.py @@ -5,10 +5,11 @@ local = time.strftime("%Y.%m.%d") url = 'http://cn.bing.com/' con = requests.get(url) content = con.text -reg = r"(http://s.cn.bing.net/az/hprichbg/rb/.*?.jpg)" +reg = r"(az/hprichbg/rb/.*?.jpg)" a = re.findall(reg, content, re.S)[0] print(a) -read = requests.get(a) +picUrl = url + a +read = requests.get(picUrl) f = open('%s.jpg' % local, 'wb') f.write(read.content) f.close() diff --git a/image_recognition_zhihu.py b/image_recognition_zhihu.py new file mode 100644 index 0000000..5dd0a53 --- /dev/null +++ b/image_recognition_zhihu.py @@ -0,0 +1,202 @@ +# -*- coding:UTF-8 -*- + +import requests , time ,random +import hmac ,json ,base64 +from bs4 import BeautifulSoup +from hashlib import sha1 +import TencentYoutuyun +from PIL import Image +import uuid + + + +def recognition_captcha(data): + ''' 识别验证码 ''' + + file_id = str(uuid.uuid1()) + filename = 'captcha_'+ file_id +'.gif' + filename_png = 'captcha_'+ file_id +'.png' + + if(data is None): + return + data = base64.b64decode(data.encode('utf-8')) + with open( filename ,'wb') as fb: + fb.write( data ) + + appid = 'appid' # 接入优图服务,注册账号获取 + secret_id = 'secret_id' + secret_key = 'secret_key' + userid= 'userid' + end_point = TencentYoutuyun.conf.API_YOUTU_END_POINT + + youtu = TencentYoutuyun.YouTu(appid, secret_id, secret_key, userid, end_point) # 初始化 + + # 拿到的是gif格式,而优图只支持 JPG PNG BMP 其中之一,这时我们需要 pip install Pillow 来转换格式 + im = Image.open( filename) + im.save( filename_png ,"png") + im.close() + + result = youtu.generalocr( filename_png , data_type = 0 , seq = '') # 0代表本地路径,1代表url + + return result + + +def get_captcha(sessiona,headers): + ''' 获取验证码 ''' + + need_cap = False + + while( need_cap is not True): + try: + sessiona.get('https://www.zhihu.com/signin',headers=headers) # 拿cookie:_xsrf + resp2 = sessiona.get('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn',headers=headers) # 拿cookie:capsion_ticket + need_cap = json.loads(resp2.text)["show_captcha"] # {"show_captcha":false} 表示不用验证码 + time.sleep( 0.5 + random.randint(1,9)/10 ) + except Exception: + continue + + try: + resp3 = sessiona.put('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn',headers=headers) # 拿到验证码数据,注意是put + img_data = json.loads(resp3.text)["img_base64"] + except Exception: + return + + + return img_data + +def create_point( point_data, confidence ): + ''' 获得点阵 ''' + + # 实际操作下,套路不深,x间隔25,y相同,共7个点 ,先模拟意思一下 + points = {1:[ 20.5,25.1875],2:[ 45.5,25.1875],3:[ 70.5,25.1875],4:[ 95.5,25.1875],5:[120.5,25.1875],6:[145.5,25.1875],7:[170.5,25.1875]} + wi = 0 + input_points = [] + + for word in ( point_data['items'][0]['words'] ): + wi = wi+1 + if( word['confidence'] < confidence ): + try: + input_points.append(points[wi]) # 倒置的中文,优图识别不出来,置信度会低于0.5 + except KeyError: + continue + + if( len(input_points) > 2 or len(input_points) == 0 ): + return [] # 7个字中只有2个倒置中文的成功率高 + + result = {} + result['img_size']=[200,44] + result['input_points']=input_points + result = json.dumps(result) + print(result) + return result + +def bolting(k_low,k_hi,k3_confidence): + ''' 筛选把握大的进行验证 ''' + + start = time.time() + + is_success = False + while(is_success is not True): + + points_len = 1 + angle = -20 + img_ko = [] + + while(points_len != 21 or angle < k_low or angle > k_hi ): + img_data = get_captcha(sessiona,headers) + img_ko = recognition_captcha(img_data) + + ## json.dumps 序列化时对中文默认使用的ascii编码.想输出真正的中文需要指定ensure_ascii=False + # img_ko_json = json.dumps(img_ko , indent =2 ,ensure_ascii=False ) + # img_ko_json = img_ko_json.encode('raw_unicode_escape') ## 因为python3的原因,也因为优图自身的原因,此处要特殊处理 + + # with open( "json.txt" ,'wb') as fb: + # fb.write( img_ko_json ) + + try: + points_len = len(img_ko['items'][0]['itemstring']) + angle = img_ko['angle'] + except Exception: + points_len = 1 + angle = -20 + continue + + # print(img_ko_json.decode('utf8')) ## stdout用的是utf8,需转码才能正常显示 + # print('-'*50) + + input_text = create_point( img_ko ,k3_confidence ) + if(type(input_text) == type([])): + continue + + data = { + "input_text":input_text + } + + # 提交过快会被拒绝,{"code":120005,"name":"ERR_VERIFY_CAPTCHA_TOO_QUICK"} ,假装思考5秒钟 + time.sleep( 4 + random.randint(1,9)/10 ) + try: + resp5 = sessiona.post('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn',data,headers=headers) + except Exception: + continue + + print("angle: "+ str(angle) ) + print(BeautifulSoup(resp5.content ,'html.parser')) # 如果验证成功,会回应{"success":true},开心 + print('-'*50) + try: + is_success = json.loads(resp5.text)["success"] + except KeyError: + continue + + end = time.time() + + return end-start + + +if __name__ == "__main__": + + sessiona = requests.Session() + headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0','authorization':'oauth c3cef7c66a1843f8b3a9e6a1e3160e20'} + + k3_confidence = 0.71 + + ''' + # 可视化数据会被保存在云端供浏览 + # https://plot.ly/~weldon2010/4 + # 纯属学习,并未看出"角度"范围扩大对图像识别的影响,大部分时候60s内能搞定,说明优图还是很强悍的,识别速度也非常快 + ''' + runtime_list_x = [] + runtime_list_y = [] + nn = range(1,11) # 愿意的话搞多线程,1百万次更有意思 + + # 成功尝试100次,形成2维数据以热力图的方式展示 + for y in nn : + for x in nn : + runtime_list_x.append( bolting(-3,3,k3_confidence) ) + print( "y: " + str(runtime_list_y) ) + print( "x: " + str(runtime_list_x) ) + runtime_list_y.append(runtime_list_x.copy()) + runtime_list_x = [] + + print ("-"*30) + print( runtime_list_y ) + print ("-"*30) + + # pip install plotly 数据可视化 + import plotly + import plotly.graph_objs as go + plotly.tools.set_credentials_file(username='username', api_key='username') # 设置账号,去官网注册 + trace = go.Heatmap(z = runtime_list_y , x = [n for n in nn ] ,y =[n for n in nn ]) + data=[trace] + plotly.plotly.plot(data, filename='weldon-time2-heatmap') + + # 尝试后发现一个特点,基本都是1~2个倒置中文,这样我们可以借此提速 + # 角度范围放大,仅当识别出倒置中文为1~2个时才提交验证否则放弃继续寻找 + +### chcp 65001 (win下改变cmd字符集) +### python c:\python34\image_recognition_zhihu.py + + + + + + diff --git a/login_zhihu.py b/login_zhihu.py index 2bf8933..7c5a601 100644 --- a/login_zhihu.py +++ b/login_zhihu.py @@ -1,27 +1,84 @@ -import requests,time +# -*- coding:UTF-8 -*- + +import requests , time +import hmac ,json from bs4 import BeautifulSoup -url = 'https://www.zhihu.com/login/email' -def get_captcha(data): +from hashlib import sha1 + + +def get_captcha(data,need_cap): + ''' 处理验证码 ''' + if need_cap is False: + return with open('captcha.gif','wb') as fb: fb.write(data) - return input('captcha') + return input('captcha:') + +def get_signature(grantType,clientId,source,timestamp): + ''' 处理签名 ''' + + hm = hmac.new(b'd1b964811afb40118a12068ff74a12f4',None,sha1) + hm.update(str.encode(grantType)) + hm.update(str.encode(clientId)) + hm.update(str.encode(source)) + hm.update(str.encode(timestamp)) -def login(username,password,oncaptcha): - sessiona = requests.Session() - headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0'} - xyz = sessiona.get('https://www.zhihu.com/#signin',headers=headers).content - _xsrf = BeautifulSoup(sessiona.get('https://www.zhihu.com/#signin',headers=headers).content,'html.parser').find('input',attrs={'name':'_xsrf'}).get('value') + return str(hm.hexdigest()) + + + +def login(username,password,oncaptcha,sessiona,headers): + ''' 处理登录 ''' + + resp1 = sessiona.get('https://www.zhihu.com/signin',headers=headers) # 拿cookie:_xsrf + resp2 = sessiona.get('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn',headers=headers) # 拿cookie:capsion_ticket + need_cap = json.loads(resp2.text)["show_captcha"] # {"show_captcha":false} 表示不用验证码 + + grantType = 'password' + clientId = 'c3cef7c66a1843f8b3a9e6a1e3160e20' + source ='com.zhihu.web' + timestamp = str((time.time()*1000)).split('.')[0] # 签名只按这个时间戳变化 + captcha_content = sessiona.get('https://www.zhihu.com/captcha.gif?r=%d&type=login'%(time.time()*1000),headers=headers).content + data = { - "_xsrf":_xsrf, - "email":username, + "client_id":clientId, + "grant_type":grantType, + "timestamp":timestamp, + "source":source, + "signature": get_signature(grantType,clientId,source,timestamp), # 获取签名 + "username":username, "password":password, - "remember_me":True, - "captcha":oncaptcha(captcha_content) + "lang":"cn", + "captcha":oncaptcha(captcha_content,need_cap), # 获取图片验证码 + "ref_source":"other_", + "utm_source":"" } - resp = sessiona.post('https://www.zhihu.com/login/email',data,headers=headers).content - print(resp) + + print("**2**: "+str(data)) + print("-"*50) + resp = sessiona.post('https://www.zhihu.com/api/v3/oauth/sign_in',data,headers=headers).content + print(BeautifulSoup(resp,'html.parser')) + + print("-"*50) return resp if __name__ == "__main__": - login('email','password',get_captcha) \ No newline at end of file + sessiona = requests.Session() + headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0','authorization':'oauth c3cef7c66a1843f8b3a9e6a1e3160e20'} + + login('12345678@qq.com','12345678',get_captcha,sessiona,headers) # 用户名密码换自己的就好了 + resp = sessiona.get('https://www.zhihu.com/inbox',headers=headers) # 登录进去了,可以看私信了 + print(BeautifulSoup(resp.content ,'html.parser')) + + + + +### chcp 65001 (win下改变cmd字符集) +### python c:\python34\login_zhihu.py +### 有非常无语的事情发生,还以为代码没生效 + + + + +