Merge branch 'master' of https://github.com/injetlee/Python

2018-07-26 18:24:45 +08:00 · 2018-07-26 18:24:45 +08:00 · 07c1113f91
parent 6ae2297311 9176b0a872
commit 07c1113f91
3 changed files with 278 additions and 18 deletions
--- a/biyingSpider.py
+++ b/biyingSpider.py
@ -5,10 +5,11 @@ local = time.strftime("%Y.%m.%d")
 url = 'http://cn.bing.com/'
 con = requests.get(url)
 content = con.text
-reg = r"(http://s.cn.bing.net/az/hprichbg/rb/.*?.jpg)"
+reg = r"(az/hprichbg/rb/.*?.jpg)"  # now matches a site-relative image path; NOTE(review): the dots are unescaped, so "." matches any character
 a = re.findall(reg, content, re.S)[0]
 print(a)
-read = requests.get(a)
+picUrl = url + a  # build the absolute image URL from the base url and the matched relative path
+read = requests.get(picUrl)
 f = open('%s.jpg' % local, 'wb')
 f.write(read.content)
 f.close()
--- a/image_recognition_zhihu.py
+++ b/image_recognition_zhihu.py
@ -0,0 +1,202 @@
+# -*- coding:UTF-8 -*-
+
+import  requests , time ,random
+import  hmac ,json ,base64
+from bs4 import BeautifulSoup
+from hashlib import sha1
+import TencentYoutuyun
+from PIL import Image
+import uuid
+
+
+    
+def recognition_captcha(data):
+    ''' Recognize a captcha: decode the base64 image data, save it as a GIF, convert it to PNG and pass the PNG to youtu.generalocr. Returns the OCR result, or None when data is None. '''
+
+    file_id = str(uuid.uuid1())
+    filename = 'captcha_'+ file_id +'.gif'
+    filename_png =  'captcha_'+ file_id +'.png'
+
+    if(data is None):
+        return 
+    data = base64.b64decode(data.encode('utf-8'))
+    with open( filename ,'wb') as fb:
+        fb.write( data )    
+    
+    appid = 'appid' # placeholder credentials: register an account with the Youtu service to obtain real ones
+    secret_id = 'secret_id'  
+    secret_key = 'secret_key'  
+    userid= 'userid' 
+    end_point = TencentYoutuyun.conf.API_YOUTU_END_POINT   
+
+    youtu = TencentYoutuyun.YouTu(appid, secret_id, secret_key, userid, end_point) # initialise the Youtu client
+
+    # The captcha arrives as GIF, but Youtu only supports JPG, PNG or BMP, so Pillow (pip install Pillow) is used to convert the format
+    im = Image.open( filename)
+    im.save( filename_png ,"png")
+    im.close()
+    
+    result = youtu.generalocr( filename_png , data_type = 0 , seq = '')  #  0 means a local path, 1 means a URL
+
+    return result
+
+
+def get_captcha(sessiona,headers):
+    ''' Fetch captcha image data: poll the captcha endpoint until show_captcha is True, then PUT to the same endpoint and return its img_base64 field (None if that request or its parsing fails). '''
+    
+    need_cap = False
+
+    while( need_cap is not True):
+        try:
+            sessiona.get('https://www.zhihu.com/signin',headers=headers)  # obtain the _xsrf cookie
+            resp2 = sessiona.get('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn',headers=headers)  # obtain the capsion_ticket cookie
+            need_cap = json.loads(resp2.text)["show_captcha"]  # {"show_captcha":false} means no captcha is required
+            time.sleep( 0.5 + random.randint(1,9)/10 )
+        except Exception:
+            continue  # retry on any error; the sleep above is skipped on this path
+
+    try:
+        resp3 = sessiona.put('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn',headers=headers) # fetch the captcha data; note that this is a PUT request
+        img_data = json.loads(resp3.text)["img_base64"]
+    except Exception:
+        return     
+    
+
+    return img_data
+
+def create_point( point_data, confidence ):
+    ''' Build the click-point payload: for every word of the first OCR item whose confidence is below `confidence`, pick the preset coordinate for its 1-based position. Returns a JSON string with img_size and input_points, or [] when zero or more than two points were picked. '''
+
+    # In practice the layout is simple: x is spaced by 25, y is constant, 7 points in total; these are rough simulated coordinates for now
+    points = {1:[ 20.5,25.1875],2:[ 45.5,25.1875],3:[ 70.5,25.1875],4:[ 95.5,25.1875],5:[120.5,25.1875],6:[145.5,25.1875],7:[170.5,25.1875]}
+    wi = 0
+    input_points = []
+    
+    for word in ( point_data['items'][0]['words'] ):
+        wi = wi+1
+        if( word['confidence'] < confidence ):
+            try:
+                input_points.append(points[wi]) # author's observation: Youtu cannot recognise upside-down Chinese characters, so their confidence falls below 0.5
+            except KeyError:
+                continue
+        
+    if( len(input_points) > 2 or len(input_points) == 0 ):
+        return []  # author's observation: the success rate is high only when 2 of the 7 characters are upside-down
+    
+    result = {}
+    result['img_size']=[200,44]
+    result['input_points']=input_points
+    result = json.dumps(result)
+    print(result)
+    return result
+
+def bolting(k_low,k_hi,k3_confidence):
+    ''' Filter for high-confidence captchas and submit them for verification: loops until the server response reports success, then returns the elapsed time in seconds. Uses the module-level sessiona and headers. '''
+
+    start = time.time()
+    
+    is_success = False
+    while(is_success is not True):
+    
+        points_len = 1
+        angle = -20
+        img_ko = []
+
+        while(points_len != 21  or  angle < k_low  or angle > k_hi ):  # keep fetching until the OCR itemstring has length 21 and the angle lies within [k_low, k_hi]
+            img_data = get_captcha(sessiona,headers)
+            img_ko = recognition_captcha(img_data)
+     
+            ## json.dumps escapes non-ASCII text as ASCII by default; pass ensure_ascii=False to output real Chinese characters
+            # img_ko_json = json.dumps(img_ko , indent =2 ,ensure_ascii=False ) 
+            # img_ko_json = img_ko_json.encode('raw_unicode_escape') ## special handling is needed here because of Python 3 and because of Youtu itself
+        
+            # with open( "json.txt" ,'wb') as fb:
+            #     fb.write( img_ko_json )  
+    
+            try:
+                points_len = len(img_ko['items'][0]['itemstring'])
+                angle = img_ko['angle']
+            except Exception:
+                points_len = 1
+                angle = -20
+                continue
+
+        # print(img_ko_json.decode('utf8')) ## stdout uses utf8, so it must be decoded to display correctly
+        # print('-'*50)
+        
+        input_text = create_point( img_ko ,k3_confidence )
+        if(type(input_text) == type([])):  # create_point returns [] when it rejects the captcha; NOTE(review): isinstance(input_text, list) is the idiomatic check
+            continue
+        
+        data = {
+            "input_text":input_text   
+            }
+
+        # Submitting too quickly is rejected with {"code":120005,"name":"ERR_VERIFY_CAPTCHA_TOO_QUICK"}, so pretend to think for about 5 seconds (4.1-4.9 s here)
+        time.sleep( 4 + random.randint(1,9)/10 )
+        try:    
+            resp5 = sessiona.post('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn',data,headers=headers)
+        except Exception:
+            continue
+        
+        print("angle: "+ str(angle) )
+        print(BeautifulSoup(resp5.content ,'html.parser')) # on successful verification the response is {"success":true}
+        print('-'*50)
+        try:
+            is_success = json.loads(resp5.text)["success"]
+        except KeyError:  # NOTE(review): a non-JSON response body would raise ValueError here, which is not caught - confirm
+            continue
+
+    end = time.time()
+
+    return end-start
+
+
+if __name__ == "__main__":
+    
+    sessiona = requests.Session()
+    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0','authorization':'oauth c3cef7c66a1843f8b3a9e6a1e3160e20'}
+
+    k3_confidence = 0.71
+    
+    '''
+    # 可视化数据会被保存在云端供浏览
+    # https://plot.ly/~weldon2010/4
+    # 纯属学习，并未看出"角度"范围扩大对图像识别的影响，大部分时候60s内能搞定，说明优图还是很强悍的，识别速度也非常快
+    '''
+    runtime_list_x = []
+    runtime_list_y = []
+    nn = range(1,11) # with multithreading, a million runs would be more interesting, if you feel like it
+    
+    # Collect 100 successful attempts into 2-D data to be shown as a heat map
+    for y in nn :
+        for x in  nn :
+            runtime_list_x.append( bolting(-3,3,k3_confidence) )
+            print( "y: " + str(runtime_list_y) )
+            print( "x: " + str(runtime_list_x) )
+        runtime_list_y.append(runtime_list_x.copy())
+        runtime_list_x = []
+
+    print ("-"*30)    
+    print( runtime_list_y )
+    print ("-"*30)
+
+    # pip install plotly, used for data visualisation
+    import plotly
+    import plotly.graph_objs as go
+    plotly.tools.set_credentials_file(username='username', api_key='username') # set the account credentials (placeholders here); register on the official website
+    trace = go.Heatmap(z = runtime_list_y , x = [n for n in nn ] ,y =[n for n in nn ])
+    data=[trace]
+    plotly.plotly.plot(data, filename='weldon-time2-heatmap')    
+   
+    # After experimenting, one pattern stands out: there are almost always 1-2 upside-down characters, which can be used to speed things up
+    # Widen the angle range and submit for verification only when 1-2 upside-down characters are detected; otherwise discard and keep looking
+
+### chcp 65001 (changes the cmd code page on Windows)
+### python  c:\python34\image_recognition_zhihu.py
+
+
+
+
+
+
--- a/login_zhihu.py
+++ b/login_zhihu.py
@ -1,27 +1,84 @@
-import requests,time
+# -*- coding:UTF-8 -*-
+
+import  requests , time
+import  hmac ,json
 from bs4 import BeautifulSoup
-url = 'https://www.zhihu.com/login/email'
-def get_captcha(data):
+from hashlib import sha1
+
+
+def get_captcha(data,need_cap):
+    ''' Handle the captcha: return None when need_cap is False; otherwise save the image bytes to captcha.gif and return what the user types at the prompt. '''
+    if need_cap is False:
+        return
     with open('captcha.gif','wb') as fb:
         fb.write(data)
-    return input('captcha')
+    return input('captcha:')
+    
+def get_signature(grantType,clientId,source,timestamp):
+    ''' Compute the request signature: HMAC-SHA1, keyed with a fixed key, fed grantType, clientId, source and timestamp in that order; returns the hex digest as a string. '''
+	
+    hm = hmac.new(b'd1b964811afb40118a12068ff74a12f4',None,sha1)
+    hm.update(str.encode(grantType))
+    hm.update(str.encode(clientId))
+    hm.update(str.encode(source))
+    hm.update(str.encode(timestamp))
 
-def login(username,password,oncaptcha):
-    sessiona = requests.Session()
-    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0'}
-    xyz = sessiona.get('https://www.zhihu.com/#signin',headers=headers).content
-    _xsrf = BeautifulSoup(sessiona.get('https://www.zhihu.com/#signin',headers=headers).content,'html.parser').find('input',attrs={'name':'_xsrf'}).get('value')
+    return  str(hm.hexdigest())
+
+
+
+def login(username,password,oncaptcha,sessiona,headers):
+    ''' Handle login: request the sign-in page and the captcha endpoint, build the signed form data and POST it to the sign_in API; prints and returns the raw response content. '''
+    
+    resp1 = sessiona.get('https://www.zhihu.com/signin',headers=headers)  # obtain the _xsrf cookie
+    resp2 = sessiona.get('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn',headers=headers)  # obtain the capsion_ticket cookie
+    need_cap = json.loads(resp2.text)["show_captcha"]  # {"show_captcha":false} means no captcha is required
+
+    grantType = 'password'
+    clientId = 'c3cef7c66a1843f8b3a9e6a1e3160e20'
+    source ='com.zhihu.web'
+    timestamp = str((time.time()*1000)).split('.')[0]  # the signature varies only with this timestamp
+       
     captcha_content = sessiona.get('https://www.zhihu.com/captcha.gif?r=%d&type=login'%(time.time()*1000),headers=headers).content
+    
     data = {
-        "_xsrf":_xsrf,
-        "email":username,
+        "client_id":clientId,
+        "grant_type":grantType,
+        "timestamp":timestamp,
+        "source":source,
+        "signature": get_signature(grantType,clientId,source,timestamp), # compute the signature
+        "username":username,
         "password":password,
-        "remember_me":True,
-        "captcha":oncaptcha(captcha_content)
+        "lang":"cn",
+        "captcha":oncaptcha(captcha_content,need_cap), # obtain the image captcha text
+        "ref_source":"other_",
+        "utm_source":""
     }
-    resp = sessiona.post('https://www.zhihu.com/login/email',data,headers=headers).content
-    print(resp)
+    
+    print("**2**: "+str(data))
+    print("-"*50)
+    resp = sessiona.post('https://www.zhihu.com/api/v3/oauth/sign_in',data,headers=headers).content
+    print(BeautifulSoup(resp,'html.parser'))
+    
+    print("-"*50)
     return resp 

 if __name__ == "__main__":
-    login('email','password',get_captcha)
+    sessiona = requests.Session()
+    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0','authorization':'oauth c3cef7c66a1843f8b3a9e6a1e3160e20'}
+
+    login('12345678@qq.com','12345678',get_captcha,sessiona,headers) # replace the username and password with your own
+    resp = sessiona.get('https://www.zhihu.com/inbox',headers=headers)  # logged in, so the private messages can now be read
+    print(BeautifulSoup(resp.content ,'html.parser'))
+    
+    
+    
+    
+### chcp 65001 (changes the cmd code page on Windows)
+### python  c:\python34\login_zhihu.py
+### Something truly baffling happened; I thought the code had not taken effect
+
+
+
+
+