Python crawler: simulated Weibo (microblog) login

Microblog Simulated Logon

This is the website of this crawl: https://weibo.com/

I. Request Analysis

Find the login form and fill in the username and password to perform the login operation.

Look at the data for this request response.

This is the data from the response. Save it.

exectime: 8
nonce: "HW9VSX"
pcid: "gz-4ede4c6269a09f5b7a6490f790b4aa944eec"
pubkey: "EB2A38568661887FA180BDDB5CABD5F21C7BFD59C090CB2D245A87AC253062882729293E5506350508E7F9AA3BB77F4333231490F915F6D63C55FE2F08A49B353F444AD3993CACC02DB784ABBB8E42A9B1BBFFFB38BE18D78E87A0E41B9B8F73A928EE0CCEE1F6739884B9777E4FE9E88A1BBE495927AC4A799B3181D6442443"
retcode: 0
rsakv: "1330428213"
servertime: 1568257059

Continue to improve the login operation, enter the password, click the login button

After analysis, we find that the parameters that change between requests are sp, nonce and servertime. servertime is the current timestamp, so we only need to find out how the other two parameters are generated. As for su, this parameter is the account name encoded with Base64 (Base64 is an encoding, not encryption).

II. Find how sp and nonce are generated

This time, you don't need to search keywords to find the encrypted location.

Find the location of the calling function, set a breakpoint there, and log in.

After debugging and analysis of js code flow, we finally found the location of encryption.

A brief introduction to how to debug js code

Find where sp and nonce are generated, and reimplement their generation in Python code.

sp is generated by RSA encryption. The specific usage of RSA can be looked up online (e.g. on Baidu). Alternatively, sp can be generated with SHA-1 hashing. As for me.rsaPubkey, where does it come from? A request is sent before we click the login button, and the response to that request contains it (the pubkey field shown above). If you test several times, you will find that this value is fixed, so we can use it directly without requesting it.

nonce: It also appears in the response to the request that is sent before login is clicked. However, when we click login several times, that request is not sent again, yet the value of nonce differs every time. So it must be generated by a function in the local JS file rather than fetched from the server. Here we find the nonce-generating function.

import random
import rsa
import hashlib
from binascii import b2a_hex

def get_nonce(n):
    """Return a random string of ``n`` characters drawn from A-Z and 0-9.

    Each character is chosen independently with ``random.randint``; an
    ``n`` of zero (or less) yields the empty string.
    """
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
    last_index = len(alphabet) - 1
    picked = [alphabet[random.randint(0, last_index)] for _ in range(n)]
    return "".join(picked)


def get_sp_rsa(password, servertime, nonce):
    """Build the ``sp`` login parameter using RSA encryption.

    The plaintext is ``servertime + "\\t" + nonce + "\\n" + password``,
    UTF-8 encoded and encrypted with a fixed public key (hex modulus
    below, public exponent 0x10001).

    Returns the ciphertext hex-encoded with ``b2a_hex`` (a ``bytes`` value).
    """
    modulus_hex = "EB2A38568661887FA180BDDB5CABD5F21C7BFD59C090CB2D245A87AC253062882729293E5506350508E7F9AA3BB77F4333231490F915F6D63C55FE2F08A49B353F444AD3993CACC02DB784ABBB8E42A9B1BBFFFB38BE18D78E87A0E41B9B8F73A928EE0CCEE1F6739884B9777E4FE9E88A1BBE495927AC4A799B3181D6442443"
    public_key = rsa.PublicKey(int(modulus_hex, 16), 0x10001)
    plaintext = "\t".join((servertime, nonce)) + "\n" + password
    ciphertext = rsa.encrypt(bytes(plaintext, "utf-8"), public_key)
    return b2a_hex(ciphertext)


def get_sp_sha1(password, servertime, nonce):
    """Build the ``sp`` login parameter using three rounds of SHA-1.

    Computes ``sha1(sha1(sha1(password)) + servertime + nonce)``, where
    every round hashes the UTF-8 bytes of the previous round's lowercase
    hex digest.

    Returns the final digest as a 40-character hex string.
    """
    round_one = hashlib.sha1(bytes(password, "utf-8")).hexdigest()
    round_two = hashlib.sha1(bytes(round_one, "utf-8")).hexdigest()
    salted = round_two + servertime + nonce
    return hashlib.sha1(bytes(salted, "utf-8")).hexdigest()

Response data

Request parameter analysis is almost complete, this time enter the correct username, password. Look at the data for the response.

Open fiddler, then exit the current account and re-login. There are many requests on fiddler. Find the required request and see the response content

Each response in this sequence sets cookies (via Set-Cookie). So by following the process above, we are sure to obtain the cookie that identifies the logged-in session. Afterwards, just send this cookie along with later requests to do other things.

Finally, attach the code.

import requests, random, time, rsa, hashlib, base64, re, json
from binascii import b2a_hex


class WeiBo:
    """Simulated Weibo login through the Sina SSO endpoint.

    All requests go through one ``requests.Session`` so that the cookies
    set by each step of the login flow are kept for later requests.
    """

    # Characters a nonce is drawn from.
    _NONCE_ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"

    # RSA public modulus (hex) used to encrypt the password, together with
    # the key version ("rsakv") that is posted alongside the ciphertext.
    # NOTE(review): both are hard-coded on the assumption that the server
    # keeps handing out the same key -- confirm before relying on it.
    _RSA_PUBKEY_HEX = "EB2A38568661887FA180BDDB5CABD5F21C7BFD59C090CB2D245A87AC253062882729293E5506350508E7F9AA3BB77F4333231490F915F6D63C55FE2F08A49B353F444AD3993CACC02DB784ABBB8E42A9B1BBFFFB38BE18D78E87A0E41B9B8F73A928EE0CCEE1F6739884B9777E4FE9E88A1BBE495927AC4A799B3181D6442443"
    _RSA_EXPONENT = 0x10001
    _RSA_KEY_VERSION = "1330428213"

    def __init__(self):
        """Create the HTTP session and the base browser-like headers."""
        self.session = requests.Session()
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"
        }

    def login(self, account, password):
        """Log in and print whether the login succeeded.

        Flow:
          1. POST the encoded credentials to the SSO login endpoint.
          2. Follow the ``location.replace("...")`` URL found in the reply.
          3. Visit every URL of the ``arrURL`` list found in that reply and
             read ``uniqueid`` from the passport.weibo.com response.
          4. Open the user's home page and check that it mentions the id.

        Args:
            account: Account name; sent Base64-encoded as ``su``.
            password: Plain-text password; sent RSA-encrypted as ``sp``.

        Raises:
            Exception: If a redirect URL or the user id cannot be extracted
                from a response.
        """
        api = "https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.19)"
        nonce = self._get_nonce()
        servertime = self._get_now_time()
        sp = self._get_sp_rsa(password, servertime, nonce)
        su = self._get_su(account)
        data = {
            "entry": "weibo",
            "gateway": "1",
            "from": "",
            "savestate": "7",
            "qrcode_flag": "false",
            "useticket": "1",
            "pagerefer": "https://login.sina.com.cn/crossdomain2.php?action=logout&r=https%3A%2F%2Fpassport.weibo.com%2Fwbsso%2Flogout%3Fr%3Dhttps%253A%252F%252Fweibo.com%26returntype%3D1",
            "vsnf": "1",
            "su": su,
            "service": "miniblog",
            "servertime": servertime,
            "nonce": nonce,
            "pwencode": "rsa2",
            "rsakv": self._RSA_KEY_VERSION,
            "sp": sp,
            "sr": "1920*1080",
            # Was "UTF - 8" (stray spaces), which is not a charset name.
            "encoding": "UTF-8",
            "prelt": "149",
            "url": "https://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack",
            "returntype": "META",
        }
        login_headers = self.headers.copy()
        login_headers.update({
            "Host": "login.sina.com.cn",
            "Origin": "https://weibo.com",
            "Referer": "https://weibo.com/"
        })

        response = self.session.post(api, headers=login_headers, data=data, allow_redirects=False)
        # Raw string: "\(" in a normal string literal is an invalid escape.
        match = self._re_search(r'location\.replace\("(.*?)"', response.text)
        redirect_url = match and match.group(1)
        if not redirect_url:
            raise Exception("redirect url Acquisition failure")

        # dict.update() returns None, so passing its result as ``headers``
        # sent this request without the intended headers. Build the dict
        # first. "Host" is left out so requests derives it from the URL.
        redirect_headers = self.headers.copy()
        redirect_headers["Referer"] = api
        response = self.session.get(redirect_url, headers=redirect_headers, allow_redirects=False)
        match = self._re_search('"arrURL":(.*?)}', response.text)
        raw_url_list = match and match.group(1)
        if not raw_url_list:
            raise Exception("redirect url Acquisition failure")

        user_id = ""
        for url in json.loads(raw_url_list):
            # Every URL is requested so the session collects its cookies.
            response = self.session.get(url, headers=self.headers)
            if url.startswith("https://passport.weibo.com/wbsso/login"):
                match = self._re_search('"uniqueid":"(.*?)"', response.text)
                # A missing uniqueid must end in the explicit error below,
                # not in an AttributeError on None.
                if match:
                    user_id = match.group(1)
        if not user_id:
            raise Exception("userId Acquisition failure")

        user_details_url = "https://weibo.com/u/{}/home?wvr=5&lf=reg".format(user_id)
        home_headers = self.headers.copy()
        home_headers["Referer"] = "https://weibo.com/"
        response = self.session.get(user_details_url, headers=home_headers)
        # Plain substring test: the id is data, not a regex pattern.
        if user_id in response.text:
            print("Login successfully")
            print(self.session.cookies)
        else:
            print("Login failed")

    def _get_nonce(self, length=6):
        """Return a random nonce of ``length`` characters (A-Z, 0-9).

        The default is 6 rather than the previously hard-coded 5, to match
        the 6-character nonce of a sample server response
        (NOTE(review): confirm the length the server expects).
        """
        return "".join(random.choice(self._NONCE_ALPHABET) for _ in range(length))

    def _get_now_time(self):
        """Return the current Unix time in whole seconds, as a string."""
        return str(int(time.time()))

    def _get_sp_rsa(self, password, servertime, nonce):
        """Return the RSA-encrypted ``sp`` value as hex-encoded bytes.

        The plaintext is ``servertime + "\\t" + nonce + "\\n" + password``,
        encoded as UTF-8.
        """
        pubkey = rsa.PublicKey(int(self._RSA_PUBKEY_HEX, 16), self._RSA_EXPONENT)
        plaintext = "\t".join([servertime, nonce]) + "\n" + password
        return b2a_hex(rsa.encrypt(plaintext.encode("utf-8"), pubkey))

    def _get_sp_sha1(self, password, servertime, nonce):
        """Return ``sha1(sha1(sha1(password)) + servertime + nonce)`` as hex.

        Each round hashes the UTF-8 bytes of the previous round's hex
        digest. ``login`` does not call this method; it uses the RSA
        variant.
        """
        first = hashlib.sha1(password.encode("utf-8")).hexdigest()
        second = hashlib.sha1(first.encode("utf-8")).hexdigest()
        salted = second + servertime + nonce
        return hashlib.sha1(salted.encode("utf-8")).hexdigest()

    def _get_su(self, account):
        """Return the account name Base64-encoded, as a ``str``."""
        return base64.b64encode(account.encode("utf-8")).decode("utf-8")

    def _re_search(self, pattern, html):
        """Search ``html`` for ``pattern`` with DOTALL; return match or None."""
        return re.search(pattern, html, re.S)

    def test(self):
        """Run a demo login with the placeholder credentials below."""
        self.login("18716758777", "123456")


if __name__ == "__main__":
    # Script entry point: build a client and run its demo login.
    wb = WeiBo()
    wb.test()

Keywords: Python encoding SHA1 Session PHP

Added by freakus_maximus on Thu, 12 Sep 2019 10:43:47 +0300