#!/usr/bin/env python3
"""Migrate WizNote documents into OneNote via the OneNote API (onenote.com v1.0)."""
import mimetypes
import os
import re
import sqlite3
from collections import namedtuple, OrderedDict
from datetime import datetime, timedelta, timezone
from sys import platform
from urllib.parse import urlencode
from zipfile import ZipFile

import requests
from bs4 import BeautifulSoup
from requests import HTTPError
from requests.auth import AuthBase

CLIENT_ID = 'ddbd41a5-c46d-44f2-85b2-d73bbd7bee7d'
CLIENT_SECRET = 'qqgu6ApNZyUmvYgna2WBwK5'
REDIRECT_URI = 'https://login.live.com/oauth20_desktop.srf'
LINUX_DATA_PATH = os.path.expanduser('~/.wiznote/{}/data')
API_BASE = 'https://www.onenote.com/api/v1.0/me/notes'
AUTH_URL = 'https://login.live.com/oauth20_authorize.srf?' + urlencode({
    'response_type': 'code',
    'client_id': CLIENT_ID,
    'redirect_uri': REDIRECT_URI,
    'scope': 'wl.signin office.onenote_create'
})
OAUTH_URL = 'https://login.live.com/oauth20_token.srf'

Document = namedtuple('Document', ['guid', 'title', 'location', 'name', 'url', 'created'])


class BearerAuth(AuthBase):
    """Attach the OAuth bearer token to every request made through the session."""

    def __init__(self, token):
        self.token = token

    def __call__(self, r):
        r.headers['Authorization'] = 'Bearer ' + self.token
        return r


def get_data_dir():
    """Locate the WizNote data directory and its index.db for the current platform."""
    while True:
        if platform == 'linux':
            account = input('Input WizNote account: ')
            data_path = LINUX_DATA_PATH.format(account)
        elif platform == 'win32' or platform == 'darwin':
            data_path = input('Input WizNote dir path containing "index.db": ')
        else:
            raise Exception('Unsupported platform')
        index_path = os.path.join(data_path, 'index.db')
        if not os.path.isfile(index_path):
            print('Account data not found!')
            continue
        break
    return data_path, index_path


def get_doc_path(data_path, doc):
    """Return the path of a document's zip archive; the layout differs by platform."""
    if platform == 'linux' or platform == 'darwin':
        return os.path.join(data_path, 'notes', '{%s}' % doc.guid)
    if platform == 'win32':
        return os.path.join(data_path, doc.location.strip('/'), doc.name)
    raise Exception('Unsupported platform')


def get_token(session):
    """Walk the user through the OAuth code flow and exchange the code for an access token."""
    print('Sign in via:')
    print(AUTH_URL)
    print('After signing in, copy the URL of the blank page;\n'
          'it should start with "https://login.live.com/oauth20_desktop.srf"')
    while True:
        url = input('URL: ')
        match = re.match(r'https://login\.live\.com/oauth20_desktop\.srf\?code=([\w-]{37})', url)
        if not match:
            print('Invalid URL!')
            continue
        code = match.group(1)
        break
    resp = session.post(OAUTH_URL, data={
        'grant_type': 'authorization_code',
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'code': code,
        'redirect_uri': REDIRECT_URI,
    }).json()
    print('Token result:', resp)
    return resp['access_token']


def create_notebook(session):
    """Create a new OneNote notebook and return its id."""
    name = input('Please input a new notebook name (e.g. WizNote; it can\'t be empty).\nName: ')
    print('Creating notebook: "%s"' % name)
    resp = session.post(API_BASE + '/notebooks', json={'name': name})
    resp.raise_for_status()
    return resp.json()['id']


def create_section(session, notebook_id, name):
    """Create a section in the given notebook and return its id."""
    print('Creating section: "%s"' % name)
    resp = session.post(API_BASE + '/notebooks/%s/sections' % notebook_id, json={'name': name})
    resp.raise_for_status()
    return resp.json()['id']


def get_documents():
    """Read the WizNote index.db and group exportable documents by location (folder)."""
    data_path, index_path = get_data_dir()
    result = OrderedDict()
    with sqlite3.connect(index_path) as conn:
        sql = """
            SELECT DOCUMENT_GUID, DOCUMENT_TITLE, DOCUMENT_LOCATION, DOCUMENT_NAME,
                   DOCUMENT_URL, DT_CREATED, DOCUMENT_PROTECT, DOCUMENT_ATTACHEMENT_COUNT
            FROM wiz_document
            ORDER BY DOCUMENT_LOCATION;
        """.strip()
        cur = conn.execute(sql)
        while True:
            row = cur.fetchone()
            if not row:
                break
            guid, title, location, name, url, created, protect, attachment_count = row
            if protect:
                print('Ignoring protected document "%s"' % (location + title))
                continue
            if attachment_count:
                print('Ignoring %d attachment(s) in "%s"' % (attachment_count, location + title))
            docs = result.get(location)
            if not docs:
                docs = []
                result[location] = docs
            doc = Document(guid, title, location, name, url, created)
            docs.append(doc)
    return data_path, result


def clean_html(data, doc):
    """Rewrite a WizNote index.html so the OneNote API accepts it as a page.

    Local image references are rewritten to "name:<part-name>" so the images can
    be attached as extra multipart parts; the title and created time are injected
    into <head>, and the source URL (if any) is prepended to <body>.
    """
    def parse_datetime(time_str):
        # DT_CREATED is a naive "YYYY-MM-DD HH:MM:SS" string; attach a fixed
        # UTC-8 offset and format it as ISO 8601 for the "created" meta tag.
        time = datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S')
        return time.replace(tzinfo=timezone(timedelta(hours=-8))).isoformat()

    soup = BeautifulSoup(data, 'lxml')
    pattern = re.compile(r'^index_files/(.+?)$')
    imgs = soup.find_all('img', src=pattern)
    src_file_names = []
    for img in imgs:
        src = img['src']
        match = pattern.fullmatch(src)
        file_name = match.group(1)
        src_file_names.append(file_name)
        img['src'] = 'name:' + file_name

    head_tag = soup.head
    if not head_tag:
        head_tag = soup.new_tag('head')
        soup.html.insert(0, head_tag)
    title_tag = head_tag.title
    if not title_tag:
        title_tag = soup.new_tag('title')
    # always set the title, to avoid 'Untitled' pages in OneNote
    title_tag.string = doc.title
    head_tag.insert(0, title_tag)
    created_tag = soup.new_tag('meta', attrs={'name': 'created',
                                              'content': parse_datetime(doc.created)})
    head_tag.insert(1, created_tag)
    if doc.url:
        url_tag = soup.new_tag('p')
        url_tag.string = 'URL: ' + doc.url
        soup.body.insert(0, url_tag)
    return soup.encode('utf-8'), src_file_names


def upload_doc(session, section_id, data_path, doc):
    """Upload one WizNote document (a zip of index.html plus images) as a OneNote page."""
    doc_path = get_doc_path(data_path, doc)
    print('Processing %s%s (%s)' % (doc.location, doc.title, doc.guid))
    with ZipFile(doc_path) as zip_file:
        html_content, src_file_names = clean_html(zip_file.read('index.html'), doc)
        if len(src_file_names) > 5:
            print('Upload may fail if there are more than 5 images')
        data_send = {
            'Presentation': (None, html_content, mimetypes.guess_type('index.html')[0])
        }
        for name in src_file_names:
            data_send[name] = (None, zip_file.read('index_files/' + name),
                               mimetypes.guess_type(name)[0])
        resp = session.post(API_BASE + '/sections/%s/pages' % section_id, files=data_send)
        resp.raise_for_status()


def main():
    data_dir, docs = get_documents()
    with requests.Session() as session:
        token = get_token(session)
        session.auth = BearerAuth(token)
        notebook_id = create_notebook(session)
        for location, location_docs in docs.items():
            section_name = location.strip('/').replace('/', '-')
            section_id = create_section(session, notebook_id, section_name)
            for doc in location_docs:
                upload_doc(session, section_id, data_dir, doc)


if __name__ == '__main__':
    try:
        main()
    except HTTPError as e:
        # surface the API error body when a request fails
        print(e.response.json())