# vim:set et sts=4:
# -*- coding: utf-8 -*-
# genpytable.py

from pydict import *
from bopomofo import *

def str_cmp(a, b):
    """Three-way compare two strings: shorter first, then lexicographic.

    Returns a negative number, zero or a positive number, following the
    convention of an old-style comparison function.  For strings of
    different length the result is the length difference.
    """
    if len(a) != len(b):
        return len(a) - len(b)
    # (a > b) - (a < b) gives -1 / 0 / 1 exactly like the Python 2 builtin
    # cmp(), without depending on that builtin being available.
    return (a > b) - (a < b)

# Sorted list of the keys of PINYIN_DICT (the complete pinyin spellings).
# sorted() works whether keys() returns a list or a view.
pinyin_list = sorted(PINYIN_DICT.keys())

# Sorted list of the keys of SHENGMU_DICT without the empty "no initial"
# entry.  list() makes an explicit copy so remove()/sort() are available;
# remove() still raises ValueError if "" is unexpectedly missing.
shengmu_list = list(SHENGMU_DICT.keys())
shengmu_list.remove("")
shengmu_list.sort()

# Suffix-level auto-correction rules consumed by get_pinyin().
# For every known pinyin that ends with the "correct" suffix (and is not the
# suffix itself), get_pinyin() also emits the spelling with the "wrong"
# suffix substituted, flagged PINYIN_CORRECT_<WRONG>_TO_<CORRECT>.
auto_correct = [
    # "correct", "wrong"
    ("ng", "gn"),
    ("ng", "mg"),
    ("iu", "iou"),
    ("ui", "uei"),
    ("un", "uen"),
#    ("ue", "ve"),
    ("ve", "ue"),
    ("ong", "on"),
]

# Whole-syllable auto-correction rules consumed by get_pinyin().
# Each entry is emitted verbatim: the "wrong" spelling becomes the table
# text, sheng/yun are derived from the "correct" spelling, and the third
# field is used as the flag expression exactly as written here.
auto_correct_ext = [
    # "correct", "wrong", flag
    ("ju", "jv", "PINYIN_CORRECT_V_TO_U"),
    ("qu", "qv", "PINYIN_CORRECT_V_TO_U"),
    ("xu", "xv", "PINYIN_CORRECT_V_TO_U"),
    ("yu", "yv", "PINYIN_CORRECT_V_TO_U"),

    ("jue", "jve", "PINYIN_CORRECT_V_TO_U"),
    ("que", "qve", "PINYIN_CORRECT_V_TO_U"),
    ("xue", "xve", "PINYIN_CORRECT_V_TO_U"),
    ("yue", "yve", "PINYIN_CORRECT_V_TO_U"),

    ("juan", "jvan", "PINYIN_CORRECT_V_TO_U"),
    ("quan", "qvan", "PINYIN_CORRECT_V_TO_U"),
    ("xuan", "xvan", "PINYIN_CORRECT_V_TO_U"),
    ("yuan", "yvan", "PINYIN_CORRECT_V_TO_U"),

    ("jun", "jvn", "PINYIN_CORRECT_V_TO_U"),
    ("qun", "qvn", "PINYIN_CORRECT_V_TO_U"),
    ("xun", "xvn", "PINYIN_CORRECT_V_TO_U"),
    ("yun", "yvn", "PINYIN_CORRECT_V_TO_U"),

    ("juang", "jvang", "PINYIN_FUZZY_UANG_UAN | PINYIN_CORRECT_V_TO_U"),
    ("quang", "qvang", "PINYIN_FUZZY_UANG_UAN | PINYIN_CORRECT_V_TO_U"),
    ("xuang", "xvang", "PINYIN_FUZZY_UANG_UAN | PINYIN_CORRECT_V_TO_U"),
    ("yuang", "yvang", "PINYIN_FUZZY_UANG_UAN | PINYIN_CORRECT_V_TO_U"),

    ("jun", "jven", "PINYIN_CORRECT_UEN_TO_UN | PINYIN_CORRECT_V_TO_U"),
    ("qun", "qven", "PINYIN_CORRECT_UEN_TO_UN | PINYIN_CORRECT_V_TO_U"),
    ("xun", "xven", "PINYIN_CORRECT_UEN_TO_UN | PINYIN_CORRECT_V_TO_U"),
    ("yun", "yven", "PINYIN_CORRECT_UEN_TO_UN | PINYIN_CORRECT_V_TO_U"),
]

# Fuzzy initial (shengmu) pairs (s1, s2).  Each pair names the flag
# PINYIN_FUZZY_<S1>_<S2>; get_pinyin() emits s1 + yun as an extra entry when
# that spelling is not a known pinyin but s2 + yun is.  Both directions are
# listed explicitly, and the order here determines the order of the
# alternatives stored in fuzzy_shengmu_dict.
fuzzy_shengmu = [
    ("c", "ch"),
    ("ch", "c"),
    ("z", "zh"),
    ("zh", "z"),
    ("s", "sh"),
    ("sh", "s"),
    ("l", "n"),
    ("n", "l"),
    ("f", "h"),
    ("h", "f"),
    ("l", "r"),
    ("r", "l"),
    ("k", "g"),
    ("g", "k"),
]

# Fuzzy final (yunmu) pairs (y1, y2).  Each pair names the flag
# PINYIN_FUZZY_<Y1>_<Y2>; get_pinyin() emits sheng + y1 as an extra entry
# when that spelling is not a known pinyin but sheng + y2 is.  Both
# directions are listed explicitly.
fuzzy_yunmu = [
    ("an", "ang"),
    ("ang", "an"),
    ("en", "eng"),
    ("eng", "en"),
    ("in", "ing"),
    ("ing", "in"),
    ("ian", "iang"),
    ("iang", "ian"),
    ("uan", "uang"),
    ("uang", "uan"),
]

def get_sheng_yun(pinyin):
    """Split a pinyin spelling into its (shengmu, yunmu) pair.

    None maps to (None, None) and the bare spelling "ng" is treated as
    having no initial.  Otherwise the longest prefix of at most two
    characters that appears in shengmu_list is the initial and the rest is
    the final; when no prefix matches, the whole spelling is the final.
    """
    if pinyin is None:
        return None, None
    if pinyin == "ng":
        return "", "ng"
    # Try the two-character prefix first so a longer initial wins over its
    # one-character prefix.
    for width in (2, 1):
        head, tail = pinyin[:width], pinyin[width:]
        if head in shengmu_list:
            return head, tail
    return "", pinyin

# Distinct finals (yunmu) occurring across all known pinyin, sorted.
yunmu_list = sorted(set(get_sheng_yun(p)[1] for p in pinyin_list))

# Initials first, then finals.  Each name gets a 1-based id; encode_pinyin()
# returns 0 for "no initial / no final", so ids start at 1.
shengmu_yunmu_list = shengmu_list + yunmu_list
id_dict = dict(
    (name, position + 1) for position, name in enumerate(shengmu_yunmu_list)
)

# Group the fuzzy initial pairs by their first element: s1 -> [s2, ...],
# keeping the order in which the pairs appear in fuzzy_shengmu.
fuzzy_shengmu_dict = {}
for src, alt in fuzzy_shengmu:
    fuzzy_shengmu_dict.setdefault(src, []).append(alt)

# Same grouping for the fuzzy final pairs: y1 -> [y2, ...].
fuzzy_yunmu_dict = {}
for src, alt in fuzzy_yunmu:
    fuzzy_yunmu_dict.setdefault(src, []).append(alt)

def encode_pinyin(pinyin):
    """Return the numeric id of a shengmu or yunmu.

    None and the empty string map to 0.  Any other value is looked up in
    id_dict; an unknown name raises KeyError.
    """
    if pinyin is None or pinyin == "":
        return 0
    # The base-32 letter encoding that used to follow this return was
    # unreachable and has been removed.
    return id_dict[pinyin]

def get_pinyin():
    """Yield every parser-table entry as (text, sheng, yun, length, flags).

    ``text`` is the spelling to match, ``sheng``/``yun`` the initial and
    final it stands for, ``length`` the number of characters in ``text``
    and ``flags`` a list holding zero or one C flag expression.  The same
    entry may be produced more than once with different flags; union_dups()
    merges those later.
    """
    # Set copy of pinyin_list for O(1) membership tests in the loops below.
    known = set(pinyin_list)

    # 1. Complete, correctly spelled pinyin: no flags.
    for p in pinyin_list:
        s, y = get_sheng_yun(p)
        yield p, s, y, len(p), []

    # 2. A bare initial counts as an incomplete pinyin.
    for s in shengmu_list:
        yield s, s, "", len(s), ["PINYIN_INCOMPLETE_PINYIN"]

    # 3. Suffix misspellings: the wrong spelling maps to the sheng/yun of
    #    the correct pinyin it was derived from.
    for c, w in auto_correct:
        flag = "PINYIN_CORRECT_%s_TO_%s" % (w.upper(), c.upper())
        for p in pinyin_list:
            if p.endswith(c) and p != c:
                wp = p.replace(c, w)
                s, y = get_sheng_yun(p)
                yield wp, s, y, len(wp), [flag]

    # 4. Whole-syllable misspellings, emitted exactly as listed.
    for c, w, flag in auto_correct_ext:
        s, y = get_sheng_yun(c)
        yield w, s, y, len(w), [flag]

    # 5. Fuzzy initials: s1 + yun is not a pinyin but s2 + yun is.
    for s1, s2 in fuzzy_shengmu:
        flag = "PINYIN_FUZZY_%s_%s" % (s1.upper(), s2.upper())
        for y in yunmu_list:
            if s1 + y not in known and s2 + y in known:
                yield s1 + y, s1, y, len(s1) + len(y), [flag]
        # Fuzzy initial combined with a misspelled final: only when the
        # fuzzy rule and the correction are both needed to reach a pinyin.
        for c, w in auto_correct:
            if s1 + w not in known and \
               s1 + c not in known and \
               s2 + w not in known and \
               s2 + c in known:
                flag_correct = "PINYIN_CORRECT_%s_TO_%s" % (w.upper(), c.upper())
                # The text is s1 + w, so its length must be measured with
                # s1; using s2 here gave a wrong length whenever the two
                # initials differ in size (e.g. "sh" vs "s").
                yield s1 + w, s1, c, len(s1) + len(w), ["%s | %s" % (flag, flag_correct)]

    # 6. Fuzzy finals: sheng + y1 is not a pinyin but sheng + y2 is.
    for y1, y2 in fuzzy_yunmu:
        flag = "PINYIN_FUZZY_%s_%s" % (y1.upper(), y2.upper())
        for s in shengmu_list:
            if s + y1 not in known and s + y2 in known:
                yield s + y1, s, y1, len(s) + len(y1), [flag]


def get_pinyin_with_fuzzy():
    """Extend each get_pinyin() entry with fuzzy alternatives and bopomofo.

    Yields (text, bopomofo, sheng, yun, fs1, fy1, fs2, fy2, length, flags)
    where fs1/fs2 are up to two fuzzy alternative initials and fy1/fy2 up
    to two fuzzy alternative finals ("" when absent).
    """
    for text, s, y, l, flags in get_pinyin():
        fss = fuzzy_shengmu_dict.get(s, ["", ""])
        fys = fuzzy_yunmu_dict.get(y, ["", ""])

        # A lookup result holds one or two alternatives.  Unpacking a
        # one-element list raises ValueError, in which case the second slot
        # is left empty.
        try:
            fs1, fs2 = fss
        except ValueError:
            fs1, fs2 = fss[0], ""

        try:
            fy1, fy2 = fys
        except ValueError:
            fy1, fy2 = fys[0], ""

        # Drop an alternative that cannot form any known pinyin.
        # NOTE(review): every clause below also requires the alternatives
        # of the *other* kind to be non-empty, so an alternative is only
        # dropped when both of those exist -- confirm this is intended.
        # The conditions are kept exactly as they were.
        if fs1 and \
            (fs1 + y not in pinyin_list) and \
            (fy1 and fs1 + fy1 not in pinyin_list) and \
            (fy2 and fs1 + fy2 not in pinyin_list):
            fs1 = ""

        if fs2 and \
            (fs2 + y not in pinyin_list) and \
            (fy1 and fs2 + fy1 not in pinyin_list) and \
            (fy2 and fs2 + fy2 not in pinyin_list):
            fs2 = ""

        if fy1 and \
            (s + fy1 not in pinyin_list) and \
            (fs1 and fs1 + fy1 not in pinyin_list) and \
            (fs2 and fs2 + fy1 not in pinyin_list):
            fy1 = ""

        if fy2 and \
            (s + fy2 not in pinyin_list) and \
            (fs1 and fs1 + fy2 not in pinyin_list) and \
            (fs2 and fs2 + fy2 not in pinyin_list):
            fy2 = ""

        bopomofo = pinyin_bopomofo_map.get(text, "")

        if bopomofo == "":
            # An entry without flags is a normal pinyin; guarding the empty
            # list avoids an IndexError on flags[0] for such entries.
            flag_parts = flags[0].split(" | ") if flags else []
            if all(f.startswith("PINYIN_FUZZY_") for f in flag_parts):
                # Fuzzy or normal pinyin: assemble bopomofo from the parts.
                if s in sheng_yun_bopomofo_map and y in sheng_yun_bopomofo_map:
                    sheng_entry = sheng_yun_bopomofo_map[s]
                    if isinstance(sheng_entry, str):
                        bopomofo = sheng_entry
                    else:
                        # Non-string entry: (default, (partners, override));
                        # the override applies when y is among the partners.
                        if y in sheng_entry[1][0]:
                            bopomofo += sheng_entry[1][1]
                        else:
                            bopomofo += sheng_entry[0]

                    yun_entry = sheng_yun_bopomofo_map[y]
                    if isinstance(yun_entry, str):
                        bopomofo += yun_entry
                    else:
                        if s in yun_entry[1][0]:
                            bopomofo += yun_entry[1][1]
                        else:
                            bopomofo += yun_entry[0]
                else:
                    # Diagnostic for spellings with no bopomofo mapping.
                    # It goes to stdout, i.e. into the generated output.
                    print(text)

        yield text, bopomofo, s, y, fs1, fy1, fs2, fy2, l, flags


def gen_header():
    """Print the fixed preamble of the generated C++ header."""
    # Single-argument print(...) prints the same text under both the
    # Python 2 print statement and the Python 3 print function.
    print('''/* Please do not modify this file. It is generated by script */
#include "Types.h"

namespace PY {
''')

def gen_macros():
    """Print the PINYIN_ID_* #define lines for all initials and finals.

    PINYIN_ID_VOID and PINYIN_ID_ZERO come first, then one macro per entry
    of shengmu_list followed by one per entry of yunmu_list, then three
    blank lines.
    """
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print('#define PINYIN_ID_VOID    (-1)')
    print('#define PINYIN_ID_ZERO    (0)')
    for name in shengmu_list + yunmu_list:
        print('#define PINYIN_ID_%s    (%d)' % (name.upper(), encode_pinyin(name)))
    for _ in range(3):
        # print("") emits just a newline, like a bare print statement.
        print("")

def gen_option_check(name, fuzzy):
    """Print a C function that checks whether a fuzzy pair is enabled.

    name  -- name of the generated C function.
    fuzzy -- iterable of (y1, y2) pairs; each produces one ``case`` keyed
             on (PINYIN_ID_<Y1> << 16) | PINYIN_ID_<Y2> that tests the
             option bit PINYIN_FUZZY_<Y1>_<Y2>.
    """
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print('''static bool
%s (unsigned int option, int id, int fid)
{
    switch ((id << 16) | fid) {''' % name)
    for y1, y2 in fuzzy:
        flag = "PINYIN_FUZZY_%s_%s" % (y1.upper(), y2.upper())
        args = ("PINYIN_ID_%s" % y1.upper(), "PINYIN_ID_%s" % y2.upper(), flag)
        print('''    case (%s << 16) | %s:
        return (option & %s);''' % args)

    print('    default: return FALSE;')
    print('    }')
    print('}')

def union_dups(a):
    """Merge records that differ only in their trailing flag list.

    a -- iterable of tuples whose last element is a list of flag strings.

    Returns a sorted list of tuples in which the last element is the flags
    of all records sharing the same leading fields joined with " | ", or
    the string "0" when there are no flags.  The input records and their
    flag lists are left untouched.
    """
    merged = {}
    for record in a:
        key, flags = record[:-1], record[-1]
        # Accumulate into a list owned by this function.  Storing the
        # caller's list and extending it with += would mutate the input.
        merged.setdefault(key, []).extend(flags)

    result = [
        tuple(key) + (" | ".join(flags) if flags else "0",)
        for key, flags in merged.items()
    ]
    result.sort()
    return result

def gen_table():
    """Print the C ``pinyin_table`` array and return its entries.

    The entries come from get_pinyin_with_fuzzy(), merged and sorted by
    union_dups().  The returned list is in the printed order, so an entry's
    position is its index into ``pinyin_table``.
    """
    def id_macro(part):
        # An empty initial/final slot uses the reserved zero id.
        return "PINYIN_ID_%s" % part.upper() if part else "PINYIN_ID_ZERO"

    entry_fmt = '''    {  /* %d */
        text        : "%s",
        bopomofo    : L"%s",
        sheng       : "%s",
        yun         : "%s",
        pinyin_id   : {{ %s, %s }, { %s, %s }, { %s, %s }},
        len         : %d,
        flags       : %s
    },'''

    pinyins = list(get_pinyin_with_fuzzy())
    pinyins = union_dups(pinyins)

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print('static const Pinyin pinyin_table[] = {')
    for i, (text, bopomofo, s, y, fs1, fy1, fs2, fy2, l, flags) in enumerate(pinyins):
        print(entry_fmt % (
            i, text, bopomofo, s,
            # The displayed final uses "ü" where the typed spelling has "v".
            y.replace("v", "ü"),
            id_macro(s), id_macro(y),
            id_macro(fs1), id_macro(fy1),
            id_macro(fs2), id_macro(fy2),
            l, flags))

    print('};')
    print("")

    return pinyins

def gen_bopomofo_table(pinyins):
    """Print the C ``bopomofo_table`` array, ordered by bopomofo string.

    pinyins -- sequence of table entries where item 0 is the pinyin text
               and item 1 the bopomofo string; an entry's position is its
               index into ``pinyin_table``.

    Entries with an empty bopomofo string are skipped.
    """
    indexed = list(enumerate(pinyins))
    # A key function replaces the old cmp-style comparator.  The sort is
    # stable, so entries with equal bopomofo keep their relative order.
    indexed.sort(key=lambda entry: entry[1][1])
    print('static const Pinyin *bopomofo_table[] = {')
    for i, p in indexed:
        if p[1]:
            print('    %-20s %s' % ('&pinyin_table[%d],' % i, '// "%s" => "%s"' % (p[1], p[0])))
    print('};')
    print("")

def get_all_special(pinyins):
    """Yield candidate re-splits of two adjacent syllables.

    pinyins -- iterable of non-empty spellings.

    For a spelling ending in "n", "g" or "r", the last letter may instead
    start the next syllable: for every final in yunmu_list that is itself
    a known pinyin, yields (p, yun, p-without-last-letter, last-letter+yun).
    For a spelling ending in "e", yields (p, "r", p-without-"e", "er").
    """
    for p in pinyins:
        head, tail = p[:-1], p[-1]
        if tail in ("n", "g", "r"):
            for yun in yunmu_list:
                # Only finals that can stand alone as a syllable qualify.
                if yun in pinyin_list:
                    yield p, yun, head, tail + yun
        elif tail == "e":
            yield p, "r", head, "er"
def get_max_freq_2(db, p1, p2):
    """Return the highest frequency of a two-syllable phrase read p1 p2.

    db     -- database connection providing execute(); queried against the
              py_phrase_1 table.
    p1, p2 -- pinyin of the first and second syllable.

    Returns 0 when no phrase matches.
    """
    s1, y1 = get_sheng_yun(p1)
    s2, y2 = get_sheng_yun(p2)

    sql = "select max(freq), phrase from py_phrase_1 where s0 = %d and y0 = %d and s1 = %d and y1 = %d"

    c = db.execute(sql % (encode_pinyin(s1), encode_pinyin(y1), encode_pinyin(s2), encode_pinyin(y2)))
    for r in c:
        # max() over zero matching rows is NULL (None).  Normalise it to 0,
        # as get_max_freq_1() does, so callers always compare numbers.
        return r[0] if r[0] else 0
    return 0

def get_max_freq_1(db, p1):
    """Return the highest frequency of a single-syllable phrase read p1.

    db -- database connection providing execute(); queried against the
          py_phrase_0 table.
    p1 -- pinyin of the syllable.

    Returns 0 when no phrase matches or the maximum is NULL/zero.
    """
    sheng, yun = get_sheng_yun(p1)
    query = "select max(freq), phrase from py_phrase_0 where s0 = %d and y0 = %d"
    cursor = db.execute(query % (encode_pinyin(sheng), encode_pinyin(yun)))

    first_row = cursor.fetchone()
    if first_row is None:
        return 0
    return first_row[0] or 0

def compaired_special(pinyins):
    """Yield the re-splits from get_all_special() that should be preferred.

    pinyins -- iterable of spellings handed to get_all_special().

    A candidate (p1, p2, p3, p4) is yielded when p3 and p4 are both known
    pinyin and either p1 or p2 is not a known pinyin, or the phrase
    database ranks "p3 p4" above "p1 p2".  Ties on the two-syllable
    frequency are broken by the sum of the single-syllable frequencies.

    The database "open-phrase.db" is opened when iteration starts and is
    closed once the generator is exhausted or discarded.
    """
    import sqlite3
    db = sqlite3.connect("open-phrase.db")
    # db = sqlite3.connect("main.db")

    try:
        for p1, p2, p3, p4 in get_all_special(pinyins):
            # The alternative split must consist of two real syllables.
            if p3 not in pinyin_list or p4 not in pinyin_list:
                continue
            # The original split is not valid pinyin, so the alternative
            # wins without consulting the database.
            if p1 not in pinyin_list or p2 not in pinyin_list:
                yield p1, p2, p3, p4
                continue

            a1 = get_max_freq_2(db, p1, p2)
            a2 = get_max_freq_2(db, p3, p4)
            if a1 == a2:
                a1 = get_max_freq_1(db, p1) + get_max_freq_1(db, p2)
                a2 = get_max_freq_1(db, p3) + get_max_freq_1(db, p4)
            if a1 < a2:
                yield p1, p2, p3, p4
    finally:
        # Release the connection even if the consumer stops early.
        db.close()

def gen_full_pinyin_table(pinyins):
    """Print the C ``full_pinyin_table`` array of complete pinyin entries.

    pinyins -- sequence of table entries (tuples) where item 0 is the
               text, item 1 the bopomofo string and item 2 the initial; an
               entry's position is its index into ``pinyin_table``.

    Only entries whose text is in pinyin_list are listed, ordered by
    bopomofo and then by initial.
    """
    index_of = dict((entry, i) for i, entry in enumerate(pinyins))
    full_pinyin = [entry for entry in pinyins if entry[0] in pinyin_list]
    # The old comparator broke ties by comparing a[2] with b[4] (different
    # fields of the two entries), which is not a consistent ordering.
    # NOTE(review): item 2 on both sides is assumed to be the intended
    # tie-break -- confirm.
    full_pinyin.sort(key=lambda entry: (entry[1], entry[2]))
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print('static const Pinyin *full_pinyin_table[] = {')
    for p in full_pinyin:
        print("    &pinyin_table[%d],    // %s" % (index_of[p], p[0]))
    print('};')
    print("")


def gen_special_table(pinyins):
    """Print the C ``special_table`` array of preferred syllable re-splits.

    pinyins -- sequence of table entries where item 0 is the text; an
               entry's position is its index into ``pinyin_table``.

    Each row holds pointers to the four table entries of one
    (p1, p2, p3, p4) tuple from compaired_special().  A KeyError is raised
    if any of the four spellings has no table entry.
    """
    index_of = {}
    for i, entry in enumerate(pinyins):
        index_of[entry[0]] = i

    rows = sorted(compaired_special(list(index_of.keys())))
    print('static const Pinyin *special_table[][4] = {')
    for r in rows:
        ids = [("&pinyin_table[%d]," % index_of[py]).ljust(20) for py in r]

        # The print statement joined its two items with a single space.
        # That is written out here so one argument is passed to print(...),
        # which behaves identically on Python 2 and 3.
        print('    { %s %s %s %s },' % tuple(ids) + " " + "/* %s %s => %s %s */" % r)
    print('};')
    print("")


def main():
    """Write the generated tables to standard output.

    Only the pinyin, bopomofo and special tables are emitted; the other
    generators are kept below, commented out, so they can be re-enabled.
    """
    # gen_header()
    # gen_macros()
    table_entries = gen_table()
    # gen_full_pinyin_table (table_entries)
    gen_bopomofo_table(table_entries)
    gen_special_table(table_entries)
    # gen_option_check("pinyin_option_check_sheng", fuzzy_shengmu)
    # gen_option_check("pinyin_option_check_yun", fuzzy_yunmu)


# Generate the tables only when run as a script, not when imported.
if __name__ == "__main__":
    main()