Skip to content
pyutil.py 4.82 KiB
Newer Older
# -*- coding: utf-8 -*-
# vim:set et sts=4 sw=4:
#
# libpyzy - The Chinese PinYin and Bopomofo conversion library.
#
# Copyright (c) 2007-2008 Peng Huang <shawn.p.huang@gmail.com>
#
Hiroshi Sumita's avatar
Hiroshi Sumita committed
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
Hiroshi Sumita's avatar
Hiroshi Sumita committed
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
Hiroshi Sumita's avatar
Hiroshi Sumita committed
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301

from pydict import *

class PinYinWord:
    correct_dict = {"nve" : "nue", "lve" : "lue"}
    def __init__ (self, pinyin):
        if pinyin in self.correct_dict:
            pinyin = self.correct_dict [pinyin]

        self._pinyin = pinyin
        self._is_completed = self.is_valid_pinyin ()
        if self._is_completed:
            sheng_mu, yun_mu = self.split ()
            self._pinyin_id = PINYIN_DICT [self._pinyin]
            self._sheng_mu_id = SHENGMU_DICT [sheng_mu]
        else:
            self._sheng_mu_id = SHENGMU_DICT [self._pinyin]

    def is_valid_pinyin (self):
        return PINYIN_DICT.has_key (self._pinyin)

    def get_sheng_mu_id (self):
        return self._sheng_mu_id

    def get_shengmu (self):
        return ID_SHENGMU_DICT[self._sheng_mu_id]

    def get_pinyin_id (self):
        return self._pinyin_id

    def get_pinyin (self):
        return self._pinyin

    def get_pattern (self, mohu = False):
        if mohu == False:
            if self.is_valid_pinyin ():
                return self._pinyin
            else:
                return self._pinyin + "%"
        else:
            if not self.is_valid_pinyin ():
                if self._pinyin in ("zh", "ch", "sh"):
                    return self._pinyin[0] + "%"
                return self._pinyin + "%"
            else:
                shengmu = self.get_shengmu ()
                yunmu = self._pinyin [len (shengmu):]
                if shengmu in ("zh", "ch", "sh", "z", "c", "s"):
                    shengmu = shengmu[0] + "%"
                if yunmu in ("ing", "in", "en", "eng", "an", "ang"):
                    yunmu = yunmu[0:2] + "%"
                return shengmu + yunmu

    def split (self):
        if not self.is_valid_pinyin ():
            raise Exception ("Pinyin '%s' is not a valid pinyin!" % py)
        if self._pinyin[:2] in SHENGMU_DICT.keys ():
            return self._pinyin[:2], self._pinyin[2:]
        elif self._pinyin[:1] in SHENGMU_DICT.keys ():
            return self._pinyin[:1], self._pinyin[1:]
        else:
            return "", self._pinyin[:]

    def __str__ (self):
        return self._pinyin

class PinYinString:
    def __init__ (self, string):
        pass

def load_pinyin_table (_file):

    def pinyin_table_parser (f):
        for l in f:
            a = unicode (l, "utf-8").strip ().split ()
            hanzi, pinyin, freq = a
            yield (hanzi, pinyin, int (freq))
    # db.add_phrases (pinyin_table_parser (bzf))

    hanzi_dic = {}
    for hanzi, pinyin, freq in pinyin_table_parser (_file):
        if not hanzi_dic.has_key (hanzi):
            hanzi_dic[hanzi] = {}

        if hanzi_dic[hanzi].has_key (pinyin):
            hanzi_dic[hanzi][pinyin] += freq
        else:
            hanzi_dic[hanzi][pinyin] = freq

    return hanzi_dic

def load_phrase_pinyin_freq (_file):
    def phrase_pinyin_parser (f):
        for l in f:
            phrase, pinyin, freq = unicode (l, "utf-8").strip ().split ()
            pinyin = pinyin.replace (u"u:", u"v")
            yield (phrase, pinyin, int (freq))
    phrases_dic = {}
    for phrase, pinyin, freq in phrase_pinyin_parser (_file):
        if not phrases_dic.has_key (phrase):
            phrases_dic[phrase] = []
        phrases_dic[phrase].append ((phrase, pinyin, freq))

    return phrases_dic

def load_phrase_pinyin (_file):
    def phrase_pinyin_parser (f):
        for l in f:
            phrase, pinyin = unicode (l, "utf-8").strip ().split ()
            pinyin = pinyin.replace (u"u:", u"v")
            yield (phrase, pinyin, 0)
    phrases_dic = {}
    for phrase, pinyin, freq in phrase_pinyin_parser (_file):
        if not phrases_dic.has_key (phrase):
            phrases_dic[phrase] = []
        phrases_dic[phrase].append ((phrase, pinyin, freq))

    return phrases_dic

def load_sogou_phrases (_file):
    import re
    dic = {}
    for l in _file:
        w = unicode (l, "utf8")
        w = re.split (ur"\t+", w)
        dic [w[0]] = (w[0], int (w[1]))
    return dic