/* vim:set et ts=4 sts=4: * * libpyzy - The Chinese PinYin and Bopomofo conversion library. * * Copyright (c) 2019 Yuanle Song * Copyright (c) 2008-2010 Peng Huang * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 * USA */ #include "Database.h" #include #include #include #include "Config.h" #include "PinyinArray.h" #include "Util.h" namespace PyZy { #define DB_CACHE_SIZE "5000" #define DB_INDEX_SIZE (3) // define columns #define DB_COLUMN_USER_FREQ (0) #define DB_COLUMN_PHRASE (1) #define DB_COLUMN_FREQ (2) #define DB_COLUMN_S0 (3) #define DB_PREFETCH_LEN (6) // in seconds #define DB_BACKUP_TIMEOUT (60) #define USER_DICTIONARY_FILE "user-1.0.db" #define SQLITE3_MEMORY_DB ":memory:" std::unique_ptr Database::m_instance; class Conditions : public std::vector { public: Conditions (void) : std::vector (1) {} void double_ (void) { size_t i = size (); // To avoid a invalid referece caused by a memory reallocation, which // may occur on push_back, we call reserve here. reserve (i * 2); do { push_back (at (--i)); } while (i > 0); } void triple (void) { size_t i = size (); // To avoid a invalid referece caused by a memory reallocation, which // may occur on push_back, we call reserve here. reserve (i * 3); do { const std::string & value = at (--i); push_back (value); push_back (value); } while (i > 0); } void appendVPrintf (size_t begin, size_t end, const char *fmt, va_list args) { char str[64]; g_vsnprintf (str, sizeof(str), fmt, args); for (size_t i = begin; i < end; i++) { at (i) += str; } } void appendPrintf (size_t begin, size_t end, const char *fmt, ...) { va_list args; va_start (args, fmt); appendVPrintf (begin, end, fmt, args); va_end (args); } }; class SQLStmt { public: SQLStmt (sqlite3 *db) : m_db (db), m_stmt (NULL) { g_assert (m_db != NULL); } ~SQLStmt () { if (m_stmt != NULL) { if (sqlite3_finalize (m_stmt) != SQLITE_OK) { g_warning ("delete prepared statement failed, " "error code: %d (%s)", sqlite3_errcode (m_db), sqlite3_errmsg (m_db)); } } } /** * return true on success, false otherwise. */ bool prepare (const String &sql) { int r = sqlite3_prepare ( m_db, sql.c_str (), sql.size (), &m_stmt, NULL); if (r != SQLITE_OK) { g_warning ("create prepare statement for sql %s failed: %d (%s)", sql.c_str (), r, sqlite3_errmsg (m_db)); return false; } return true; } /** * evaluate prepared statement. * * Returns: true if another row is available. * false if no rows available or there is an error. */ bool step (void) { switch (sqlite3_step (m_stmt)) { case SQLITE_ROW: return true; case SQLITE_DONE: return false; default: g_warning ("sqlite step failed: %d (%s)", sqlite3_errcode (m_db), sqlite3_errmsg (m_db)); return false; } } const char *columnText (int col) { return (const char *) sqlite3_column_text (m_stmt, col); } int columnInt (int col) { return sqlite3_column_int (m_stmt, col); } private: sqlite3 *m_db; sqlite3_stmt *m_stmt; }; Query::Query (const PinyinArray & pinyin, size_t pinyin_begin, size_t pinyin_len, unsigned int option) : m_pinyin (pinyin), m_pinyin_begin (pinyin_begin), m_pinyin_len (pinyin_len), m_option (option) { g_assert (m_pinyin.size () >= pinyin_begin + pinyin_len); } Query::~Query (void) { } /** * query db to find phrases for pinyin. * * @phrases: put query result in this PhraseArray. * @count: query at most this many phrases. * * Returns: how many phrases fetched from db. */ int Query::fill (PhraseArray &phrases, int count) { int row = 0; while (m_pinyin_len > 0) { if (G_LIKELY (m_stmt.get () == NULL)) { m_stmt = Database::instance ().query ( m_pinyin, m_pinyin_begin, m_pinyin_len, -1, m_option); if (m_stmt == nullptr) { return 0; } } while (m_stmt->step ()) { Phrase phrase; g_strlcpy (phrase.phrase, m_stmt->columnText (DB_COLUMN_PHRASE), sizeof (phrase.phrase)); phrase.freq = m_stmt->columnInt (DB_COLUMN_FREQ); phrase.user_freq = m_stmt->columnInt (DB_COLUMN_USER_FREQ); phrase.len = m_pinyin_len; for (size_t i = 0, column = DB_COLUMN_S0; i < m_pinyin_len; i++) { phrase.pinyin_id[i].sheng = m_stmt->columnInt (column++); phrase.pinyin_id[i].yun = m_stmt->columnInt (column++); } phrases.push_back (phrase); row ++; if (G_UNLIKELY (row == count)) { return row; } } m_stmt.reset (); m_pinyin_len --; } return row; } /** * attach db to dest handler as schema_name. * * Returns: true on success, false otherwise. */ static bool attach_db (sqlite3* dest, String db_file_to_attach, const gchar* schema_name) { g_message ("attaching db %s as %s", db_file_to_attach.c_str (), schema_name); char* sql = sqlite3_mprintf ("ATTACH DATABASE %Q AS %Q;", db_file_to_attach.c_str (), schema_name); gboolean result = sqlite3_exec_simple (dest, sql); sqlite3_free (sql); return result; } /** * return TRUE if file exists */ static gboolean file_exists (const char* filename) { return g_file_test (filename, G_FILE_TEST_EXISTS); } /** * return the first existing file in given file list. * return "" if none of the file exists. */ static String first_existing_file (const std::vector &files) { for (const String &fn : files) { if (file_exists (fn.c_str ())) { return fn; } } return ""; } /** * Returns: a main db file name if one exists. * an empty string otherwise. * * This function will look for file at these path: * m_user_data_dir + "/main.db" * {PKGDATADIR}/db/local.db * {PKGDATADIR}/db/open-phrase.db * {PKGDATADIR}/db/android.db * * In ibus-pinyin context those are: * ~/.cache/ibus/pinyin/main.db * /usr/share/pyzy/db/local.db * /usr/share/pyzy/db/open-phrase.db * /usr/share/pyzy/db/android.db */ String Database::getMainDBFile (void) { std::vector files; files.push_back (m_user_data_dir + "/main.db"); files.push_back (PKGDATADIR"/db/local.db"); files.push_back (PKGDATADIR"/db/open-phrase.db"); files.push_back (PKGDATADIR"/db/android.db"); return first_existing_file (files); } /** * set pragma on "maindb" and "userdb" on given sqlite3 db handler. * * see document at * https://www.sqlite.org/pragma.html * * Returns: true on success, false otherwise. */ static bool db_set_pragma (sqlite3* db) { g_debug ("setting pragma on db"); const char* sql = "PRAGMA maindb.temp_store=MEMORY;" "PRAGMA userdb.temp_store=MEMORY;"; char* errmsg = NULL; int r = sqlite3_exec (db, sql, NULL, NULL, &errmsg); sqlite3_free (errmsg); return r == SQLITE_OK; } /** * Create user db file and init the user db with tables and indices. * * Returns: TRUE on success, FALSE otherwise. */ gboolean Database::createUserDBFile () { g_return_val_if_fail (! m_user_data_dir.empty (), FALSE); g_return_val_if_fail (! m_user_db_file.empty (), FALSE); g_debug ("ensure dir exists:%s", m_user_data_dir.c_str ()); gint r = g_mkdir_with_parents (m_user_data_dir, 0750); if (r != 0) { g_warning ("create dir %s failed: %d (%s)", m_user_data_dir.c_str (), r, g_strerror (r)); return FALSE; } sqlite3 *userdb = NULL; g_message ("creating user db at %s", m_user_db_file.c_str ()); r = sqlite3_open (m_user_db_file, &userdb); if (r != SQLITE_OK) { sqlite3_close (userdb); return FALSE; } bool result = initUserDB (userdb, "main"); sqlite3_close (userdb); return result; } /** * init m_db handler, make main db accessible at "maindb" schema, * make user db accessible at "userdb" schema. * * This also sets m_main_db_file, m_user_db_file to correct file name. * * Returns: true on success, false otherwise. */ bool Database::initDB () { g_assert_null (m_db); if (m_db) return TRUE; sqlite3* db = NULL; int r = 0; r = sqlite3_open (SQLITE3_MEMORY_DB, &db); if (r) { g_warning ("sqlite3 open %s db failed", SQLITE3_MEMORY_DB); goto fail; } m_main_db_file = getMainDBFile(); if (m_main_db_file.empty()) { g_warning ("failed to find a main db file"); goto fail; } g_message ("found main db file at %s", m_main_db_file.c_str ()); r = attach_db (db, m_main_db_file, "maindb"); if (!r) { g_warning ("attach main db file %s failed", m_main_db_file.c_str ()); goto fail; } m_user_db_file = getUserDBFile(); g_assert (! m_user_db_file.empty()); if (! file_exists (m_user_db_file.c_str ())) { r = createUserDBFile (); if (!r) { g_warning ("create user db file failed"); // use a memory db for user db. // populate tables after attach. m_user_db_file = SQLITE3_MEMORY_DB; } } else { g_message ("found user db file at %s", m_user_db_file.c_str ()); } r = attach_db (db, m_user_db_file, "userdb"); if (!r) { g_warning ("attach user db file %s failed", m_user_db_file.c_str ()); goto fail; } if (m_user_db_file == SQLITE3_MEMORY_DB) { bool r = initUserDB (db, "userdb"); if (!r) { g_warning ("init in RAM user db failed"); g_assert_not_reached (); goto fail; } } r = db_set_pragma (db); if (!r) { g_warning ("set pragma on db failed"); // can continue } g_assert_nonnull (db); m_db = db; return true; fail: sqlite3_close (db); g_warning ("initDB() failed, query won't work"); return false; } String Database::getUserDBFile (void) { return m_user_data_dir + G_DIR_SEPARATOR_S + USER_DICTIONARY_FILE; } Database::Database (const std::string &user_data_dir) : m_db (NULL) , m_user_data_dir (user_data_dir) { initDB (); // init m_db } Database::~Database (void) { int r = sqlite3_close (m_db); if (r != SQLITE_OK) { g_warning ("close sqlite database failed: %d (%s)", r, sqlite3_errmsg (m_db)); } m_db = NULL; } /** * initialize user db. * create tables, index and populate data into desc table. * * Returns: true on success, false otherwise. */ bool Database::initUserDB (sqlite3 *userdb, const char* schema) { String sql; sql = "BEGIN TRANSACTION;\n"; // create desc table sql << "CREATE TABLE IF NOT EXISTS " << schema << ".desc (name PRIMARY KEY, value TEXT);\n"; sql << "INSERT OR IGNORE INTO " << schema << ".desc VALUES ('version', '1.2.0');\n" << "INSERT OR IGNORE INTO " << schema << ".desc VALUES ('uuid', '" << UUID () << "');\n" << "INSERT OR IGNORE INTO " << schema << ".desc VALUES ('hostname', '" << Hostname () << "');\n" << "INSERT OR IGNORE INTO " << schema << ".desc VALUES ('username', '" << Env ("USERNAME") << "');\n" << "INSERT OR IGNORE INTO " << schema << ".desc VALUES ('create-time', datetime());\n" << "INSERT OR IGNORE INTO " << schema << ".desc VALUES ('attach-time', datetime());\n"; /* create phrase tables */ for (size_t i = 0; i < MAX_PHRASE_LEN; i++) { sql.appendPrintf ("CREATE TABLE IF NOT EXISTS %s.py_phrase_%d (user_freq, phrase TEXT, freq INTEGER", schema, i); for (size_t j = 0; j <= i; j++) sql.appendPrintf (", s%d INTEGER, y%d INTEGER", j, j); sql << ");\n"; } /* create index */ sql << "CREATE UNIQUE INDEX IF NOT EXISTS " << schema << ".index_0_0 ON py_phrase_0(s0,y0,phrase);\n"; sql << "CREATE UNIQUE INDEX IF NOT EXISTS " << schema << ".index_1_0 ON py_phrase_1(s0,y0,s1,y1,phrase);\n"; sql << "CREATE INDEX IF NOT EXISTS " << schema << ".index_1_1 ON py_phrase_1(s0,s1,y1);\n"; for (size_t i = 2; i < MAX_PHRASE_LEN; i++) { sql << "CREATE UNIQUE INDEX IF NOT EXISTS " << schema << ".index_" << i << "_0 ON py_phrase_" << i << "(s0,y0"; for (size_t j = 1; j <= i; j++) sql << ",s" << j << ",y" << j; sql << ",phrase);\n"; sql << "CREATE INDEX IF NOT EXISTS " << schema << ".index_" << i << "_1 ON py_phrase_" << i << "(s0,s1,s2,y2);\n"; } sql << "COMMIT;"; return sqlite3_exec_simple (userdb, sql.c_str ()); } /** * this method is called whenever user db is modified. */ void Database::modified (void) { } /** * ? * * @option: fuzzy pinyin and auto correction flags. * @id: ? * @fid: ? */ static inline bool pinyin_option_check_sheng (guint option, guint id, guint fid) { switch ((id << 16) | fid) { case (PINYIN_ID_C << 16) | PINYIN_ID_CH: return (option & PINYIN_FUZZY_C_CH); case (PINYIN_ID_CH << 16) | PINYIN_ID_C: return (option & PINYIN_FUZZY_CH_C); case (PINYIN_ID_Z << 16) | PINYIN_ID_ZH: return (option & PINYIN_FUZZY_Z_ZH); case (PINYIN_ID_ZH << 16) | PINYIN_ID_Z: return (option & PINYIN_FUZZY_ZH_Z); case (PINYIN_ID_S << 16) | PINYIN_ID_SH: return (option & PINYIN_FUZZY_S_SH); case (PINYIN_ID_SH << 16) | PINYIN_ID_S: return (option & PINYIN_FUZZY_SH_S); case (PINYIN_ID_L << 16) | PINYIN_ID_N: return (option & PINYIN_FUZZY_L_N); case (PINYIN_ID_N << 16) | PINYIN_ID_L: return (option & PINYIN_FUZZY_N_L); case (PINYIN_ID_F << 16) | PINYIN_ID_H: return (option & PINYIN_FUZZY_F_H); case (PINYIN_ID_H << 16) | PINYIN_ID_F: return (option & PINYIN_FUZZY_H_F); case (PINYIN_ID_L << 16) | PINYIN_ID_R: return (option & PINYIN_FUZZY_L_R); case (PINYIN_ID_R << 16) | PINYIN_ID_L: return (option & PINYIN_FUZZY_R_L); case (PINYIN_ID_K << 16) | PINYIN_ID_G: return (option & PINYIN_FUZZY_K_G); case (PINYIN_ID_G << 16) | PINYIN_ID_K: return (option & PINYIN_FUZZY_G_K); default: return false; } } /** * ? * * @option: fuzzy pinyin and auto correction flags. * @id: ? * @fid: ? */ static inline bool pinyin_option_check_yun (guint option, guint id, guint fid) { switch ((id << 16) | fid) { case (PINYIN_ID_AN << 16) | PINYIN_ID_ANG: return (option & PINYIN_FUZZY_AN_ANG); case (PINYIN_ID_ANG << 16) | PINYIN_ID_AN: return (option & PINYIN_FUZZY_ANG_AN); case (PINYIN_ID_EN << 16) | PINYIN_ID_ENG: return (option & PINYIN_FUZZY_EN_ENG); case (PINYIN_ID_ENG << 16) | PINYIN_ID_EN: return (option & PINYIN_FUZZY_ENG_EN); case (PINYIN_ID_IN << 16) | PINYIN_ID_ING: return (option & PINYIN_FUZZY_IN_ING); case (PINYIN_ID_ING << 16) | PINYIN_ID_IN: return (option & PINYIN_FUZZY_ING_IN); case (PINYIN_ID_IAN << 16) | PINYIN_ID_IANG: return (option & PINYIN_FUZZY_IAN_IANG); case (PINYIN_ID_IANG << 16) | PINYIN_ID_IAN: return (option & PINYIN_FUZZY_IANG_IAN); case (PINYIN_ID_UAN << 16) | PINYIN_ID_UANG: return (option & PINYIN_FUZZY_UAN_UANG); case (PINYIN_ID_UANG << 16) | PINYIN_ID_UAN: return (option & PINYIN_FUZZY_UANG_UAN); default: return false; } } /** * create a SQLStmt that is prepared to query candidates for given pinyin. * * @pinyin: the pinyin to query for * @pinyin_begin: ? * @pinyin_len: ? * @m: ? * @option: whether to enable fuzzy pinyin and auto correction when finding * matches. * * Returns: a shared_ptr on success. or a reset shared_ptr * on failure. */ SQLStmtPtr Database::query (const PinyinArray &pinyin, size_t pinyin_begin, size_t pinyin_len, int m, guint option) { if (! m_db) { g_warning ("Error: can't query db " "because db init failed"); return nullptr; } g_assert (pinyin_begin < pinyin.size ()); g_assert (pinyin_len <= pinyin.size () - pinyin_begin); g_assert (pinyin_len <= MAX_PHRASE_LEN); /* prepare sql */ Conditions conditions; for (size_t i = 0; i < pinyin_len; i++) { const Pinyin *p; bool fs1, fs2; p = pinyin[i + pinyin_begin]; fs1 = pinyin_option_check_sheng (option, p->pinyin_id[0].sheng, p->pinyin_id[1].sheng); fs2 = pinyin_option_check_sheng (option, p->pinyin_id[0].sheng, p->pinyin_id[2].sheng); if (G_LIKELY (i > 0)) conditions.appendPrintf (0, conditions.size (), " AND "); if (G_UNLIKELY (fs1 || fs2)) { if (G_LIKELY (i < DB_INDEX_SIZE)) { if (fs1 && fs2 == 0) { conditions.double_ (); conditions.appendPrintf (0, conditions.size () >> 1, "s%d=%d", i, p->pinyin_id[0].sheng); conditions.appendPrintf (conditions.size () >> 1, conditions.size (), "s%d=%d", i, p->pinyin_id[1].sheng); } else if (fs1 == 0 && fs2) { conditions.double_ (); conditions.appendPrintf (0, conditions.size () >> 1, "s%d=%d", i, p->pinyin_id[0].sheng); conditions.appendPrintf (conditions.size () >> 1, conditions.size (), "s%d=%d", i, p->pinyin_id[2].sheng); } else { size_t len = conditions.size (); conditions.triple (); conditions.appendPrintf (0, len, "s%d=%d", i, p->pinyin_id[0].sheng); conditions.appendPrintf (len, len << 1, "s%d=%d", i, p->pinyin_id[1].sheng); conditions.appendPrintf (len << 1, conditions.size (), "s%d=%d", i, p->pinyin_id[2].sheng); } } else { if (fs1 && fs2 == 0) { conditions.appendPrintf (0, conditions.size (), "s%d IN (%d,%d)", i, p->pinyin_id[0].sheng, p->pinyin_id[1].sheng); } else if (fs1 == 0 && fs2) { conditions.appendPrintf (0, conditions.size (), "s%d IN (%d,%d)", i, p->pinyin_id[0].sheng, p->pinyin_id[2].sheng); } else { conditions.appendPrintf (0, conditions.size (), "s%d IN (%d,%d,%d)", i, p->pinyin_id[0].sheng, p->pinyin_id[1].sheng, p->pinyin_id[2].sheng); } } } else { conditions.appendPrintf (0, conditions.size (), "s%d=%d", i, p->pinyin_id[0].sheng); } if (p->pinyin_id[0].yun != PINYIN_ID_ZERO) { if (pinyin_option_check_yun (option, p->pinyin_id[0].yun, p->pinyin_id[1].yun)) { if (G_LIKELY (i < DB_INDEX_SIZE)) { conditions.double_ (); conditions.appendPrintf (0, conditions.size () >> 1, " AND y%d=%d", i, p->pinyin_id[0].yun); conditions.appendPrintf (conditions.size () >> 1, conditions.size (), " and y%d=%d", i, p->pinyin_id[1].yun); } else { conditions.appendPrintf (0, conditions.size (), " AND y%d IN (%d,%d)", i, p->pinyin_id[0].yun, p->pinyin_id[1].yun); } } else { conditions.appendPrintf (0, conditions.size (), " AND y%d=%d", i, p->pinyin_id[0].yun); } } } String sql_condition; sql_condition.clear (); for (size_t i = 0; i < conditions.size (); i++) { if (G_UNLIKELY (i == 0)) sql_condition << " (" << conditions[i] << ")\n"; else sql_condition << " OR (" << conditions[i] << ")\n"; } String sql; int id = pinyin_len - 1; sql << "SELECT * FROM (" "SELECT 0 AS user_freq, * FROM maindb.py_phrase_" << id << " WHERE " << sql_condition << " UNION ALL " "SELECT * FROM userdb.py_phrase_" << id << " WHERE " << sql_condition << ") " "GROUP BY phrase ORDER BY user_freq DESC, freq DESC "; if (m > 0) sql << "LIMIT " << m; g_debug ("sql=\n%s", sql.c_str ()); /* query database */ SQLStmtPtr stmt (new SQLStmt (m_db)); if (!stmt->prepare (sql)) { stmt.reset (); } return stmt; } void Database::phraseWhereSql (const Phrase &p, String &sql) { // escape and add single quote on both sides char* phrase = sqlite3_mprintf("%Q", p.phrase); sql << " WHERE"; sql << " s0=" << p.pinyin_id[0].sheng << " AND y0=" << p.pinyin_id[0].yun; for (size_t i = 1; i < p.len; i++) { sql << " AND s" << i << '=' << p.pinyin_id[i].sheng << " AND y" << i << '=' << p.pinyin_id[i].yun; } sql << " AND phrase=" << phrase; sqlite3_free (phrase); } void Database::phraseSql (const Phrase &p, String &sql) { // escape and add single quote on both sides char* phrase = sqlite3_mprintf("%Q", p.phrase); sql << "INSERT OR IGNORE INTO userdb.py_phrase_" << p.len - 1 << " VALUES(" << 0 /* user_freq */ << "," << phrase /* phrase */ << ',' << p.freq; /* freq */ sqlite3_free (phrase); for (size_t i = 0; i < p.len; i++) { sql << ',' << p.pinyin_id[i].sheng << ',' << p.pinyin_id[i].yun; } sql << ");\n"; sql << "UPDATE userdb.py_phrase_" << p.len - 1 << " SET user_freq=MIN(user_freq+1, 65535)"; phraseWhereSql (p, sql); sql << ";\n"; } /** * insert phrases to userdb and increment user_freq field for each phrase. */ void Database::commit (const PhraseArray &phrases) { if (! m_db) { g_warning ("Error: can't commit new phrase " "because db init failed"); return; } Phrase phrase = {""}; String sql = "BEGIN TRANSACTION;\n"; for (size_t i = 0; i < phrases.size (); i++) { phrase += phrases[i]; phraseSql (phrases[i], sql); } if (phrases.size () > 1) phraseSql (phrase, sql); sql << "COMMIT;\n"; gboolean r = sqlite3_exec_simple (m_db, sql); if (r) { modified (); } else { g_warning ("insert phrases to (or update freq for) " "userdb failed"); } } /** * remove phrase from userdb. */ void Database::remove (const Phrase & phrase) { if (! m_db) { g_warning ("Error: can't remove phrase " "because db init failed"); return; } String sql; sql << "BEGIN;\n" "DELETE FROM userdb.py_phrase_" << phrase.len - 1; phraseWhereSql (phrase, sql); sql << ";\n" "COMMIT;\n"; gboolean r = sqlite3_exec_simple (m_db, sql); if (r) { modified (); } else { g_warning ("remove phrase %s from userdb failed", phrase.phrase); } } /** * create Database singleton instance at Database::m_instance. * * upstream should check the return value for error. if false is returned, * upstream app should not try to access the db. * * Returns: true on success, false otherwise. */ bool Database::init (const std::string & user_data_dir) { if (m_instance.get () == NULL) { m_instance.reset (new Database (user_data_dir)); } if (! m_instance->m_db) { // try initDB() again if previous call in Database constructor // failed. return m_instance->initDB (); } return true; } void Database::finalize (void) { m_instance.reset (NULL); } }; // namespace PyZy