Newer
Older
#include "zero-pinyin-service.h"
#include "parse-pinyin.h"
/**
 * Test helper: fill the builders with hard-coded candidates.
 *
 * @preedit_str: the pinyin preedit string; the visible branch compares it
 *   against "yifeng".
 * @fetch_size: not referenced by any line visible in this block.
 * @candidates_builder: receives each candidate phrase as a "s" value.
 * @matched_lengths_builder: receives, as a "u" value, the matched length
 *   paired with each candidate.
 *
 * NOTE(review): the return type, the opening brace and the conditions that
 * select the first two tables are not visible in this view; presumably each
 * table belongs to one g_str_equal() branch like the "yifeng" one - confirm
 * against the full file.
 */
get_candidates_test(const char *preedit_str,
const guint fetch_size,
GVariantBuilder *candidates_builder,
GVariantBuilder *matched_lengths_builder)
/* First table: 7 candidates with 7 matching lengths. */
const gchar *matches[] = {"李易峰", "利益", "礼仪", "离异", "里", "理", "力"};
guint matched_lengths[] = {8, 4, 4, 4, 2, 2, 2};
/* The loop bound comes from matches[], so both arrays are indexed by the
 * same i. */
for (guint i = 0; i < G_N_ELEMENTS(matches); ++i) {
g_variant_builder_add(candidates_builder, "s", matches[i]);
g_variant_builder_add(matched_lengths_builder, "u", matched_lengths[i]);
/* Second table.
 * NOTE(review): matches[] has 5 entries but matched_lengths[] has 6; the
 * extra length is never read because the loop is bounded by matches[]. */
const gchar *matches[] = {"风", "封", "疯", "丰", "凤"};
guint matched_lengths[] = {4, 4, 4, 4, 4, 4};
for (guint i = 0; i < G_N_ELEMENTS(matches); ++i) {
g_variant_builder_add(candidates_builder, "s", matches[i]);
g_variant_builder_add(matched_lengths_builder, "u", matched_lengths[i]);
} else if (g_str_equal(preedit_str, "yifeng")) {
/* Third table: 6 candidates with 6 matching lengths. */
const gchar *matches[] = {"一封", "遗风", "艺", "依", "一", "以"};
guint matched_lengths[] = {6, 6, 2, 2, 2, 2};
for (guint i = 0; i < G_N_ELEMENTS(matches); ++i) {
g_variant_builder_add(candidates_builder, "s", matches[i]);
g_variant_builder_add(matched_lengths_builder, "u", matched_lengths[i]);
}
}
}
/**
 * Build the WHERE clause used by build_sql_for_n_pinyin().
 *
 * @pylist: the pinyin list; must hold at least @n elements.
 * @n: number of Pinyin to use from @pylist.
 *
 * For the i-th Pinyin, a non-zero shengmu_i contributes "s<i>=<value> " and
 * a non-zero yunmu_i contributes "y<i>=<value> ". Every condition after the
 * first is prefixed with "AND ", and every term keeps one trailing space so
 * the caller can append the next SQL keyword directly.
 *
 * returns: the where clause; caller should g_free() the result after use.
 */
static char *
build_where_clause(GList *pylist,
                   const guint n)
{
    gboolean first_condition_done = FALSE;
    Pinyin *thispy = NULL;
    GList *iter = pylist;
    GString *s = g_string_new(NULL);

    for (guint i = 0; i < n; ++i) {
        /* the list must not run out before n entries were consumed */
        g_assert_nonnull(iter);
        thispy = (Pinyin *) iter->data;

        /* Exactly one form is appended per condition: with the "AND "
         * joiner once a condition already exists, without it otherwise. */
        if (thispy->shengmu_i) {
            g_string_append_printf(s, "%ss%u=%d ",
                                   first_condition_done ? "AND " : "",
                                   i, thispy->shengmu_i);
            first_condition_done = TRUE;
        }
        if (thispy->yunmu_i) {
            g_string_append_printf(s, "%sy%u=%d ",
                                   first_condition_done ? "AND " : "",
                                   i, thispy->yunmu_i);
            first_condition_done = TRUE;
        }

        iter = iter->next;
    }

    /* NOTE(review): the result is an empty string when every shengmu_i and
     * yunmu_i is zero; a caller that writes "WHERE " in front of it would
     * then produce invalid SQL - confirm that input cannot occur. */

    /* Hand the character data to the caller and free only the GString
     * wrapper. */
    return g_string_free(s, FALSE);
}
/**
 * Return a string like ", s0, y0, s1, y1 ".
 *
 * Caller should g_free() the result after use.
 *
 * NOTE(review): only a fragment of this definition is visible in this view;
 * the signature, the declarations of `n` and `s`, and the code that fills
 * and returns the string are not shown. Other code in this file calls
 * build_s_y_fields(n), which is presumably this function - confirm.
 */
/* At least one pinyin position is required. */
g_assert_cmpint(n, >=, 1);
/* Start from an empty GString. */
s = g_string_new(NULL);
/**
 * Build the SQL that queries candidates for the first n pinyin in pylist.
 *
 * @pylist: the pinyin list, passed on to build_where_clause().
 * @n: number of pinyin to use; can be from 1 to len(pylist). The tables
 *   queried are maindb.py_phrase_<n-1> and userdb.py_phrase_<n-1>.
 * @limit: value placed in the trailing LIMIT clause.
 *
 * The statement is an outer SELECT (MAX(user_freq), phrase, MAX(freq) plus
 * the s/y fields) over a UNION of a maindb query and a userdb query. It
 * excludes phrases listed in userdb.not_phrase, groups by phrase and orders
 * by user_freq then freq, both descending.
 *
 * Caller should free the result with g_free() after use.
 *
 * NOTE(review): the opening brace of the body and the arguments of one
 * g_string_append_printf() call (just before the userdb FROM clause) are
 * not visible in this view.
 */
static char *
build_sql_for_n_pinyin(GList *pylist,
const guint n,
const guint limit)
/* always keep one space after current term */
GString *sql = NULL;
gchar *where_clause = NULL;
/* Outer query: collapse duplicates of the same phrase coming from the two
 * halves of the UNION by taking the maximum of each frequency. */
sql = g_string_new("SELECT MAX(user_freq) AS user_freq, "
"phrase, MAX(freq) AS freq");
gchar *s_y_fields = build_s_y_fields(n);
/* NOTE(review): s_y_fields is passed as the printf FORMAT argument here and
 * in the two calls below. That is only safe while the string contains no
 * '%'; g_string_append() would avoid the question entirely. */
g_string_append_printf(sql, s_y_fields);
g_string_append_printf(sql, "FROM (");
/* First half of the UNION: maindb rows, reported with user_freq 0. */
g_string_append_printf(
sql, "SELECT 0 AS user_freq, phrase, freq");
g_string_append_printf(sql, s_y_fields);
g_string_append_printf(
sql, "FROM maindb.py_phrase_%u WHERE ", n - 1);
where_clause = build_where_clause(pylist, n);
g_assert_nonnull(where_clause);
g_debug("where_clause=%s", where_clause);
sql = g_string_append(sql, where_clause);
sql = g_string_append(sql, "UNION ");
/* Second half of the UNION: userdb rows, filtered by the same where clause.
 * NOTE(review): the next line opens a call whose arguments are missing
 * from this view. */
g_string_append_printf(
g_string_append_printf(sql, s_y_fields);
g_string_append_printf(
sql, "FROM userdb.py_phrase_%u WHERE ", n - 1);
sql = g_string_append(sql, where_clause);
/* Close the sub-select, drop phrases listed in userdb.not_phrase, then
 * group and order the merged rows. */
sql = g_string_append(
sql, ") "
"WHERE phrase NOT IN (SELECT phrase FROM userdb.not_phrase) "
"GROUP BY phrase "
"ORDER BY user_freq DESC, freq DESC ");
g_string_append_printf(sql, "LIMIT %u;", limit);
/* Keep the character data for the caller; only the GString wrapper and the
 * two temporary strings are released below. */
char *result = sql->str;
g_free(s_y_fields);
g_free(where_clause);
g_string_free(sql, FALSE);
return result;
}
/**
 * For a candidate of length group_size, calculate the matched py length.
 *
 * This is part of get_candidates_for_n_pinyin().
 *
 * @preedit_str: the pinyin preedit string; may contain ' separators.
 * @pylist: the pinyin list; must hold at least @group_size elements.
 * @group_size: how many Pinyin from the head of @pylist the candidate uses.
 *
 * returns: the number of characters of @preedit_str covered by the first
 * @group_size Pinyin, including the ' characters directly before and after
 * each of them.
 */
static guint
get_matched_py_length(const char *preedit_str,
                      GList *pylist,
                      const guint group_size)
{
    guint matched_py_length = 0;
    GList *iter = pylist;

    /* For usual pinyin string, just add up the Pinyin length. But for
     * pinyin that contains ', when a Pinyin in pylist is used, also take
     * the ' before and after it. */
    for (guint i = 0; i < group_size; ++i) {
        /* the list must not run out before group_size entries were used */
        g_assert_nonnull(iter);

        /* skip separators in front of this Pinyin */
        while (preedit_str[matched_py_length] == '\'') {
            matched_py_length++;
        }
        matched_py_length += ((Pinyin *) iter->data)->length;
        /* skip separators right behind this Pinyin; the scan stops at the
         * terminating NUL because it is not a ' character */
        while (preedit_str[matched_py_length] == '\'') {
            matched_py_length++;
        }

        iter = iter->next;
    }

    return matched_py_length;
}
/**
 * Fetch candidates for a fixed word length.
 *
 * @db: sqlite3 db handler; must not be NULL.
 * @preedit_str: the pinyin preedit str. Can contain '. This is needed to
 *   calculate matched_py_length.
 * @pylist: the pinyin list.
 * @group_size: the fixed word length; this many pinyin from the pinyin list
 *   are used. Must be between 1 and the length of @pylist.
 * @limit: fetching this many results is enough for the user; more is not a
 *   problem. The query always asks for at least DEFAULT_LIMIT rows.
 * @candidates: the result candidate list. Caller should free this after use.
 *
 * returns: how many candidates were fetched. Only rows whose phrase is
 * valid UTF-8 are counted.
 *
 * NOTE(review): several lines of this body are not visible in this view:
 * the loop around sqlite3_step(), the allocation of `c`, the declaration of
 * `result`, the inner loop over `i`, the call that produces the final `r`,
 * and the store into *candidates. The comments below describe only what
 * the visible lines do.
 */
static guint
get_candidates_for_n_pinyin(sqlite3 *db,
const char *preedit_str,
GList *pylist,
const guint group_size,
const guint limit,
GList **candidates)
{
/* Lower bound for the SQL LIMIT; see the MAX() below. */
const guint DEFAULT_LIMIT = 50;
g_assert_nonnull(db);
g_assert_cmpint(group_size, >=, 1);
g_assert_cmpint(group_size, <=, g_list_length(pylist));
/* NOTE(review): counted as gint but returned from a guint function. */
gint candidates_count = 0;
gint r = 0;
/* build SQL and run SQL query */
char *sql = NULL;
sql = build_sql_for_n_pinyin(pylist, group_size, MAX(limit, DEFAULT_LIMIT));
g_debug("build_sql_for_n_pinyin result SQL:\n\n%s\n", sql);
/* The matched length is the same for every candidate of this group size,
 * so it is computed once before the rows are read. */
guint matched_py_length = get_matched_py_length(preedit_str, pylist, group_size);
sqlite3_stmt *stmt = NULL;
const char *unused;
Candidate *c = NULL;
/* `unused` receives the tail of `sql` that the prepare call did not
 * consume. */
r = sqlite3_prepare_v2(db, sql, -1, &stmt, &unused);
g_assert_nonnull(unused);
g_assert_cmpstr(unused, ==, "");
/* NOTE(review): the assertion above already requires an empty tail, so
 * this warning can only fire when GLib assertions are compiled out. */
if (strlen(unused)) {
g_warning("part of sql is unused \"%s\" length=%zu",
unused, strlen(unused));
/* Dispatch on the step result (the sqlite3_step() call itself is not
 * visible in this view). */
if (r == SQLITE_DONE) {
break;
} else if (r == SQLITE_ROW) {
/* sql SELECT should select these columns in order */
c->user_freq = sqlite3_column_int(stmt, 0);
/* The column text is duplicated, so the candidate owns its own copy. */
c->str = g_strdup((const char *) sqlite3_column_text(stmt, 1));
c->freq = sqlite3_column_int(stmt, 2);
c->matched_py_length = matched_py_length;
/* One Pinyin pointer per position; position i reads its shengmu from
 * column 3 + 2*i and its yunmu from column 4 + 2*i. */
c->py_indices = g_malloc0(sizeof(Pinyin *) * group_size);
c->py_indices[i] = g_new0(Pinyin, 1);
c->py_indices[i]->shengmu_i = sqlite3_column_int(stmt, 3 + i * 2);
c->py_indices[i]->yunmu_i = sqlite3_column_int(stmt, 4 + i * 2);
/* we don't care about ->length field */
}
/* Keep only phrases that are valid UTF-8.
 * NOTE(review): the rejected candidate is not freed in the visible lines. */
if (g_utf8_validate(c->str, -1, NULL)) {
result = g_list_prepend(result, c);
candidates_count++;
} else {
g_warning("ignore non utf8 phrase: %s", c->str);
}
} else if (r == SQLITE_BUSY) {
/* stop reading rows without logging when the database is busy */
break;
} else {
g_warning("sqlite3_step error: %d (%s)",
r, sqlite3_errmsg(db));
break;
}
}
/* NOTE(review): presumably `r` holds the sqlite3_finalize() result here,
 * as the message suggests; that call is not visible. */
if (r != SQLITE_OK) {
g_debug("sqlite3_finalize error: %d (%s)", r, sqlite3_errmsg(db));
}
/* store query result in a new GList */
/* NOTE(review): no visible line frees `sql` or assigns *candidates. */
return candidates_count;
}
/**
 * Append one candidate to the three result builders and release the
 * strings/arrays the candidate owns.
 *
 * @c: the candidate; its str, py_indices entries and py_indices array are
 *   freed here.
 * @candidates_builder: receives c->str as a "s" value.
 * @matched_lengths_builder: receives c->matched_py_length as a "u" value.
 * @candidates_pinyin_indices: receives one "a(ii)" value holding a
 *   (shengmu_i, yunmu_i) pair per pinyin position.
 *
 * NOTE(review): the return type, the braces, the loop over `i` and the head
 * of the call that adds each "(ii)" pair are not visible in this view. The
 * visible lines also do not free `c` itself - confirm who owns it.
 */
add_candidate_to_builders(Candidate *c,
GVariantBuilder *candidates_builder,
GVariantBuilder *matched_lengths_builder,
GVariantBuilder *candidates_pinyin_indices)
g_variant_builder_add(candidates_builder, "s", c->str);
g_variant_builder_add(matched_lengths_builder, "u", c->matched_py_length);
/* Temporary builder for this candidate's list of (shengmu, yunmu) pairs. */
GVariantBuilder *py_indices_builder = NULL;
py_indices_builder = g_variant_builder_new(G_VARIANT_TYPE("a(ii)"));
/* Arguments of the per-position "(ii)" add call (call head not visible). */
py_indices_builder, "(ii)",
c->py_indices[i]->shengmu_i,
c->py_indices[i]->yunmu_i);
g_debug("adding (ii) %d %d",
c->py_indices[i]->shengmu_i,
c->py_indices[i]->yunmu_i);
/* The pair has been copied into the builder, so the entry can go. */
g_free(c->py_indices[i]);
g_debug("adding a(ii) to aa(ii)");
g_variant_builder_add(candidates_pinyin_indices, "a(ii)",
py_indices_builder);
/* Drop this function's reference to the temporary builder. */
g_variant_builder_unref(py_indices_builder);
g_free(c->str);
g_free(c->py_indices);
/**
 * Collect candidates for a preedit string into the three builders.
 *
 * @db: sqlite3 db handler.
 * @preedit_str: the pinyin preedit string; parsed with parse_pinyin().
 * @fetch_size: stop once this many candidates have been fetched.
 * @candidates_builder: receives the candidate phrases.
 * @matched_lengths_builder: receives the matched pinyin length of each
 *   candidate.
 * @candidates_pinyin_indices: receives the pinyin indices of each candidate.
 *
 * Candidates are queried for the longest group of pinyin first; the group
 * size then shrinks by one per round until enough candidates were fetched
 * or the group size reaches zero.
 *
 * NOTE(review): the return type, the braces, the NULL-db guard around the
 * warning, the declarations of `pylist` and `candidates`, and the code that
 * sets pylist_len from the parsed list are not visible in this view.
 */
get_candidates(sqlite3 *db,
const char *preedit_str,
const guint fetch_size,
GVariantBuilder *candidates_builder,
GVariantBuilder *matched_lengths_builder,
GVariantBuilder *candidates_pinyin_indices)
g_warning("No db connection, can't get candidates.");
guint pylist_len = 0;
/* Parser options: the flag names indicate incomplete pinyin, several fuzzy
 * pairs (c/ch, z/zh, s/sh, in/ing in both directions) and all corrections. */
const guint FLAGS = (PINYIN_INCOMPLETE_PINYIN |
PINYIN_FUZZY_C_CH |
PINYIN_FUZZY_CH_C |
PINYIN_FUZZY_Z_ZH |
PINYIN_FUZZY_ZH_Z |
PINYIN_FUZZY_S_SH |
PINYIN_FUZZY_SH_S |
PINYIN_FUZZY_IN_ING |
PINYIN_FUZZY_ING_IN |
PINYIN_CORRECT_ALL);
/* NOTE(review): the meaning of the literal 15 is defined by parse_pinyin(),
 * which is not part of this view. */
pylist = parse_pinyin(preedit_str, 15, FLAGS);
/* Start with all parsed pinyin and shrink the group each round. */
guint group_size = pylist_len;
guint fetched_size = 0;
guint r = 0;
while (fetched_size < fetch_size && group_size > 0) {
/* Ask only for as many candidates as are still missing. */
r = get_candidates_for_n_pinyin(db, preedit_str, pylist, group_size, fetch_size - fetched_size, &candidates);
if (candidates) {
/* Move every fetched candidate into the builders. */
GList *iter = g_list_first(candidates);
Candidate *c = NULL;
while (iter != NULL) {
c = (Candidate *) iter->data;
add_candidate_to_builders(
c, candidates_builder,
matched_lengths_builder,
candidates_pinyin_indices);
iter = iter->next;
}
/* NOTE(review): no visible line frees or resets `candidates` between
 * rounds - confirm against the full file. */
fetched_size += r;
group_size--;
}
g_info("returning %u candidates", fetched_size);
/**
 * Sub function for commit_candidate().
 *
 * Makes sure the candidate has a row in userdb.py_phrase_<len-1> (INSERT OR
 * IGNORE with user_freq 0 and freq 0), then increments user_freq of the row
 * that matches the phrase and all of its s<i>/y<i> values.
 *
 * @db: sqlite3 db handler; must not be NULL.
 * @candidate: the phrase to store; must not be NULL. It is quoted with
 *   sqlite3_mprintf("%Q").
 * @candidate_pinyin_indices: iterated as a sequence of "(ii)" pairs, one
 *   per character; must not be NULL.
 * @len: utf8 length of the candidate.
 *
 * NOTE(review): the closing braces of both while loops, the updates of
 * `count`, the conditions around the warnings and the release of the second
 * `sql` string are not visible in this view.
 * NOTE(review): `len - 1` wraps around if len is 0; the caller presumably
 * guarantees len >= 1 - confirm.
 */
static void
_update_userdb_py_phrase(sqlite3 *db,
const gchar *candidate,
GVariant *candidate_pinyin_indices,
guint len) /* utf8 length of candidate char */
{
GString *sql = NULL;
GVariantIter iter = {0};
GVariant *child = NULL;
gint x = 0;
gint y = 0;
guint count = 0;
char *s = NULL;
gboolean rb = FALSE;
g_assert_nonnull(db);
g_assert_nonnull(candidate);
g_assert_nonnull(candidate_pinyin_indices);
/* Step 1: INSERT OR IGNORE the phrase with zero frequencies. */
sql = g_string_new(NULL);
g_string_append_printf(sql, "INSERT OR IGNORE INTO userdb.py_phrase_%u (user_freq, phrase, freq", len - 1);
gchar *s_y_fields = build_s_y_fields(len);
sql = g_string_append(sql, s_y_fields);
g_free(s_y_fields);
/* %Q makes SQLite quote and escape the phrase as an SQL string literal. */
s = sqlite3_mprintf(") VALUES (0, %Q, 0", candidate);
sql = g_string_append(sql, s);
sqlite3_free(s);
/* Append one ", s, y" value pair per element of the indices array. */
g_variant_iter_init(&iter, candidate_pinyin_indices);
while ((child = g_variant_iter_next_value(&iter))) {
g_variant_get(child, "(ii)", &x, &y);
g_string_append_printf(sql, ", %d, %d", x, y);
/* NOTE(review): no visible line unrefs `child`, which
 * g_variant_iter_next_value() returns as a new reference. */
/* Length-mismatch handling; the condition guarding it is not visible. */
g_warning("candidate length=%u, a(ii) length=%u, mismatch!",
len, count);
g_string_free(sql, TRUE);
g_assert_not_reached();
g_string_append_printf(sql, ");");
rb = sqlite3_exec_simple(db, sql->str);
g_warning("INSERT candidate to userdb failed");
g_debug("candidate %s inserted to userdb", candidate);
/* increment user_freq field for candidate */
sql = g_string_new(NULL);
g_string_append_printf(sql, "UPDATE userdb.py_phrase_%u "
"SET user_freq = user_freq + 1 ", len - 1);
s = sqlite3_mprintf("WHERE phrase = %Q ", candidate);
sql = g_string_append(sql, s);
sqlite3_free(s);
/* Restrict the update to the row whose s<i>/y<i> columns match too. */
g_variant_iter_init(&iter, candidate_pinyin_indices);
while ((child = g_variant_iter_next_value(&iter))) {
g_variant_get(child, "(ii)", &x, &y);
/* NOTE(review): `count` is a guint but is formatted with %d here. */
g_string_append_printf(sql, "AND s%d=%d AND y%d=%d ",
count, x, count, y);
sql = g_string_append(sql, ";");
rb = sqlite3_exec_simple(db, sql->str);
g_warning("UPDATE candidate user_freq failed");
g_info("candidate %s user_freq incremented", candidate);
g_warning("UPDATE candidate user_freq failed, no match");
/**
 * Remove a candidate from userdb.not_phrase.
 *
 * @db: sqlite3 db handler; must not be NULL.
 * @candidate: the phrase to delete; must not be NULL. It is quoted with
 *   sqlite3_mprintf("%Q").
 *
 * NOTE(review): the return type, the braces, the declaration of `rb`, the
 * condition that selects between the warning and the debug message, and
 * the release of `sql` are not visible in this view. A string returned by
 * sqlite3_mprintf() has to be released with sqlite3_free().
 */
_update_userdb_not_phrase(sqlite3 *db,
const gchar *candidate)
g_assert_nonnull(db);
g_assert_nonnull(candidate);
char *sql = sqlite3_mprintf("DELETE FROM userdb.not_phrase WHERE phrase = %Q;", candidate);
rb = sqlite3_exec_simple(db, sql);
g_warning("DELETE candidate from not_phrase failed");
g_debug("candidate %s removed from not_phrase", candidate);
/**
 * Record a committed candidate in the user database.
 *
 * @db: sqlite3 db handler.
 * @candidate: the committed phrase.
 * @candidate_pinyin_indices: the pinyin indices of the phrase; a NULL value
 *   is reported with a warning.
 *
 * The visible lines warn about a missing db connection, a NULL candidate
 * and NULL indices, log that committing a single character is a no-op, and
 * call _update_userdb_py_phrase() followed by _update_userdb_not_phrase().
 *
 * NOTE(review): the return type, most guards, the computation of `len` and
 * the end of the function are not visible in this view, so the exact
 * conditions under which each message or call runs cannot be confirmed.
 */
commit_candidate(sqlite3 *db,
const gchar *candidate,
GVariant *candidate_pinyin_indices)
g_warning("No db connection, can't commit candidates.");
g_warning("candidate should not be NULL. won't commit candidate.");
return;
}
if (! candidate_pinyin_indices) {
g_warning("candidate_pinyin_indices should not be NULL. won't commit candidate.");
g_info("commit single character %s is a no-op", candidate);
/* Bump the phrase's user frequency, then remove it from not_phrase. */
_update_userdb_py_phrase(db, candidate, candidate_pinyin_indices, len);
_update_userdb_not_phrase(db, candidate);