Skip to content
zero-pinyin-service.c 15.7 KiB
Newer Older
#include "zero-pinyin-service.h"
#include "parse-pinyin.h"
#include "../sqlite3_util.h"
#include "pinyin-id.h"
Yuanle Song's avatar
Yuanle Song committed
get_candidates_test(const char *preedit_str,
		    const guint fetch_size,
		    GVariantBuilder *candidates_builder,
		    GVariantBuilder *matched_lengths_builder)
Yuanle Song's avatar
Yuanle Song committed
	if (g_str_equal(preedit_str, "liyifeng")) {
		const gchar *matches[] = {"李易峰", "利益", "礼仪", "离异", "里", "理", "力"};
		guint matched_lengths[] = {8, 4, 4, 4, 2, 2, 2};
Yuanle Song's avatar
Yuanle Song committed
		for (guint i = 0; i < G_N_ELEMENTS(matches); ++i) {
			g_variant_builder_add(candidates_builder, "s", matches[i]);
			g_variant_builder_add(matched_lengths_builder, "u", matched_lengths[i]);
Yuanle Song's avatar
Yuanle Song committed
	} else if (g_str_equal(preedit_str, "feng")) {
		const gchar *matches[] = {"风", "封", "疯", "丰", "凤"};
		guint matched_lengths[] = {4, 4, 4, 4, 4, 4};
Yuanle Song's avatar
Yuanle Song committed
		for (guint i = 0; i < G_N_ELEMENTS(matches); ++i) {
			g_variant_builder_add(candidates_builder, "s", matches[i]);
			g_variant_builder_add(matched_lengths_builder, "u", matched_lengths[i]);
Yuanle Song's avatar
Yuanle Song committed
	} else if (g_str_equal(preedit_str, "yifeng")) {
		const gchar *matches[] = {"一封", "遗风", "艺", "依", "一", "以"};
		guint matched_lengths[] = {6, 6, 2, 2, 2, 2};
Yuanle Song's avatar
Yuanle Song committed
		for (guint i = 0; i < G_N_ELEMENTS(matches); ++i) {
			g_variant_builder_add(candidates_builder, "s", matches[i]);
			g_variant_builder_add(matched_lengths_builder, "u", matched_lengths[i]);
/**
 * Look up the fuzzy counterpart of a pinyin id.
 *
 * The mapping is symmetric: z <-> zh, c <-> ch, s <-> sh and l <-> n.
 *
 * @pinyin_id: one of the eight ids that take part in a fuzzy pair.
 *
 * returns: the other member of the pair. Passing any other id is a
 * programming error and hits g_assert_not_reached(); if that assertion
 * is compiled out, pinyin_id is returned unchanged.
 */
gint
get_fuzzy_pair(gint pinyin_id)
{
	static const gint fuzzy_pairs[][2] = {
		{PINYIN_ID_Z, PINYIN_ID_ZH},
		{PINYIN_ID_C, PINYIN_ID_CH},
		{PINYIN_ID_S, PINYIN_ID_SH},
		{PINYIN_ID_L, PINYIN_ID_N},
	};

	for (gsize k = 0; k < G_N_ELEMENTS(fuzzy_pairs); ++k) {
		if (fuzzy_pairs[k][0] == pinyin_id) {
			return fuzzy_pairs[k][1];
		}
		if (fuzzy_pairs[k][1] == pinyin_id) {
			return fuzzy_pairs[k][0];
		}
	}
	/* callers must only ask for ids that belong to a pair */
	g_assert_not_reached();
	return pinyin_id;
}

/**
 * build where clause for build_sql_for_n_pinyin().
 *
 * @pylist: the pinyin list.
 * @fuzzy_flag: see dbus interface FuzzyFlag property.
 * @n: number of Pinyin to use in pylist.
 *
 * returns: where_clause, caller should g_free() result after use.
 */
Yuanle Song's avatar
Yuanle Song committed
static char *
build_where_clause(GList *pylist,
		   const guint fuzzy_flag,
Yuanle Song's avatar
Yuanle Song committed
		   const guint n)
Yuanle Song's avatar
Yuanle Song committed
	GString *s = NULL;
	GList *iter = pylist;
	Pinyin *thispy = NULL;
Yuanle Song's avatar
Yuanle Song committed
	s = g_string_new(NULL);
	/* allow append "AND something" without checking */
	g_string_append_printf(s, "1=1 ");
	for (guint i = 0; i < n; ++i) {
Yuanle Song's avatar
Yuanle Song committed
		g_assert_nonnull(iter);
		thispy = (Pinyin *) iter->data;
		/* do not allow omit shengmu. always do strict match */
		switch (thispy->shengmu_i) {
		case PINYIN_ID_Z:
		case PINYIN_ID_C:
		case PINYIN_ID_S:
		case PINYIN_ID_ZH:
		case PINYIN_ID_CH:
		case PINYIN_ID_SH:
			if (fuzzy_flag & FUZZY_FLAG_ZCS_ZHCHSH) {
				g_string_append_printf(
					s,
					"AND (s%u=%d OR s%u=%d) ",
					i, thispy->shengmu_i,
					i, get_fuzzy_pair(thispy->shengmu_i));
			} else {
				goto NO_FUZZY;
			}
			break;
		case PINYIN_ID_L:
		case PINYIN_ID_N:
			if (fuzzy_flag & FUZZY_FLAG_L_N) {
				g_string_append_printf(
					s,
					"AND (s%u=%d OR s%u=%d) ",
					i, thispy->shengmu_i,
					i, get_fuzzy_pair(thispy->shengmu_i));
			} else {
				goto NO_FUZZY;
			}
			break;
		default:
		NO_FUZZY:
			g_string_append_printf(s, "AND s%u=%d ", i, thispy->shengmu_i);
		/* allow omit yunmu, if 0 don't match on it */
			g_string_append_printf(s, "AND y%u=%d ", i, thispy->yunmu_i);
Yuanle Song's avatar
Yuanle Song committed
	gchar *result = s->str;
	g_string_free(s, FALSE);
/**
 * return a string like ", s0, y0, s1, y1 "
 *
 * caller should g_free() result after use.
 */
Yuanle Song's avatar
Yuanle Song committed
char *
build_s_y_fields(const guint n)
{
	GString *s = NULL;
Yuanle Song's avatar
Yuanle Song committed
	g_assert_cmpint(n, >=, 1);
	s = g_string_new(NULL);
	for (guint i = 0; i < n; ++i) {
Yuanle Song's avatar
Yuanle Song committed
		g_string_append_printf(s, ", s%u, y%u", i, i);
Yuanle Song's avatar
Yuanle Song committed
	s = g_string_append(s, " ");
	gchar *result = s->str;
Yuanle Song's avatar
Yuanle Song committed
	g_string_free(s, FALSE);
	return result;
}

/**
 * build a SQL to query candidates for first n pinyin in pylist.
 * n can be from 1 to len(pylist).
 *
 * caller should free result with g_free() after use.
 */
Yuanle Song's avatar
Yuanle Song committed
static char *
build_sql_for_n_pinyin(GList *pylist,
		       const guint fuzzy_flag,
Yuanle Song's avatar
Yuanle Song committed
		       const guint n,
		       const guint limit)
	/* always keep one space after current term */
Yuanle Song's avatar
Yuanle Song committed
	GString *sql = NULL;
	gchar *where_clause = NULL;
	sql = g_string_new("SELECT MAX(user_freq) AS user_freq, "
			   "phrase, MAX(freq) AS freq");
	gchar *s_y_fields = build_s_y_fields(n);
	g_string_append_printf(sql, s_y_fields);
	g_string_append_printf(sql, "FROM (");
	g_string_append_printf(
		sql, "SELECT 0 AS user_freq, phrase, freq");
Yuanle Song's avatar
Yuanle Song committed
	g_string_append_printf(sql, s_y_fields);
	g_string_append_printf(
		sql, "FROM maindb.py_phrase_%u WHERE ", n - 1);
	where_clause = build_where_clause(pylist, fuzzy_flag, n);
Yuanle Song's avatar
Yuanle Song committed
	g_assert_nonnull(where_clause);
	g_debug("where_clause=%s", where_clause);
	sql = g_string_append(sql, where_clause);
	sql = g_string_append(sql, "UNION ");
	g_string_append_printf(
		sql, "SELECT user_freq, phrase, freq");
Yuanle Song's avatar
Yuanle Song committed
	g_string_append_printf(sql, s_y_fields);
	g_string_append_printf(
		sql, "FROM userdb.py_phrase_%u WHERE ", n - 1);
Yuanle Song's avatar
Yuanle Song committed
	sql = g_string_append(sql, where_clause);
	sql = g_string_append(
		      sql, ") "
		      "WHERE phrase NOT IN (SELECT phrase FROM userdb.not_phrase) "
		      "GROUP BY phrase "
		      "ORDER BY user_freq DESC, freq DESC ");
	g_string_append_printf(sql, "LIMIT %u;", limit);
	char *result = sql->str;
	g_free(s_y_fields);
	g_free(where_clause);
	g_string_free(sql, FALSE);
/**
 * For a candidate of length group_size, calculate the matched py length.
 *
 * This is part of get_candidates_for_n_pinyin().
 *
 * see param meaning there.
 */
static guint
Yuanle Song's avatar
Yuanle Song committed
get_matched_py_length(const char *preedit_str,
		      GList *pylist,
		      const guint group_size)
{
	guint matched_py_length = 0;
Yuanle Song's avatar
Yuanle Song committed
	GList *iter = pylist;
Yuanle Song's avatar
Yuanle Song committed
	g_assert_cmpint(group_size, >=, 1);
	/* For usual pinyin string, just add up the Pinyin length. But for
	 * pinyin that contains ', when a Pinyin in pylist is used, also take
	 * the ' before and after it. */
	for (guint i = 0; i < group_size; ++i) {
		while (preedit_str[matched_py_length] == '\'') {
			matched_py_length++;
		}
Yuanle Song's avatar
Yuanle Song committed
		matched_py_length += ((Pinyin *) iter->data)->length;
		while (preedit_str[matched_py_length] == '\'') {
			matched_py_length++;
		}
		iter = iter->next;
	}
	return matched_py_length;
}

/**
 * fetch candidates for a fixed word length.
 *
 * @db: sqlite3 db handler.
 * @preedit_str: the pinyin preedit str. can contain '. This is needed to
 *               calculate matched_py_length.
 * @pylist: the pinyin list.
 * @fuzzy_flag: see dbus interface FuzzyFlag property.
 * @group_size: the fixed word length. use this many pinyin from pinyin list.
 * @limit: fetching this many results is enough for the user; returning more
 *         is not a problem.
 * @candidates: the result candidate list. caller should free this after use.
 *
 * returns: how many candidates fetched.
 */
static guint
Yuanle Song's avatar
Yuanle Song committed
get_candidates_for_n_pinyin(sqlite3 *db,
			    const char *preedit_str,
			    GList *pylist,
			    const guint fuzzy_flag,
Yuanle Song's avatar
Yuanle Song committed
			    const guint group_size,
			    const guint limit,
			    GList **candidates)
{
	const guint DEFAULT_LIMIT = 50;
Yuanle Song's avatar
Yuanle Song committed
	GList *result = NULL;	/* GList of Candidate */
Yuanle Song's avatar
Yuanle Song committed
	g_assert_nonnull(db);
	g_assert_cmpint(group_size, >=, 1);
	g_assert_cmpint(group_size, <=, g_list_length(pylist));

	gint candidates_count = 0;
	gint r = 0;
	/* build SQL and run SQL query */
Yuanle Song's avatar
Yuanle Song committed
	char *sql = NULL;
	sql = build_sql_for_n_pinyin(pylist, fuzzy_flag,
				     group_size, MAX(limit, DEFAULT_LIMIT));
Yuanle Song's avatar
Yuanle Song committed
	g_debug("build_sql_for_n_pinyin result SQL:\n\n%s\n", sql);
Yuanle Song's avatar
Yuanle Song committed
	guint matched_py_length = get_matched_py_length(preedit_str, pylist, group_size);
Yuanle Song's avatar
Yuanle Song committed
	sqlite3_stmt *stmt = NULL;
	const char *unused;
	Candidate *c = NULL;
	r = sqlite3_prepare_v2(db, sql, -1, &stmt, &unused);
	g_assert_nonnull(unused);
	g_assert_cmpstr(unused, ==, "");
	if (strlen(unused)) {
		g_warning("part of sql is unused \"%s\" length=%zu",
			  unused, strlen(unused));
Yuanle Song's avatar
Yuanle Song committed
	g_free(sql);
Yuanle Song's avatar
Yuanle Song committed
		r = sqlite3_step(stmt);
		if (r == SQLITE_DONE) {
			break;
		} else if (r == SQLITE_ROW) {
Yuanle Song's avatar
Yuanle Song committed
			c = g_new0(Candidate, 1);
			/* sql SELECT should select these columns in order */
Yuanle Song's avatar
Yuanle Song committed
			c->user_freq = sqlite3_column_int(stmt, 0);
			c->str = g_strdup((const char *) sqlite3_column_text(stmt, 1));
			c->freq = sqlite3_column_int(stmt, 2);
			c->matched_py_length = matched_py_length;
			c->char_len = group_size;
Yuanle Song's avatar
Yuanle Song committed
			c->py_indices = g_malloc0(sizeof(Pinyin *) * group_size);
			for (guint i = 0; i < group_size; ++i) {
Yuanle Song's avatar
Yuanle Song committed
				c->py_indices[i] = g_new0(Pinyin, 1);
				c->py_indices[i]->shengmu_i = sqlite3_column_int(stmt, 3 + i * 2);
				c->py_indices[i]->yunmu_i = sqlite3_column_int(stmt, 4 + i * 2);
				/* we don't care about ->length field */
			}
Yuanle Song's avatar
Yuanle Song committed
			if (g_utf8_validate(c->str, -1, NULL)) {
				result = g_list_prepend(result, c);
				candidates_count++;
			} else {
Yuanle Song's avatar
Yuanle Song committed
				g_warning("ignore non utf8 phrase: %s", c->str);
			}
		} else if (r == SQLITE_BUSY) {
Yuanle Song's avatar
Yuanle Song committed
			g_warning("sqlite3_step got SQLITE_BUSY");
Yuanle Song's avatar
Yuanle Song committed
			g_warning("sqlite3_step error: %d (%s)",
				  r, sqlite3_errmsg(db));
Yuanle Song's avatar
Yuanle Song committed
	r = sqlite3_finalize(stmt);
Yuanle Song's avatar
Yuanle Song committed
		g_debug("sqlite3_finalize error: %d (%s)", r, sqlite3_errmsg(db));
	}

	/* store query result in a new GList */
Yuanle Song's avatar
Yuanle Song committed
	*candidates = g_list_reverse(result);
static void
Yuanle Song's avatar
Yuanle Song committed
add_candidate_to_builders(Candidate *c,
			  GVariantBuilder *candidates_builder,
			  GVariantBuilder *matched_lengths_builder,
			  GVariantBuilder *candidates_pinyin_indices)
Yuanle Song's avatar
Yuanle Song committed
	g_variant_builder_add(candidates_builder, "s", c->str);
	g_variant_builder_add(matched_lengths_builder, "u", c->matched_py_length);
	GVariantBuilder *py_indices_builder = NULL;
Yuanle Song's avatar
Yuanle Song committed
	py_indices_builder = g_variant_builder_new(G_VARIANT_TYPE("a(ii)"));
	for (guint i = 0; i < c->char_len; ++i) {
Yuanle Song's avatar
Yuanle Song committed
		g_variant_builder_add(
			py_indices_builder, "(ii)",
			c->py_indices[i]->shengmu_i,
			c->py_indices[i]->yunmu_i);
Yuanle Song's avatar
Yuanle Song committed
		g_debug("adding (ii) %d %d",
			c->py_indices[i]->shengmu_i,
			c->py_indices[i]->yunmu_i);
		g_free(c->py_indices[i]);
Yuanle Song's avatar
Yuanle Song committed
	g_debug("adding a(ii) to aa(ii)");
	g_variant_builder_add(candidates_pinyin_indices, "a(ii)",
			      py_indices_builder);
	g_variant_builder_unref(py_indices_builder);
	g_free(c->str);
	g_free(c->py_indices);
/**
 * convert zero FuzzyFlag to libpyzy flag.
 * I don't use libpyzy flag directly because it is overly complex.
 *
 * @fuzzy_flag: see dbus interface FuzzyFlag property. Only
 *              FUZZY_FLAG_ZCS_ZHCHSH and FUZZY_FLAG_L_N are looked at.
 *
 * returns: the flag value handed to parse_pinyin().
 */
guint
to_pyzy_flag(const guint fuzzy_flag)
{
	/* ue to ve correction and incomplete pinyin support are always
	 * enabled, whatever fuzzy_flag says. */
	guint result = PINYIN_CORRECT_UE_TO_VE | PINYIN_INCOMPLETE_PINYIN;

	if (fuzzy_flag & FUZZY_FLAG_ZCS_ZHCHSH) {
		/* fuzzy matching is enabled in both directions */
		result |= PINYIN_FUZZY_Z_ZH | PINYIN_FUZZY_ZH_Z;
		result |= PINYIN_FUZZY_C_CH | PINYIN_FUZZY_CH_C;
		result |= PINYIN_FUZZY_S_SH | PINYIN_FUZZY_SH_S;
	}
	if (fuzzy_flag & FUZZY_FLAG_L_N) {
		result |= PINYIN_FUZZY_L_N | PINYIN_FUZZY_N_L;
	}
	return result;
}
Yuanle Song's avatar
Yuanle Song committed
get_candidates(sqlite3 *db,
	       const char *preedit_str,
	       const guint fetch_size,
	       const guint fuzzy_flag,
Yuanle Song's avatar
Yuanle Song committed
	       GVariantBuilder *candidates_builder,
	       GVariantBuilder *matched_lengths_builder,
	       GVariantBuilder *candidates_pinyin_indices)
Yuanle Song's avatar
Yuanle Song committed
		g_warning("No db connection, can't get candidates.");
Yuanle Song's avatar
Yuanle Song committed
	GList *pylist = NULL;
	g_debug("fuzzy_flag=%u", fuzzy_flag);
	pylist = parse_pinyin(preedit_str, 15, to_pyzy_flag(fuzzy_flag));
Yuanle Song's avatar
Yuanle Song committed
	pylist_len = g_list_length(pylist);

	guint group_size = pylist_len;
	guint fetched_size = 0;
	guint r = 0;
Yuanle Song's avatar
Yuanle Song committed
	GList *candidates = NULL;
	while (fetched_size < fetch_size && group_size > 0) {
Yuanle Song's avatar
Yuanle Song committed
		g_info("phrase length=%u", group_size);
		r = get_candidates_for_n_pinyin(db, preedit_str, pylist, fuzzy_flag, group_size, fetch_size - fetched_size, &candidates);
Yuanle Song's avatar
Yuanle Song committed
			GList *iter = g_list_first(candidates);
			Candidate *c = NULL;
Yuanle Song's avatar
Yuanle Song committed
				c = (Candidate *) iter->data;
				add_candidate_to_builders(
					c, candidates_builder,
					matched_lengths_builder,
					candidates_pinyin_indices);
Yuanle Song's avatar
Yuanle Song committed
			g_list_free_full(candidates, g_free);
Yuanle Song's avatar
Yuanle Song committed
		g_info("%u candidates found", r);
		fetched_size += r;
		group_size--;
	}
Yuanle Song's avatar
Yuanle Song committed
	g_info("returning %u candidates", fetched_size);
Yuanle Song's avatar
Yuanle Song committed
	g_list_free_full(pylist, g_free);

/**
 * sub function for commit_candidate()
 */
static void
Yuanle Song's avatar
Yuanle Song committed
_update_userdb_py_phrase(sqlite3 *db,
			 const gchar *candidate,
			 GVariant *candidate_pinyin_indices,
			 guint len) /* utf8 length of candidate char */
{
	GString *sql = NULL;
	GVariantIter iter = {0};
	GVariant *child = NULL;
	gint x = 0;
	gint y = 0;
	guint count = 0;
	char *s = NULL;
	gboolean rb = FALSE;

Yuanle Song's avatar
Yuanle Song committed
	g_assert_nonnull(db);
	g_assert_nonnull(candidate);
	g_assert_nonnull(candidate_pinyin_indices);

	/* insert candidate maybe */

Yuanle Song's avatar
Yuanle Song committed
	sql = g_string_new(NULL);
	g_string_append_printf(sql, "INSERT OR IGNORE INTO userdb.py_phrase_%u (user_freq, phrase, freq", len - 1);
	gchar *s_y_fields = build_s_y_fields(len);
	sql = g_string_append(sql, s_y_fields);
	g_free(s_y_fields);
	s = sqlite3_mprintf(") VALUES (0, %Q, 0", candidate);
	sql = g_string_append(sql, s);
	sqlite3_free(s);
	/* iter over GVariant "a(ii)" */
Yuanle Song's avatar
Yuanle Song committed
	g_variant_iter_init(&iter, candidate_pinyin_indices);
	count = 0;
Yuanle Song's avatar
Yuanle Song committed
	while ((child = g_variant_iter_next_value(&iter))) {
		g_variant_get(child, "(ii)", &x, &y);
		g_string_append_printf(sql, ", %d, %d", x, y);
		count++;
	}
	if (count != len) {
Yuanle Song's avatar
Yuanle Song committed
		g_warning("candidate length=%u, a(ii) length=%u, mismatch!",
			  len, count);
		g_string_free(sql, TRUE);
		g_assert_not_reached();
Yuanle Song's avatar
Yuanle Song committed
	g_string_append_printf(sql, ");");
	rb = sqlite3_exec_simple(db, sql->str);
	if (! rb) {
Yuanle Song's avatar
Yuanle Song committed
		g_warning("INSERT candidate to userdb failed");
	} else {
Yuanle Song's avatar
Yuanle Song committed
		if (sqlite3_changes(db) == 1) {
Yuanle Song's avatar
Yuanle Song committed
			g_debug("candidate %s inserted to userdb", candidate);
Yuanle Song's avatar
Yuanle Song committed
	g_string_free(sql, TRUE);

	/* increment user_freq field for candidate */

Yuanle Song's avatar
Yuanle Song committed
	sql = g_string_new(NULL);
	g_string_append_printf(sql, "UPDATE userdb.py_phrase_%u "
			       "SET user_freq = user_freq + 1 ", len - 1);
	s = sqlite3_mprintf("WHERE phrase = %Q ", candidate);
	sql = g_string_append(sql, s);
	sqlite3_free(s);
	g_variant_iter_init(&iter, candidate_pinyin_indices);
	count = 0;
Yuanle Song's avatar
Yuanle Song committed
	while ((child = g_variant_iter_next_value(&iter))) {
		g_variant_get(child, "(ii)", &x, &y);
		g_string_append_printf(sql, "AND s%d=%d AND y%d=%d ",
				       count, x, count, y);
		count++;
	}
Yuanle Song's avatar
Yuanle Song committed
	sql = g_string_append(sql, ";");
	rb = sqlite3_exec_simple(db, sql->str);
	if (! rb) {
Yuanle Song's avatar
Yuanle Song committed
		g_warning("UPDATE candidate user_freq failed");
	} else {
Yuanle Song's avatar
Yuanle Song committed
		if (sqlite3_changes(db) == 1) {
Yuanle Song's avatar
Yuanle Song committed
			g_info("candidate %s user_freq incremented", candidate);
		} else {
Yuanle Song's avatar
Yuanle Song committed
			g_warning("UPDATE candidate user_freq failed, no match");
Yuanle Song's avatar
Yuanle Song committed
	g_string_free(sql, TRUE);
Yuanle Song's avatar
Yuanle Song committed
_update_userdb_not_phrase(sqlite3 *db,
			  const gchar *candidate)
Yuanle Song's avatar
Yuanle Song committed
	g_assert_nonnull(db);
	g_assert_nonnull(candidate);
	gboolean rb = FALSE;
Yuanle Song's avatar
Yuanle Song committed
	char *sql = sqlite3_mprintf("DELETE FROM userdb.not_phrase WHERE phrase = %Q;", candidate);
	rb = sqlite3_exec_simple(db, sql);
	if (! rb) {
Yuanle Song's avatar
Yuanle Song committed
		g_warning("DELETE candidate from not_phrase failed");
	} else {
Yuanle Song's avatar
Yuanle Song committed
		if (sqlite3_changes(db) == 1) {
Yuanle Song's avatar
Yuanle Song committed
			g_debug("candidate %s removed from not_phrase", candidate);
Yuanle Song's avatar
Yuanle Song committed
	sqlite3_free(sql);
Yuanle Song's avatar
Yuanle Song committed
commit_candidate(sqlite3 *db,
		 const gchar *candidate,
		 GVariant *candidate_pinyin_indices)
{
	if (! db) {
Yuanle Song's avatar
Yuanle Song committed
		g_warning("No db connection, can't commit candidates.");
		return;
	}
	if (! candidate) {
Yuanle Song's avatar
Yuanle Song committed
		g_warning("candidate should not be NULL. won't commit candidate.");
		return;
	}
	if (! candidate_pinyin_indices) {
Yuanle Song's avatar
Yuanle Song committed
		g_warning("candidate_pinyin_indices should not be NULL. won't commit candidate.");
Yuanle Song's avatar
Yuanle Song committed
	guint len = g_utf8_strlen(candidate, -1);
	if (len <= 1) {
Yuanle Song's avatar
Yuanle Song committed
		g_info("commit single character %s is a no-op", candidate);
Yuanle Song's avatar
Yuanle Song committed
	_update_userdb_py_phrase(db, candidate, candidate_pinyin_indices, len);
	_update_userdb_not_phrase(db, candidate);