From f681315d43c0d7f50ec6127708d6c490baf55be1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20van=20Eeden?= Date: Sun, 13 Dec 2015 17:13:44 +0100 Subject: [PATCH] Add parser which indexes words in Unicode NFC normalized form. When the text contains both normalized and non-normalized words (e.g. "e" followed by a combining diaeresis instead of the single precomposed character "ë"), this parser indexes the normalized form, so a normalized search key returns results for both. --- plugin/fulltext/CMakeLists.txt | 8 ++ plugin/fulltext/norm_parser/plugin_norm.c | 127 ++++++++++++++++++++++++++++++ 2 files changed, 135 insertions(+) create mode 100644 plugin/fulltext/norm_parser/plugin_norm.c diff --git a/plugin/fulltext/CMakeLists.txt b/plugin/fulltext/CMakeLists.txt index a0c88b8..294eed5 100644 --- a/plugin/fulltext/CMakeLists.txt +++ b/plugin/fulltext/CMakeLists.txt @@ -17,6 +17,14 @@ MYSQL_ADD_PLUGIN(ftexample ./example/plugin_example.c MODULE_ONLY MODULE_OUTPUT_NAME "mypluglib") +# Norm parser +MYSQL_ADD_PLUGIN(norm_parser ./norm_parser/plugin_norm.c + MODULE_ONLY MODULE_OUTPUT_NAME "norm_parser") +TARGET_LINK_LIBRARIES(norm_parser icuio) +TARGET_LINK_LIBRARIES(norm_parser icui18n) +TARGET_LINK_LIBRARIES(norm_parser icuuc) +TARGET_LINK_LIBRARIES(norm_parser icudata) + # Ngram parser INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/storage/innobase/include ${CMAKE_SOURCE_DIR}/include) diff --git a/plugin/fulltext/norm_parser/plugin_norm.c b/plugin/fulltext/norm_parser/plugin_norm.c new file mode 100644 index 0000000..60607af --- /dev/null +++ b/plugin/fulltext/norm_parser/plugin_norm.c @@ -0,0 +1,127 @@ +/* Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. 
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
+
+/*
+  NOTE(review): the names of the four system headers below were missing from
+  this copy of the patch (only bare "#include" survived).  They are restored
+  from the symbols this file uses (fprintf, my_isspace, the MYSQL_FTPARSER_*
+  API) -- confirm against the original commit.
+*/
+#include "my_config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <m_ctype.h>
+#include <mysql/plugin.h>
+#include "unicode/unorm2.h"
+#include "unicode/ustring.h"
+
+/*
+  Capacity, in UTF-16 code units, of the scratch buffers used to normalize
+  a single word.  A word that does not fit is indexed without normalization.
+*/
+#define NORM_PARSER_MAX_UNITS 1000
+
+/* Plugin load hook: there is nothing to set up.  Always returns 0. */
+static int norm_parser_plugin_init(void *arg __attribute__((unused)))
+{
+  return(0);
+}
+
+/* Plugin unload hook: there is nothing to tear down.  Always returns 0. */
+static int norm_parser_plugin_deinit(void *arg __attribute__((unused)))
+{
+  return(0);
+}
+
+/* Per-use parser initialization: no state is kept.  Always returns 0. */
+static int norm_parser_init(MYSQL_FTPARSER_PARAM *param
+                            __attribute__((unused)))
+{
+  return(0);
+}
+
+/* Per-use parser cleanup: no state is kept.  Always returns 0. */
+static int norm_parser_deinit(MYSQL_FTPARSER_PARAM *param
+                              __attribute__((unused)))
+{
+  return(0);
+}
+
+/*
+  Normalize one word to Unicode NFC and pass it to the server.
+
+  The word is converted from UTF-8 to UTF-16, normalized with ICU and
+  converted back to UTF-8.  Every step uses the length reported by the
+  previous step: the byte length of the input, the UTF-16 length and the
+  byte length of the normalized output generally differ.
+
+  If any step fails (for example the bytes are not valid UTF-8, or the word
+  does not fit the scratch buffers), the failure is reported on stderr and
+  the original bytes are indexed unchanged, so no word is dropped and no
+  uninitialized buffer is ever handed to the server.
+
+  @param param  parser context supplied by the server
+  @param word   first byte of the word inside param->doc (not NUL-terminated)
+  @param len    length of the word in bytes
+
+  @return the value returned by param->mysql_add_word()
+*/
+static int add_word(MYSQL_FTPARSER_PARAM *param, char *word, size_t len)
+{
+  UChar icu_src[NORM_PARSER_MAX_UNITS];
+  UChar icu_dst[NORM_PARSER_MAX_UNITS];
+  /* One UTF-16 code unit never needs more than 3 bytes in UTF-8. */
+  char norm_word[NORM_PARSER_MAX_UNITS * 3];
+  int32_t src_units= 0;
+  int32_t dst_units= 0;
+  int32_t norm_len= 0;
+  const UNormalizer2 *icu_unorm;
+  const char *failed_call= NULL;
+  UErrorCode err= U_ZERO_ERROR;
+  MYSQL_FTPARSER_BOOLEAN_INFO bool_info=
+    { FT_TOKEN_WORD, 0, 0, 0, 0, (word - param->doc), ' ', 0 };
+
+  u_strFromUTF8(icu_src, NORM_PARSER_MAX_UNITS, &src_units,
+                word, (int32_t) len, &err);
+  if (U_FAILURE(err))
+  {
+    failed_call= "u_strFromUTF8";
+    goto add_raw;
+  }
+
+  icu_unorm= unorm2_getNFCInstance(&err);
+  if (U_FAILURE(err))
+  {
+    failed_call= "unorm2_getNFCInstance";
+    goto add_raw;
+  }
+
+  dst_units= unorm2_normalize(icu_unorm, icu_src, src_units,
+                              icu_dst, NORM_PARSER_MAX_UNITS, &err);
+  if (U_FAILURE(err))
+  {
+    failed_call= "unorm2_normalize";
+    goto add_raw;
+  }
+
+  u_strToUTF8(norm_word, (int32_t) sizeof(norm_word), &norm_len,
+              icu_dst, dst_units, &err);
+  if (U_FAILURE(err))
+  {
+    failed_call= "u_strToUTF8";
+    goto add_raw;
+  }
+
+  return param->mysql_add_word(param, norm_word, norm_len, &bool_info);
+
+add_raw:
+  fprintf(stderr,
+          "norm_parser: %s() failed (%s); word indexed without normalization\n",
+          failed_call, u_errorName(err));
+  return param->mysql_add_word(param, word, (int) len, &bool_info);
+}
+
+/*
+  Split the document on whitespace and index every word in NFC form.
+
+  A word is a maximal run of bytes for which my_isspace() is false in the
+  document's character set.
+
+  @param param  parser context; param->doc / param->length describe the text
+
+  @return 0 on success, otherwise the first non-zero value returned by
+          add_word()
+*/
+static int norm_parser_parse(MYSQL_FTPARSER_PARAM *param)
+{
+  char *start= param->doc;
+  char *docend= param->doc + param->length;
+  char *end;
+  int ret;
+
+  /*
+    Normalized words live in add_word()'s stack buffer, which is gone once
+    add_word() returns, so the server must copy every word it is given.
+  */
+  param->flags|= MYSQL_FTFLAGS_NEED_COPY;
+
+  for (end= start; end < docend; end++)
+  {
+    if (my_isspace(param->cs, *end))
+    {
+      if (end > start)
+      {
+        ret= add_word(param, start, end - start);
+        if (ret)
+          return(ret);
+      }
+      start= end + 1;
+    }
+  }
+
+  /* Last word of the document, if the text does not end in whitespace. */
+  if (end > start)
+    return(add_word(param, start, end - start));
+
+  return(0);
+}
+
+/* Full-text parser entry points handed to the server. */
+static struct st_mysql_ftparser norm_parser_descriptor=
+{
+  MYSQL_FTPARSER_INTERFACE_VERSION,
+  norm_parser_parse,
+  norm_parser_init,
+  norm_parser_deinit
+};
+
+/* Plugin declaration: registers the parser under the name "norm_parser". */
+mysql_declare_plugin(ftnorm)
+{
+  MYSQL_FTPARSER_PLUGIN,
+  &norm_parser_descriptor,
+  "norm_parser",
+  "Daniël van Eeden",
+  "Normalized Unicode Parser",
+  PLUGIN_LICENSE_GPL,
+  norm_parser_plugin_init,
+  norm_parser_plugin_deinit,
+  0x0001,
+  NULL,
+  NULL,
+  NULL,
+  0,
+}
+mysql_declare_plugin_end;