diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c index f7e96acc53..306d60bd3b 100644 --- a/contrib/pg_trgm/trgm_op.c +++ b/contrib/pg_trgm/trgm_op.c @@ -456,7 +456,7 @@ iterate_word_similarity(int *trg2indexes, lastpos[trgindex] = i; } - /* Adjust lower bound if this trigram is present in required substring */ + /* Adjust upper bound if this trigram is present in required substring */ if (found[trgindex]) { int prev_lower, @@ -473,7 +473,7 @@ iterate_word_similarity(int *trg2indexes, smlr_cur = CALCSML(count, ulen1, ulen2); - /* Also try to adjust upper bound for greater similarity */ + /* Also try to adjust lower bound for greater similarity */ tmp_count = count; tmp_ulen2 = ulen2; prev_lower = lower; diff --git a/doc/src/sgml/pgtrgm.sgml b/doc/src/sgml/pgtrgm.sgml index 338ef30fbc..fb5beb9272 100644 --- a/doc/src/sgml/pgtrgm.sgml +++ b/doc/src/sgml/pgtrgm.sgml @@ -99,12 +99,8 @@ real - Returns a number that indicates how similar the first string - to the most similar word of the second string. The function searches in - the second string a most similar word not a most similar substring. The - range of the result is zero (indicating that the two strings are - completely dissimilar) to one (indicating that the first string is - identical to one of the words of the second string). + Returns the greatest similarity between the set of trigrams in the first string and + any continuous extent of an ordered set of trigrams in the second string. @@ -131,6 +127,35 @@ + + word_similarity(text, text) requires further + explanation. Consider the following example. + + +# select word_similarity('word', 'two words'); + word_similarity +----------------- + 0.8 +(1 row) + + + In the first string, the set of trigrams is + {"  w"," wo","ord","wor","rd "}. + In the second string, the ordered set of trigrams is + {"  t"," tw","two","wo ","  w"," wo","wor","ord","rds","ds "}. + The most similar extent of an ordered set of trigrams in the second string is + {"  w"," wo","wor","ord"}, and the similarity is + 0.8. 
+ + + + This function can be approximately understood as the greatest similarity between + the first string and any substring of the second string. However, this function + does not add padding to the boundaries of the extent. This is why this function + scores a full-word match higher than a match against part of a word. This + peculiarity is reflected, albeit rather ambiguously, in the function's name. + + <filename>pg_trgm</filename> Operators @@ -156,9 +181,9 @@ text <% text boolean - Returns true if its first argument has the similar word in - the second argument and they have a similarity that is greater than the - current word similarity threshold set by + Returns true if its second argument has a continuous + extent of an ordered trigram set whose similarity to the trigram set of the first + argument is greater than the current word similarity threshold set by pg_trgm.word_similarity_threshold parameter. @@ -302,8 +327,9 @@ SELECT t, word_similarity('word', t) AS sml WHERE 'word' <% t ORDER BY sml DESC, t; - This will return all values in the text column that have a word - which sufficiently similar to word, sorted from best + This will return all values in the text column that have a continuous extent + in the corresponding ordered trigram set that is sufficiently similar to the + trigram set of word, sorted from best match to worst. The index will be used to make this a fast operation even over very large data sets.