...

вторник, 17 декабря 2013 г.

Нечеткий динамический текстовый поиск? Не так уж и страшно

#include

#include

#include

#include <libutil.h>


#ifdef WIN32

#include <crtdbg.h>

#endif


#include «sqlnode.h»

#include «sqlbif.h»

#include «wi.h»

#include «Dk.h»


#include <math.h>

#include «caseutils.h»

#include «list_sort.h»

#include <assert.h>


//#include <ksrvext.h>


static id_hash_t * ht_dict_ = NULL;

static dk_hash_t * ht_dict_by_id_ = NULL;

static id_hash_t * ht_triples_ = NULL;

static dk_mutex_t * dict_mtx_ = NULL;


struct dict_item_s {

char *word_;

size_t id_;

size_t count_;

size_t attr_;

};

typedef struct dict_item_s dict_item_t;


struct triple_item_s {

size_t wordid_;

struct triple_item_s *next_;

};

typedef struct triple_item_s triple_item_t;


struct triple_head_s {

lenmem_t lm_;

wchar_t data_[4];

triple_item_t *list_;

};

typedef struct triple_head_s triple_head_t;


const wchar_t seps_[] = L" ,.\t\r\n'\"=*!%^:;~`<>+|?";

const wchar_t glues_[] = L"()&@#$:{}/\\-[]_";


size_t next_wordid (caddr_t * qst)

{

size_t _id = 0;

query_instance_t *q = (query_instance_t *)qst;

client_connection_t *cli = q->qi_client;

query_t *stmt = NULL;

local_cursor_t *lc = NULL;

caddr_t lerr = NULL;

caddr_t * err = &lerr;

char buf[1024];

sprintf (buf, «select sequence_next ('MRC_WORD_ID')»);

if (NULL == (stmt = sql_compile (buf, cli, err, 0)))

goto end;


if (NULL != (*err =

qr_rec_exec (stmt, cli, &lc, (query_instance_t *) qst, NULL,

0)))

goto end;


end:

if (lc)

{

lc_next (lc);

_id = (size_t)unbox (lc_nth_col (lc, 0));

}

if (lc)

{

lc_free (lc);

lc = NULL;

}

if (stmt)

{

qr_free (stmt);

stmt = NULL;

}

return _id;

}


size_t store_triple (caddr_t * qst, size_t wordid, const wchar_t *triple)

{

query_instance_t *q = (query_instance_t *)qst;

client_connection_t *cli = q->qi_client;

query_t *stmt = NULL;

local_cursor_t *lc = NULL;

caddr_t lerr = NULL;

caddr_t * err = &lerr;

char buf[1024];

wchar_t wlt[4];

wcsncpy(wlt, triple, 3);

wlt[3] = 0;


sprintf (buf, "--utf8_execs=yes\n insert into MRC_TRIPLES (TR_DATA, TR_WORDID) values(?,?)");

if (NULL == (stmt = sql_compile (buf, cli, err, 0)))

goto end;


if (NULL != (*err =

qr_rec_exec (stmt, cli, &lc, (query_instance_t *) qst, NULL, 2,

":0", box_wide_string (wlt), QRP_RAW,

":1", box_num(wordid), QRP_RAW

)))

goto end;

{

char**place = NULL;


triple_item_t *pitem = (triple_item_t *)dk_alloc_box_zero(sizeof(triple_item_t), DV_BIN);

triple_head_t *phead = (triple_head_t *)dk_alloc_box_zero(sizeof(triple_head_t), DV_BIN);


phead->lm_.lm_length = sizeof(phead->data_);

phead->lm_.lm_memblock = (char*)phead->data_;

memcpy(phead->data_, wlt, sizeof(phead->data_));


pitem->wordid_ = wordid;


place = (char **) id_hash_get (ht_triples_, (caddr_t) &phead->lm_);

if (place)

{

triple_head_t *ohead = *(triple_head_t**)place;

pitem->next_ = ohead->list_;

ohead->list_ = pitem;

}

else

{

id_hash_set (ht_triples_, (caddr_t)(&phead->lm_), (caddr_t)&phead);

pitem->next_ = pitem;

}

}


end:

if (lc)

{

lc_free (lc);

lc = NULL;

}

if (stmt)

{

qr_free (stmt);

stmt = NULL;

}

return 0;

}


wchar_t **

nv_split (wchar_t *tmp)

{

wchar_t **arr = NULL;

size_t len = wcslen (tmp);

size_t ix = 0;

size_t i;

size_t cnt = 0;

for (i = 0; i < len; i++)

{

if (NULL != wcschr (seps_, tmp[i]))

{

tmp[ix++] = 0;

cnt++;

}

else

{

if (NULL == wcschr (glues_, tmp[i]))

tmp[ix++] = mrc_toupper (tmp[i]);

}

}

tmp[ix] = 0;

cnt = 0;

for (i = 0; i < len; i++)

{

if (tmp[i])

{

cnt++;

while (i < len && 0 != tmp[++i]);

}

}

if (cnt)

{

/* And allocate a vector of once or twice of that many elements. */

arr = dk_alloc_box ((cnt * sizeof (caddr_t)), DV_ARRAY_OF_POINTER);


ix = 0;

for (i = 0; i < len; i++)

{

if (0 != tmp[i])

{

int loclen = wcslen(tmp+i);

((caddr_t *) arr)[ix] = ((char *) dk_alloc_box_zero ((loclen + 1)*sizeof(wchar_t), DV_LONG_WIDE));

memcpy (((caddr_t *) arr)[ix++], tmp + i, (loclen + 1)*sizeof(wchar_t));

while (i < len && 0 != tmp[++i]);

}

}

}

return arr;

}


caddr_t

bif_nv_split (caddr_t * qst, caddr_t * err_ret, state_slot_t ** args)

{

char *me = «nv_split»;

caddr_t arr = NULL;

caddr_t arg = bif_arg_unrdf (qst, args, 0, me);

dtp_t dtp = DV_TYPE_OF (arg);

if (DV_DB_NULL == dtp || NULL == arg)

{

return (NULL);

}

if (IS_STRING_DTP(dtp))

{

wchar_t *wide = box_utf8_as_wide_char (arg, NULL, strlen(arg), 0, DV_WIDE);

arr = (caddr_t)nv_split (wide);

dk_free_box(wide);

return arr;

}

if (IS_WIDE_STRING_DTP (dtp))

{

wchar_t *tmp = wcsdup ((const wchar_t *)arg);

arr = (caddr_t)nv_split (tmp);

free(tmp);

return arr;

}


{

sqlr_new_error («22023», «SR007»,

«Function %s needs a nvstring or NULL as argument, „

“not an arg of type %s (%d)»,

me, 1, dv_type_title (dtp), dtp);

}

return arr;

}


caddr_t

bif_treat_nword (caddr_t * qst, caddr_t * err_ret, state_slot_t ** args)

{

char *me = «treat_nword»;

caddr_t arg = bif_arg_unrdf (qst, args, 0, me);

int len;

const wchar_t *wide = (const wchar_t *)arg;

const wchar_t *newwide = NULL;

wchar_t *tmpbuf;

size_t wordid = 0;

dtp_t dtp = DV_TYPE_OF (arg);

if (DV_DB_NULL == dtp)

{

return (NULL);

}

if (!IS_WIDE_STRING_DTP (dtp))

{

sqlr_new_error («22023», «SR007»,

«Function %s needs a nvstring or NULL as argument, „

“not an arg of type %s (%d)»,

me, 1, dv_type_title (dtp), dtp);

}

len = wcslen(wide);

tmpbuf = (wchar_t *)_alloca (sizeof (wchar_t) * (len + 3));

tmpbuf[0] = L' ';

wcscpy(tmpbuf + 1, wide);

tmpbuf[len+1] = L' ';

tmpbuf[len+2] = 0;

newwide = tmpbuf;


mutex_enter (dict_mtx_);

{

char*utf8 = box_wide_as_utf8_char ((const char*)wide, len, DV_LONG_STRING);

char**place = NULL;


place = (char **) id_hash_get (ht_dict_, (caddr_t) &utf8);

if (place)

{

dict_item_t *pitem = *(dict_item_t **)place;

pitem->count_++;

dk_free_box(utf8);

wordid = pitem->id_;

}

else

{

query_instance_t *q = (query_instance_t *)qst;

client_connection_t *cli = q->qi_client;

query_t *stmt = NULL;

local_cursor_t *lc = NULL;

caddr_t lerr = NULL;

caddr_t * err = &lerr;

char buf[1024];


dict_item_t *pitem = dk_alloc_box_zero (sizeof(dict_item_t), DV_BIN);

pitem->word_ = utf8;

pitem->count_ = 1;

wordid = next_wordid(qst);

pitem->id_ = wordid;

id_hash_set (ht_dict_, (caddr_t) &utf8, (caddr_t) &pitem);

sethash ((void *)wordid, ht_dict_by_id_, (void*)pitem);


sprintf (buf, "--utf8_execs=yes\n insert into MRC_WORDS(WD_ITSELF, WD_ID) values (?, ?)");

//cast(charset_recode ('%s', 'UTF-8', '_WIDE_') as nvarchar))", wordid, utf8);

if (NULL != (stmt = sql_compile (buf, cli, err, 0)))

{

*err =

qr_rec_exec (stmt, cli, &lc, (query_instance_t *) qst, NULL, 2,

":0", box_wide_string (newwide), QRP_RAW,

":1", box_num(wordid), QRP_RAW

);

//*err = qr_rec_exec (stmt, cli, &lc, (query_instance_t *) qst, NULL, 0);

}

if (lc)

lc_free (lc);

if (stmt)

qr_free (stmt);


{

int len = wcslen(newwide);

int i;

for (i = 0; i < len-2; i++)

{

store_triple (qst, wordid, newwide + i);

}

}

}

}

mutex_leave (dict_mtx_);

return box_num(wordid);

}


int64

box2long (caddr_t arg)

{

dtp_t dtp = DV_TYPE_OF (arg);

if (dtp == DV_SHORT_INT || dtp == DV_LONG_INT)

return (int64)(unbox (arg));

else if (dtp == DV_SINGLE_FLOAT)

return (int64)(unbox_float (arg));

else if (dtp == DV_DOUBLE_FLOAT)

return (int64)(unbox_double (arg));

else if (dtp == DV_NUMERIC)

{

int64 dt;

numeric_to_int64 ((numeric_t) arg, &dt);

return dt;

}

else if (dtp == DV_DB_NULL)

return (int64)(0);

assert (0);

return 0;

}


void flush_dict()

{

char **key = NULL;

char **val = NULL;

id_hash_iterator_t hit;

id_hash_iterator (&hit, ht_dict_);

while (hit_next (&hit, (caddr_t*) &key, (caddr_t*) &val))

{

dk_free_box(*key);

dk_free_box(*val);

}

id_hash_clear(ht_dict_);

}


void flush_triples()

{

char **key = NULL;

char **val = NULL;

id_hash_iterator_t hit;

id_hash_iterator (&hit, ht_triples_);

while (hit_next (&hit, (caddr_t*) &key, (caddr_t*) &val))

{

triple_head_t *phead = *(triple_head_t**)val;

triple_item_t *pit = phead->list_;


while (pit)

{

triple_item_t *tmp = pit->next_;

dk_free_box (pit);

pit = tmp;

}

dk_free_box(*val);

}

id_hash_clear(ht_triples_);

}


size_t reload_triples (query_instance_t *qst)

{

client_connection_t *cli = qst->qi_client;

query_t *stmt = NULL;

local_cursor_t *lc = NULL;

caddr_t lerr = NULL;

caddr_t * err = &lerr;

char buf[1024];


flush_triples ();


sprintf (buf, «select TR_DATA, TR_WORDID from MRC_TRIPLES»);

if (NULL != (stmt = sql_compile (buf, cli, err, 0)))

{

*err = qr_rec_exec (stmt, cli, &lc, (query_instance_t *) qst, NULL, 0);

if (lc)

{

int64 id = 0;

caddr_t tmp = 0;

int64 cnt = 0;

char*utf8 = NULL;

char**place = NULL;

lenmem_t lm;

triple_head_t *phead = NULL;

triple_head_t *ohead = NULL;

triple_item_t *pitem = NULL;


while (lc_next (lc))

{

if (lc->lc_error)

{

*err = box_copy_tree (lc->lc_error);

break;

}

id = box2long (lc_nth_col (lc, 1));

tmp = lc_nth_col (lc, 0);


pitem = (triple_item_t *)dk_alloc_box_zero(sizeof(triple_item_t), DV_BIN);

pitem->wordid_ = (size_t)id;


lm.lm_length = sizeof (phead->data_);

lm.lm_memblock = (caddr_t)tmp;


place = (char **) id_hash_get (ht_triples_, (caddr_t) &lm);

if (place)

{

ohead = *(triple_head_t **)place;

pitem->next_ = ohead->list_;

ohead->list_ = pitem;

}

else

{

phead = (triple_head_t *)dk_alloc_box_zero(sizeof(triple_head_t), DV_BIN);

phead->list_ = pitem;

phead->lm_.lm_length = sizeof (phead->data_);

phead->lm_.lm_memblock = (caddr_t)phead->data_;

memcpy(phead->data_, tmp, sizeof (phead->data_));


pitem->next_ = NULL;

id_hash_set (ht_triples_, (caddr_t) &phead->lm_, (caddr_t) &phead);

}

}

}

}

if (lc)

lc_free (lc);

if (stmt)

qr_free (stmt);


return 0;

}


caddr_t

bif_reload_dict (caddr_t * qst, caddr_t * err_ret, state_slot_t ** args)

{

query_instance_t *q = (query_instance_t *)qst;

client_connection_t *cli = q->qi_client;

query_t *stmt = NULL;

local_cursor_t *lc = NULL;

caddr_t lerr = NULL;

caddr_t * err = &lerr;

char buf[1024];


mutex_enter (dict_mtx_);

flush_dict();


sprintf (buf, «select WD_ID, WD_ITSELF, WD_COUNT from MRC_WORDS»);

if (NULL != (stmt = sql_compile (buf, cli, err, 0)))

{

*err = qr_rec_exec (stmt, cli, &lc, (query_instance_t *) qst, NULL, 0);

if (lc)

{

int64 id = 0;

caddr_t tmp = 0;

int64 cnt = 0;

char*utf8 = NULL;

char**place = NULL;

size_t maxid = 0;


while (lc_next (lc))

{

if (lc->lc_error)

{

*err = box_copy_tree (lc->lc_error);

break;

}

id = box2long (lc_nth_col (lc, 0));

tmp = lc_nth_col (lc, 1);

cnt = box2long (lc_nth_col (lc, 2));

utf8 = box_wide_as_utf8_char (tmp, box_length (tmp) / sizeof (wchar_t) — 1, DV_LONG_STRING);


place = (char **) id_hash_get (ht_dict_, (caddr_t) &utf8);

if (place)

{

assert(0);

}

else

{

dict_item_t *pitem = dk_alloc_box_zero (sizeof(dict_item_t), DV_BIN);

pitem->word_ = utf8;

pitem->count_ = 1;

pitem->id_ = (size_t)id;

if (maxid < id)

maxid = id;

id_hash_set (ht_dict_, (caddr_t) &utf8, (caddr_t) &pitem);

sethash ((void *)id, ht_dict_by_id_, (void*)pitem);

}

}

}

}

if (lc)

lc_free (lc);

if (stmt)

qr_free (stmt);


reload_triples(q);


mutex_leave (dict_mtx_);

return 0;

}


//---------------------------------------------------------------------------

// l_dist_raw()

// static/local function!!!

//

// Purpose: Calculates the L Distance for the two strings (words).

//

// Inputs: char *str1, *str2 — input strings (words) to compair

// int len1,len2 — the shorter of the length of str1 amd str2

// respectively or MAX_LDIST_LEN.

// NOTE! No error checking is done.

// Array overflow on the stack will result

// if either is out of range.

// Outputs: none

//

// Returns: L Distance value is returned

//

// Note, there are two defines immediately after this comment header that

// are only used by this function.

//

// (values in all CAPS are defined in the LDIST.H header file)

//

//---------------------------------------------------------------------------

#define MAX_LDIST_LEN 40 // max word len to compair

#define MIN3(a,b,c) (a < b? \

(a < c? a: c): \

(b < c? b: c))


int

l_dist_raw(const wchar_t *str1, const wchar_t *str2, int len1, int len2)

{

int arr1[MAX_LDIST_LEN+1];

int arr2[MAX_LDIST_LEN+1];

int i, j;

if (len1 > MAX_LDIST_LEN)

len1 = MAX_LDIST_LEN;

if (len2 > MAX_LDIST_LEN)

len2 = MAX_LDIST_LEN;

for (i = 0; i <= len2; i++)

arr1[i] = i;


for (i = 1; i <= len1; i++)

{

arr2[0] = i;

for (j = 1; j <= len2; j++)

{

int score = (str1[i-1] == str2[j-1])?0:1;

int i1 = arr2[j-1]+1;

int i2 = arr1[j]+1;

int i3 = arr1[j-1] + score;

arr2[j] = MIN3 (i1, i2, i3);//arr2[j-1]+1, arr1[j]+1, arr1[j] + score);

//d[(j-1)*n+i]+1, d[j*n+i-1]+1, d[(j-1)*n+i-1]+cost);

}

memcpy (arr1, arr2, sizeof (int)*(len2+1));

}

return arr2[len2];

}


struct ipair_s {

ptrlong id_;

ptrlong len_;

ptrlong pos_;

ptrlong score_;

};

typedef struct ipair_s ipair_t;


int

cmp_pairs (const void *a,const void *b)

{

const ipair_t *pa = *(const ipair_t **)a;

const ipair_t *pb = *(const ipair_t **)b;

if (pb->id_ == pa->id_)

return pa->score_ — pb->score_;

return pa->id_ — pb->id_;

}


int compare_by_id(const void *a, const void *b, const void *arg)

{

ipair_t *pa = (ipair_t*)a;

ipair_t *pb = (ipair_t*)b;

return pa->id_ — pb->id_;

}


int compare_by_score(const void *a, const void *b, const void *arg)

{

ipair_t *pa = (ipair_t*)a;

ipair_t *pb = (ipair_t*)b;

return pb->score_ — pa->score_;

}


dk_set_t

load_oid_list (ipair_t **words, query_instance_t *q, mem_pool_t * mp)

{

client_connection_t *cli = q->qi_client;

/*static*/ query_t *stmt = NULL;

local_cursor_t *lc = NULL;

caddr_t lerr = NULL;

caddr_t * err = &lerr;

dk_set_t out_list = NULL;

char buf[1024];

dk_set_t pairs_list = NULL;

ipair_t *item = NULL;

size_t i = 0;

size_t len = box_length(words)/sizeof(ipair_t*);

size_t cnt = 0;


if (NULL == stmt)

{

sprintf (buf, «select DT_OID, DT_POSITION from MRC_DATA where DT_WORDID =? „);

if (NULL == (stmt = sql_compile_static (buf, /*bootstrap_*/cli, err, 0)))

return NULL;

}

for (i = 0; i< len; i++)

{

size_t id;

ipair_t *pair = words[i];

//printf(“\n---%d -----\n»,pair->id_);

*err = qr_rec_exec (stmt, cli, &lc, (query_instance_t *) q, NULL, 1,

":0", box_num (pair->id_), QRP_RAW);

if (NULL == lc)

continue;


while (lc_next (lc))

{

if (lc->lc_error)

{

*err = box_copy_tree (lc->lc_error);

break;

}

id = box2long (lc_nth_col (lc, 0));

item = (ipair_t*)mp_alloc_box(mp, sizeof(ipair_t), DV_ARRAY_OF_LONG);

item->id_ = id;

item->len_ = pair->len_;

item->pos_ = box2long (lc_nth_col (lc, 1));

item->score_ = pair->score_;

mp_set_push (mp, &pairs_list, item);

cnt++;

// printf("%d ", id);

}

if (lc)

lc_free (lc);

}

if (stmt)

qr_free (stmt);


//if (stmt)

// qr_free (stmt);

//printf("+%d+", cnt);

return list_sort (pairs_list, compare_by_id, NULL);

}


ipair_t **

get_word_candidates (wchar_t *arg)

{

ipair_t **res = NULL;

dk_set_t ids_list = NULL;

caddr_t arr = NULL;


{

dk_hash_t *ht_ids = NULL;

int maxcount = 1;

size_t i;


wchar_t *word = (wchar_t*)arg;

wchar_t *pbuf = NULL;

size_t isnum = ((*word) >= L'0' && (*word) <= L'9');

size_t len = wcslen (word);

//int slen = len;

if (len < 3 && !isnum)

{

return NULL;

}

word = (wchar_t*)box_copy (word);

mrc_toupper_str (word);


pbuf = (wchar_t *)_alloca (sizeof (wchar_t) * (len + 3));

pbuf[0] = L' ';

wcscpy (pbuf + 1, word);

pbuf[len + 1] = L' ';

pbuf[len + 2] = L'\0';


ht_ids = hash_table_allocate (101);


mutex_enter (dict_mtx_);

for (i = 0; i < (len); i++)

{

char**place = NULL;

lenmem_t lm;

triple_head_t *phead = NULL;

triple_item_t *pitem = NULL;

wchar_t trbuf[4];

trbuf[0] = mrc_toupper(pbuf[i]);

trbuf[1] = mrc_toupper(pbuf[i + 1]);

trbuf[2] = mrc_toupper(pbuf[i + 2]);

trbuf[3] = L'\0';


lm.lm_length = sizeof (phead->data_);

lm.lm_memblock = (caddr_t)trbuf;


place = (char **) id_hash_get (ht_triples_, (caddr_t) &lm);

if (place)

{

phead = *(triple_head_t**)place;

pitem = phead->list_;

while(pitem)

{

int wordid = pitem->wordid_;


int ptr = (int)gethash ((void *)wordid, ht_ids);

if (0 == ptr)

sethash ((void *)wordid, ht_ids, (void*)1);

else

{

sethash ((void *)wordid, ht_ids, (void*)(++ptr));

if (ptr > maxcount)

maxcount = ptr;

}


pitem = pitem->next_;

}

}

}

mutex_leave (dict_mtx_);


{

dk_set_t pairs_list = NULL;

int nids = 0;

int mx = maxcount;

int nallids = ht_ids->ht_count;

void *key, *val;

dk_hash_iterator_t hit;


maxcount = (maxcount + 1)/2;

if (maxcount >= len)

maxcount = len — 1;

for (dk_hash_iterator (&hit, ht_ids);

dk_hit_next (&hit, (void**) &key, (void**) &val);

/* */)

{

int wordid = (int)key;

int cnt = (int)val;

if (cnt >= maxcount)

{

dict_item_t *pptr = (dict_item_t *)gethash ((void *)wordid, ht_dict_by_id_);

if(pptr)

{

ipair_t *item = NULL;

wchar_t buf[128];

size_t lbuf, dist, score;

box_utf8_as_wide_char ((caddr_t)pptr->word_, (caddr_t)buf, strlen(pptr->word_), 127, DV_WIDE);

lbuf = wcslen(buf);

dist = l_dist_raw(word, buf, len, lbuf);

score = 100 — (dist * 100)/((len > lbuf)? len: lbuf);

//score = 100 — (dist * 200)/(len + lbuf);

if (word[0] != buf[0])

score = (score * 3)>>2;

//score = 100 — (dist * 100)/((len > lbuf)? len: lbuf);

//wprintf (L"%s -> %s (%d)\n", word, buf, score);

item = (ipair_t*)dk_alloc_box(sizeof(ipair_t), DV_ARRAY_OF_LONG);

item->id_ = wordid;

item->len_ = lbuf;

item->score_ = score;

dk_set_push (&pairs_list, item);

nids++;

}

assert(pptr);

}

}

if (pairs_list)

{

res = (ipair_t**)dk_set_to_array (pairs_list);

dk_set_free (pairs_list);

assert(nids == box_length(res)/sizeof(void*));

qsort (res, nids, sizeof (void*), cmp_pairs);

}

}

hash_table_free (ht_ids);

dk_free_box(word);

}

return res;

}


caddr_t

bif_get_word_candidates (caddr_t * qst, caddr_t * err_ret, state_slot_t ** args)

{

char *me = «get_word_candidates»;

ipair_t **res = NULL;

dk_set_t ids_list = NULL;

caddr_t arr = NULL;

caddr_t arg = bif_arg_unrdf (qst, args, 0, me);

dtp_t dtp = DV_TYPE_OF (arg);

if (DV_DB_NULL == dtp)

{

return (NULL);

}

if (!IS_WIDE_STRING_DTP (dtp))

{

sqlr_new_error («22023», «SR007»,

«Function %s needs a nvstring or NULL as argument, „

“not an arg of type %s (%d)»,

me, 1, dv_type_title (dtp), dtp);

}


if (0 == ht_dict_->ht_count)

{

bif_reload_dict (qst, err_ret, args);

}


res = get_word_candidates ((wchar_t*)arg);


ids_list = load_oid_list (res, (query_instance_t *)qst, NULL);

DO_SET (ipair_t *, item, &ids_list)

{

//printf ("%d ", item->id_);

dk_free_box(item);

}

END_DO_SET ();


return (caddr_t)res;

}


caddr_t

bif_calc_similarity (caddr_t * qst, caddr_t * err_ret, state_slot_t ** args)

{

char *me = «calc_similarity»;

caddr_t arg1 = bif_arg_unrdf (qst, args, 0, me);

caddr_t arg2 = bif_arg_unrdf (qst, args, 1, me);

dtp_t dtp1 = DV_TYPE_OF (arg1);

dtp_t dtp2 = DV_TYPE_OF (arg2);

if (DV_DB_NULL == dtp1 || DV_DB_NULL == dtp2)

{

return (NULL);

}

if ((!IS_WIDE_STRING_DTP (dtp1)) || (!IS_WIDE_STRING_DTP (dtp2)))

{

sqlr_new_error («22023», «SR007»,

«Function %s needs a nvstring or NULL as arguments, „);

}

{

wchar_t *str1 = (wchar_t*)arg1;

wchar_t *str2 = (wchar_t*)arg2;

int l1 = wcslen(str1);

int l2 = wcslen(str2);

int dist = l_dist_raw(str1, str2, l1, l2);

int score = 100 — (dist * 100)/((l1 > l2)? l1: l2);

if (str1[0] != str2[0])

score = (score * 3)>>2;

return score;

}

}

static int g_cnt = 0;

#if defined WIN32 && defined (_DEBUG)

static _CrtMemState checkPt1;

#endif


long sqrt_long(long r)

{

long t, b, c = 0;

assert (r >= 0);


for (b=0x10000000; b != 0; b >>= 2)

{

t = c + b;

c >>= 1;

if (t <= r)

{

r -= t;

c += b;

}

}

return©;

}


caddr_t

bif_query_phrase (caddr_t * qst, caddr_t * err_ret, state_slot_t ** args)

{

char *me = “query_phrase»;

wchar_t **words = NULL;

ptrlong *res = NULL;

wchar_t *tmp = NULL;

dk_set_t ids_list = NULL;

caddr_t arr = NULL;

caddr_t arg = bif_arg_unrdf (qst, args, 0, me);

dtp_t dtp = DV_TYPE_OF (arg);

int len = 0;

mem_pool_t *mp = mem_pool_alloc();


#if 0//defined WIN32 && defined (_DEBUG)

_CrtCheckMemory( );

_CrtMemCheckpoint( &checkPt1 );

#endif


//if (0 == (g_cnt%1000))

//printf ("%d ", g_cnt);

++g_cnt;

if (DV_DB_NULL == dtp)

{

return (NULL);

}

if (IS_STRING_DTP(dtp))

{

tmp = box_utf8_as_wide_char (arg, NULL, strlen(arg), 0, DV_WIDE);

words = nv_split (tmp);

dk_free_box(tmp);

}

else if (IS_WIDE_STRING_DTP (dtp))

{

tmp = wcsdup ((const wchar_t *)arg);

words = nv_split (tmp);

free(tmp);

}

else

{

sqlr_new_error («22023», «SR007»,

«Function %s needs a nvstring or NULL as argument, „

“not an arg of type %s (%d)»,

me, 1, dv_type_title (dtp), dtp);

}


if (0 == ht_dict_->ht_count)

{

bif_reload_dict (qst, err_ret, args);

}


//mutex_enter (dict_mtx_);


if (words)

{

size_t niters = box_length(words)/sizeof(void*);

dk_set_t results = NULL;

dk_set_t *iter_holders = mp_alloc_box (mp, niters * sizeof(dk_set_t), DV_ARRAY_OF_POINTER);

dk_set_t *iters = mp_alloc_box (mp, niters * sizeof(dk_set_t), DV_ARRAY_OF_POINTER);

size_t i = 0;

size_t ix = 0;

size_t cnt = 0;

size_t cnt1 = 0;

for (i = 0; i <niters; i++)

{

ipair_t **res = get_word_candidates ((wchar_t*)words[i]);

if (res)

{

iter_holders[ix] = load_oid_list (res, (query_instance_t *)qst, mp);

iters[ix] = iter_holders[ix];

ix++;

dk_free_tree (res);

}

}

niters = ix;


if (niters)

{

int64 min_elem = 0;

int fin = 0;

for (;!fin;)

{

int bfound = 1;

size_t div = 1;

size_t score = 1;

size_t sumpos = 0;

size_t oldpos = 0;


if (!iters[0])

break;

min_elem = ((ipair_t *)iters[0]->data)->id_;

for (i = 0; i <niters; i++)

{

if (iters[i])

{

ipair_t *ptr = (ipair_t *)iters[i]->data;

div *= 100;

score *= ptr->score_;

if (i)

{

sumpos += abs (oldpos — ptr->pos_);

}

oldpos = ptr->pos_;

if (ptr->id_ != min_elem)

{

bfound = 0;

}

if (ptr->id_ < min_elem)

{

min_elem = ptr->id_;

}

}

}

if (bfound)

{

ipair_t *item = mp_alloc_box(mp, sizeof(ipair_t), DV_BIN);

div /= 100;

score /= div;

if (niters > 1)

sumpos /= (niters — 1);

item->id_ = min_elem;

item->score_ = score/(1 + (sqrt_long(((100*sumpos)/5))/10));

mp_set_push(mp, &results, item);

cnt1++;


//printf («FOUND:%I64d %d\n», min_elem, score);

}

for (i = 0; i <niters; i++)

{

int bf = bfound;

while (iters[i] && (bf || min_elem == ((ipair_t *)iters[i]->data)->id_))

{

bf = 0;

iters[i] = iters[i]->next;

}

if (!iters[i])

{

fin = 1;

break;

}

}

}

}


for (i = 0; i <niters; i++)

{

DO_SET (ipair_t *, item, &iter_holders[i])

{

cnt++;

//dk_free_box(item);

}

END_DO_SET ();

}

//dk_free_box (iters);

//dk_free_box (iter_holders);


//printf ("-%d-", cnt);

len = dk_set_length(results);

//if (len > 100)

{

results = list_sort (results, compare_by_score, NULL);

i = 0;

DO_SET (ipair_t *, entry, &results)

{

entry->len_ = (i >= 100)? 0:1;

i++;

}

END_DO_SET();

len = (len>100)?100:len;

}

//results = list_sort (results, compare_by_id, NULL);

i = 0;

res = dk_alloc_box(len * 2 * sizeof(ptrlong), DV_ARRAY_OF_LONG);

DO_SET (ipair_t *, entry, &results)

{

if (entry->len_)

{

res[i++] = (entry->id_);

res[i++] = (entry->score_);

}

//dk_free_box(entry);

cnt1--;

}

END_DO_SET();

//dk_set_free (results);

dk_free_tree (words);

//printf("(%d)", cnt1);

}

//mutex_leave (dict_mtx_);

mp_free (mp);


#if 0//defined WIN32 && defined (_DEBUG)

// _CrtMemDumpAllObjectsSince( NULL );

_CrtMemDumpAllObjectsSince( &checkPt1 );

_CrtMemCheckpoint( &checkPt1 );

_CrtMemDumpStatistics( &checkPt1 );

_CrtCheckMemory( );

#endif

return (caddr_t)res;

}


void

init_dict (void)

{

dict_mtx_ = mutex_allocate ();

ht_dict_ = id_hash_allocate (2039, sizeof (caddr_t), sizeof (caddr_t), strhash, strhashcmp);

ht_triples_ = id_hash_allocate (2039, sizeof (lenmem_t), sizeof (caddr_t), lenmemhash, lenmemhashcmp);

ht_dict_by_id_ = hash_table_allocate (2039);


bif_define («nv_split», bif_nv_split);

bif_define («treat_nword», bif_treat_nword);

bif_define («calc_similarity», bif_calc_similarity);

bif_define («reload_dict», bif_reload_dict);

bif_define («get_word_candidates», bif_get_word_candidates);

bif_define («query_phrase», bif_query_phrase);

}


void finit_dict()

{

flush_triples();

flush_dict();


hash_table_free (ht_dict_by_id_);

id_hash_free (ht_triples_);

id_hash_free (ht_dict_);

mutex_free (dict_mtx_);

}


extern int f_foreground;


int

main (int argc, char *argv[])

{

/*f_foreground = 1;

* FIXME: this could not be done in that way; this is a GPF on WIN32 and

* copy on write on linux; a fuinction from the shared object must be used

* to set it

*/

#ifdef MALLOC_DEBUG

dbg_malloc_enable();

#endif

build_set_special_server_model («Mircalo»);

VirtuosoServerSetInitHook (init_dict);

return VirtuosoServerMain (argc, argv);

}


This entry passed through the Full-Text RSS service — if this is your content and you're reading it on someone else's site, please read the FAQ at fivefilters.org/content-only/faq.php#publishers.


Комментариев нет:

Отправить комментарий