From 5af361ed688759722249140d8df16bfd8073be43 Mon Sep 17 00:00:00 2001 From: Yingfeng Date: Tue, 30 Jun 2026 21:40:24 +0800 Subject: [PATCH] Add spacy based ner and relationship extractor for both python and Go version with equivalent outputs (#16456) As title --- internal/cpp/CMakeLists.txt | 4 + internal/cpp/rag_analyzer_c_api.h | 25 + internal/cpp/thinc_ner.cpp | 610 ++++++++++++++ internal/cpp/thinc_ner.h | 45 + internal/cpp/thinc_parser.cpp | 530 ++++++++++++ internal/cpp/thinc_parser.h | 49 ++ .../compilation/extractor/dep_relation.go | 788 ++++++++++++++++++ .../ingestion/compilation/extractor/ner.go | 414 +++++++++ .../compilation/extractor/ner_extractor.go | 30 + .../compilation/extractor/ner_relation.go | 439 ++++++++++ .../compilation/extractor/ner_test.go | 113 +++ .../compilation/extractor/parser_go.go | 87 ++ rag/graphrag/ner/__init__.py | 12 +- rag/graphrag/ner/dep_relation_extractor.py | 558 +++++++++++++ rag/graphrag/ner/ner_extractor.py | 243 ++++++ rag/graphrag/ner/types.py | 75 ++ 16 files changed, 4020 insertions(+), 2 deletions(-) create mode 100644 internal/cpp/thinc_ner.cpp create mode 100644 internal/cpp/thinc_ner.h create mode 100644 internal/cpp/thinc_parser.cpp create mode 100644 internal/cpp/thinc_parser.h create mode 100644 internal/ingestion/compilation/extractor/dep_relation.go create mode 100644 internal/ingestion/compilation/extractor/ner.go create mode 100644 internal/ingestion/compilation/extractor/ner_extractor.go create mode 100644 internal/ingestion/compilation/extractor/ner_relation.go create mode 100644 internal/ingestion/compilation/extractor/ner_test.go create mode 100644 internal/ingestion/compilation/extractor/parser_go.go create mode 100644 rag/graphrag/ner/dep_relation_extractor.py create mode 100644 rag/graphrag/ner/ner_extractor.py create mode 100644 rag/graphrag/ner/types.py diff --git a/internal/cpp/CMakeLists.txt b/internal/cpp/CMakeLists.txt index 2f017a6b90..cf1ff81a1b 100644 --- a/internal/cpp/CMakeLists.txt +++ b/internal/cpp/CMakeLists.txt @@ -133,6 +133,10 @@ set_target_properties(rag_tokenizer PROPERTIES add_library(rag_tokenizer_c_api STATIC rag_analyzer_c_api.cpp rag_analyzer_c_api.h + thinc_ner.cpp + thinc_ner.h + thinc_parser.cpp + thinc_parser.h rag_analyzer.cpp rag_analyzer.h dart_trie.h diff --git a/internal/cpp/rag_analyzer_c_api.h b/internal/cpp/rag_analyzer_c_api.h index 2a87400013..63f34f4f3a 100644 --- a/internal/cpp/rag_analyzer_c_api.h +++ b/internal/cpp/rag_analyzer_c_api.h @@ -99,6 +99,31 @@ char* RAGAnalyzer_GetTermTag(RAGAnalyzerHandle handle, const char* term); // Returns: handle to the new analyzer instance, or NULL on failure RAGAnalyzerHandle RAGAnalyzer_Copy(RAGAnalyzerHandle handle); +// --------------------------------------------------------------------------- +// Named Entity Recognition (spaCy model inference) +// --------------------------------------------------------------------------- + +// Create a ThincNER inference handle. +// model_ner_dir: path to the spaCy model's ner/ component directory +// model_vocab_dir: path to the spaCy model's vocab/ directory (optional, can be NULL) +// Returns: handle, or NULL on failure +RAGAnalyzerHandle ThincNER_Create(const char* model_ner_dir, const char* model_vocab_dir); + +// Destroy a ThincNER handle. +void ThincNER_Destroy(RAGAnalyzerHandle handle); + +// Run NER on pre-tokenized text. +// tokens_json: JSON array e.g. ["Apple","Inc.","was","founded","by","Steve","Jobs","."] +// Returns JSON array of entities, caller must free with ThincNER_FreeString. +char* ThincNER_Predict(RAGAnalyzerHandle handle, const char* tokens_json); + +// Tokenize text using spaCy-compatible rules. +// Returns JSON array of token strings, caller must free with ThincNER_FreeString. +char* ThincNER_Tokenize(const char* text, const char* lang); + +// Free a string returned by ThincNER_Predict or ThincNER_Tokenize. +void ThincNER_FreeString(char* ptr); + #ifdef __cplusplus } #endif diff --git a/internal/cpp/thinc_ner.cpp b/internal/cpp/thinc_ner.cpp new file mode 100644 index 0000000000..4d4163b167 --- /dev/null +++ b/internal/cpp/thinc_ner.cpp @@ -0,0 +1,610 @@ +#pragma STDC FP_CONTRACT OFF + +#include "thinc_ner.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// ========================================================================= +// JSON parser (minimal) +// ========================================================================= +namespace { +std::string trim(const std::string& s) { + auto a = s.find_first_not_of(" \t\r\n"); + return a == std::string::npos ? "" : s.substr(a, s.find_last_not_of(" \t\r\n")-a+1); +} +struct JVal { + enum Type {NUL,OBJ,ARR,STR,NUM,BOOL} type=NUL; + std::string str; std::vector arr; std::unordered_map obj; double num=0; + const JVal* get(const std::string& k) const { auto it=obj.find(k); return it!=obj.end()?&it->second:nullptr; } + int as_int() const { return (int)num; } int64_t as_i64() const { return (int64_t)num; } +}; +struct JParser { + const char *p,*e; char pk() { while(p='0'&&*p<='9'))++p; + if(p='0'&&*p<='9'))++p;} + if(p='0'&&*p<='9'))++p;} + if(s=4&&*p=='t'){v.str="true";p+=4;}else if(e-p>=5&&*p=='f'){v.str="false";p+=5;} return v; } + JVal parse(const std::string& j) { p=j.data(); e=p+j.size(); return pv(); } +}; + +// ========================================================================= +// MurmurHash2 64-bit (vocab string→ID, seed=0 matching spaCy StringStore) +// ========================================================================= +static uint64_t mh2_64a(const void* key, int len, uint64_t seed) { + const uint64_t m=0xc6a4a7935bd1e995ULL; const int r=47; + uint64_t h=seed^(uint64_t(len)*m); auto d=(const uint8_t*)key; int rm=len; + while(rm>=8){uint64_t k;memcpy(&k,d,8);k*=m;k^=k>>r;k*=m;h^=k;h*=m;d+=8;rm-=8;} + switch(rm){case 7:h^=uint64_t(d[6])<<48;case 6:h^=uint64_t(d[5])<<40; + case 5:h^=uint64_t(d[4])<<32;case 4:h^=uint64_t(d[3])<<24; + case 3:h^=uint64_t(d[2])<<16;case 2:h^=uint64_t(d[1])<<8; + case 1:h^=d[0];h*=m;break;} + h^=h>>r;h*=m;h^=h>>r; return h; +} +static uint64_t hash_feat(const std::string& s) { return s.empty()?0:mh2_64a(s.data(),(int)s.size(),0); } + +// ========================================================================= +// MurmurHash3_x64_128 (exact copy from mmh3 package, verified against thinc) +// ========================================================================= +#define ROTL64(x,r) ((x << r) | (x >> (64 - r))) +static uint64_t getblock64(const uint64_t* p, size_t i) { uint64_t r; memcpy(&r, p+i, 8); return r; } +static uint64_t fmix64(uint64_t k) { + k ^= k >> 33; k *= 0xff51afd7ed558ccdULL; + k ^= k >> 33; k *= 0xc4ceb9fe1a85ec53ULL; + k ^= k >> 33; return k; +} +static void mmh3_x64_128(const void* key, int len, uint32_t seed, uint32_t out[4]) { + const uint8_t* data = (const uint8_t*)key; + int nblocks = len / 16; + uint64_t h1 = seed, h2 = seed; + const uint64_t c1 = 0x87c37b91114253d5ULL; + const uint64_t c2 = 0x4cf5ad432745937fULL; + const uint64_t* blocks = (const uint64_t*)(data); + for (int i = 0; i < nblocks; i++) { + uint64_t k1 = getblock64(blocks, i*2+0); + uint64_t k2 = getblock64(blocks, i*2+1); + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + h1 = ROTL64(h1,27); h1 += h2; h1 = h1 * 5 + 0x52dce729; + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + h2 = ROTL64(h2,31); h2 += h1; h2 = h2 * 5 + 0x38495ab5; + } + const uint8_t* tail = (const uint8_t*)(data + nblocks * 16); + uint64_t k1 = 0, k2 = 0; + switch (len & 15) { + case 15: k2 ^= ((uint64_t)tail[14]) << 48; + case 14: k2 ^= ((uint64_t)tail[13]) << 40; + case 13: k2 ^= ((uint64_t)tail[12]) << 32; + case 12: k2 ^= ((uint64_t)tail[11]) << 24; + case 11: k2 ^= ((uint64_t)tail[10]) << 16; + case 10: k2 ^= ((uint64_t)tail[9]) << 8; + case 9: k2 ^= ((uint64_t)tail[8]) << 0; + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + case 8: k1 ^= ((uint64_t)tail[7]) << 56; + case 7: k1 ^= ((uint64_t)tail[6]) << 48; + case 6: k1 ^= ((uint64_t)tail[5]) << 40; + case 5: k1 ^= ((uint64_t)tail[4]) << 32; + case 4: k1 ^= ((uint64_t)tail[3]) << 24; + case 3: k1 ^= ((uint64_t)tail[2]) << 16; + case 2: k1 ^= ((uint64_t)tail[1]) << 8; + case 1: k1 ^= ((uint64_t)tail[0]) << 0; + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + }; + h1 ^= len; h2 ^= len; + h1 += h2; h2 += h1; + h1 = fmix64(h1); h2 = fmix64(h2); + h1 += h2; h2 += h1; + out[0] = (uint32_t)h1; out[1] = (uint32_t)(h1>>32); + out[2] = (uint32_t)h2; out[3] = (uint32_t)(h2>>32); +} + +// ========================================================================= +// HashEmbed +// ========================================================================= +struct HashEmbed { + int n_rows=0,nO=0; uint32_t seed=0; std::vector table; + bool load(int r, int o, const float* d) { n_rows=r;nO=o;table.assign(d,d+(size_t)r*o);return!table.empty(); } + void embed(uint64_t fid, float* out) const { + uint8_t in[8]; for(int i=0;i<8;i++)in[i]=(uint8_t)(fid>>(i*8)); + uint32_t keys[4]; mmh3_x64_128(in,8,seed,keys); + for(int v=0;v<4;v++){int idx=(int)(keys[v]%(uint32_t)n_rows);for(int i=0;i=3 ? utf8_last(t,3) : t; + std::transform(s.begin(), s.end(), s.begin(), ::tolower); + return hash_feat(s); +} +static uint64_t feat_shape(const std::string& t) { + std::string sh; + for(unsigned char c:t){ + if(c>0x7F)sh+='x'; // CJK → 'x' (matches spaCy zh shape) + else if(std::isupper(c))sh+='X'; + else if(std::islower(c))sh+='x'; + else if(std::isdigit(c))sh+='d'; + else sh+=c; + } + return hash_feat(sh); +} +// Extract features based on n_embed count. Returns vector of hash values. +// NER model's tok2vec uses 4 features: NORM, PREFIX, SUFFIX, SHAPE +// (The pipeline's standalone tok2vec uses 6 features including SPACY and IS_SPACE.) +// Feature order matches the HashEmbed table order in the model. +static std::vector extract_features(const std::string& t, int n_embed) { + std::vector ids; + ids.push_back(feat_norm(t)); // #0: NORM (all models) + ids.push_back(feat_prefix(t)); // #1: PREFIX + ids.push_back(feat_suffix(t)); // #2: SUFFIX + ids.push_back(feat_shape(t)); // #3: SHAPE + if(n_embed==5) { + ids.push_back(0); // #4: IS_SPACE (zh/ja: 5-embed models, no SPACY) + } else if(n_embed>=6) { + ids.push_back(1); // #4: SPACY (en/de/fr/es/pt: 6-embed models) + ids.push_back(0); // #5: IS_SPACE + } + return ids; +} + +// ========================================================================= +// Layers +// ========================================================================= + +// Kahan compensated dot product: reduces floating-point accumulation error +// for long dot products (e.g. 576 terms in Maxout). +static float kahan_dot(const float* a, const float* b, int n) { + float sum = 0.0f; + float c = 0.0f; + for (int i = 0; i < n; i++) { + float y = a[i] * b[i] - c; + float t = sum + y; + c = (t - sum) - y; + sum = t; + } + return sum; +} + +static void linear(float* out, const float* in, const float* W, const float* b, int nO, int nI) { + for(int i=0;i0?x[i]:0; } + +// Maxout: y[i] = max_p(b[i,p] + W[i,p,:] @ in) +static void maxout(float* out, const float* in, const float* W, const float* b, int nO, int nP, int nI) { + for(int i=0;ibest)best=s; + } + out[i]=best; + } +} + +// LayerNorm: y = G * (x-mean)/sqrt(var+eps) + b +static void layernorm(float* out, const float* in, int d, const float* G, const float* b, float eps) { + float mn=0,vr=0; for(int i=0;i0)memcpy(out,all+(idx-1)*dim,dim*sizeof(float)); else memset(out,0,dim*sizeof(float)); + memcpy(out+dim,all+off,dim*sizeof(float)); + if(idx decode_biluo(const std::vector& tok, const std::vector& lbl) { + std::vector ents; int n=(int)tok.size(),st=-1; std::string et,ex; + for(int i=0;i=0){ents.push_back({ex,et,st,i-1,0.85f});st=-1;et.clear();ex.clear();}continue;} + if(l.size()<3||l[1]!='-'){if(st>=0){ents.push_back({ex,et,st,i-1,0.85f});st=-1;}continue;} + char a=l[0]; std::string ty=l.substr(2); + if(a=='U'){if(st>=0){ents.push_back({ex,et,st,i-1,0.85f});st=-1;}ents.push_back({tok[i],ty,i,i,0.85f});} + else if(a=='B'){if(st>=0)ents.push_back({ex,et,st,i-1,0.85f});st=i;et=ty;ex=tok[i];} + else if(a=='I'){if(st>=0&&et==ty)ex+=" "+tok[i];else{if(st>=0)ents.push_back({ex,et,st,i-1,0.85f});st=i;et=ty;ex=tok[i];}} + else if(a=='L'){if(st>=0&&et==ty){ex+=" "+tok[i];ents.push_back({ex,et,st,i,0.85f});}else ents.push_back({tok[i],ty,i,i,0.85f});st=-1;et.clear();ex.clear();} + } + if(st>=0)ents.push_back({ex,et,st,n-1,0.85f}); + return ents; +} + +// Tokenizer +static std::vector tokenize_en(const std::string& t) { + std::vector r; std::string cur; + for(size_t i=0;i127)cur+=c; + else if(c=='.'&&!cur.empty()&&i+1 tokenize_zh(const std::string& t) { + std::vector r; + for(size_t i=0;i embeds; + // Post-embed Maxout (576→96) + std::vector poW,poB; int po_nO=96,po_nP=3,po_nI=576; + // Post-embed LayerNorm + std::vector poG,poB2; bool has_poLN=false; + // Residual encoder (4 blocks) + struct ResBlk{bool has=false;std::vectorW,b,lnG,lnb;}; + ResBlk res[4]; int n_res=0; + // NER hidden (96→64) + std::vector hW,hB; int hO=64; bool has_hid=false; + // PrecomputableAffine: W_full[nP=3][nO=64][nI=2][nD=64], b_full[nO=64][nI=2] + // We use f=0 (first feature only): pre_out[p][o] = sum_d W[p][o][0][d] * hid[d] + b[o][0] + std::vector pW_full; // flattened [3*64*2*64] + std::vector pB_full; // flattened [64*2] + int p_nP=3, p_nO=64, p_nI=2, p_nD=64; bool has_pre=false; + // Classifier (64→n_actions) + std::vector cW,cB; int nAct=0; bool has_cls=false; + std::vector actLbl; +}; + +// ========================================================================= +// Load model +// ========================================================================= +static bool load(const std::string& dir, State* s) { + std::ifstream cf(dir+"/model.ckpt"); if(!cf){std::cerr<<"No model.ckpt\n";return false;} + std::stringstream cb;cb< bin(bz/4); bf.read((char*)bin.data(),bz); + + auto sl=[&](int64_t o, int64_t c)->std::vector{ + if(o+c>(int64_t)bin.size())return{}; return std::vector(bin.begin()+o,bin.begin()+o+c); + }; + auto ld=[&](const std::string& k, std::vector* v, int* r0=nullptr,int* r1=nullptr,int* r2=nullptr)->bool{ + auto* e=ck.get(k); if(!e)return false; + auto sv=e->get("shape"),ov=e->get("offset"),cv=e->get("count"); + if(!sv||!ov||!cv)return false; + *v=sl(ov->as_i64(),cv->as_i64()); + if(r0)*r0=sv->arr.size()>=1?sv->arr[0].as_int():1; + if(r1)*r1=sv->arr.size()>=2?sv->arr[1].as_int():1; + if(r2)*r2=sv->arr.size()>=3?sv->arr[2].as_int():1; + return!v->empty(); + }; + + // HashEmbeds — dynamic count (6 for en, 5 for zh, etc.) + for(int ei=0;;ei++){ + auto* e=ck.get("embed_"+std::to_string(ei)+"_E"); if(!e)break; + auto sv=e->get("shape"),ov=e->get("offset"),cv=e->get("count"); + if(!sv||!ov||!cv)break; + int rs=sv->arr[0].as_int(),nO=sv->arr[1].as_int(); + int64_t expected=(int64_t)rs*nO; + if(cv->as_i64()as_i64(),cv->as_i64()); if(d.empty())break; + s->embeds.emplace_back(); s->embeds.back().load(rs,nO,d.data()); + } + // Seeds + std::ifstream ff(dir+"/feature_config.json"); + if(ff){std::stringstream fb;fb<type==JVal::ARR)for(int i=0;i<(int)sa->arr.size()&&i<(int)s->embeds.size();i++)s->embeds[i].seed=(uint32_t)sa->arr[i].as_int();} + + int r0=0,r1=0,r2=0; + // Post-embed + if(ld("poW",&s->poW,&r0,&r1,&r2)){s->po_nO=r0;s->po_nP=r1;s->po_nI=r2;ld("poB",&s->poB);} + if(ld("poG",&s->poG)){ld("poB2",&s->poB2);s->has_poLN=true;} + // Residual + for(int ri=0;ri<4;ri++){auto pk="res"+std::to_string(ri);auto& rb=s->res[ri]; + if(ld(pk+"W",&rb.W,&r0,&r1,&r2)){ld(pk+"B",&rb.b);ld(pk+"lnG",&rb.lnG);ld(pk+"lnb",&rb.lnb);rb.has=true;s->n_res++;}} + // NER hidden + if(ld("hW",&s->hW,&r0,&r1)){s->hO=r0;ld("hB",&s->hB);s->has_hid=true;} + // PrecomputableAffine: load full 4D W and 2D b + // has_pre is only set when ALL of weight buffer, bias buffer, + // and hidden-dimension match (p_nD == hO) are satisfied. + { + auto* e=ck.get("pW_full"); if(e){ + auto sv=e->get("shape"),ov=e->get("offset"),cv=e->get("count"); + if(sv&&ov&&cv){ + int nP=sv->arr.size()>=1?sv->arr[0].as_int():1; + int nO=sv->arr.size()>=2?sv->arr[1].as_int():1; + int nI=sv->arr.size()>=3?sv->arr[2].as_int():1; + int nD=sv->arr.size()>=4?sv->arr[3].as_int():1; + s->p_nP=nP; s->p_nO=nO; s->p_nI=nI; s->p_nD=nD; + size_t total = (size_t)nP * nO * nI * nD; + s->pW_full = sl(ov->as_i64(), cv->as_i64()); + bool pw_ok = s->pW_full.size() >= total; + + // Load bias inside pW_full block to access dimension info + bool pb_ok = false; + if(auto* pb_e=ck.get("pB_full")){ + auto pb_ov=pb_e->get("offset"),pb_cv=pb_e->get("count"); + if(pb_ov&&pb_cv){ + s->pB_full = sl(pb_ov->as_i64(), pb_cv->as_i64()); + pb_ok = s->pB_full.size() >= (size_t)nO * nI; + } + } + + bool dim_ok = (nD == s->hO); + s->has_pre = pw_ok && pb_ok && dim_ok; + } + } + } + // Classifier + if(ld("cW",&s->cW,&r0,&r1)){s->nAct=r0;ld("cB",&s->cB);s->has_cls=true;} + + return!s->embeds.empty(); +} + +static bool load_labels(const std::string& dir, State* s) { + std::ifstream f(dir+"/labels.json"); if(!f)return false; + std::stringstream b;b<type!=JVal::OBJ)return false; + int mx=0; for(auto&[k,v]:am->obj){try{int a=std::stoi(k);if(a>mx)mx=a;}catch(...){}}; + int n=s->nAct>0?s->nAct:mx+1; s->actLbl.resize(n,"O"); + for(auto&[k,v]:am->obj){try{int a=std::stoi(k);if(a>=0&&aactLbl[a]=v.str;}catch(...){}} + return!s->actLbl.empty(); +} + +// ========================================================================= +// C API +// ========================================================================= +ThincNERHandle ThincNER_Create(const char* d, const char*) { + auto* s=new State(); if(!load(d,s)){delete s;return nullptr;} + if(!load_labels(d,s))s->actLbl.resize(74,"O"); + return s; +} +void ThincNER_Destroy(ThincNERHandle h) { delete (State*)h; } + +char* ThincNER_Predict(ThincNERHandle h, const char* tj) { + auto* s=(State*)h; if(!s)return strdup("[]"); + if(!tj)return strdup("[]"); + + // Parse tokens + std::vector tok; std::string j(tj); size_t p=0; + while((p=j.find('"',p))!=std::string::npos){auto e=j.find('"',p+1);if(e==std::string::npos)break;std::string t=j.substr(p+1,e-p-1);if(!t.empty())tok.push_back(t);p=e+1;} + int n=(int)tok.size(); if(!n)return strdup("[]"); + int NE=(int)s->embeds.size(); + // Derive per-embed dimension from loaded tensors (all embed tables share the same nO) + int D = NE > 0 ? s->embeds[0].nO : 96; + int EC = NE * D; + + // ---- Step 1: HashEmbed → concat (NER model: 4×96=384, pipe: 6×96=576) ---- + std::vector emb((size_t)n*EC,0); + for(int i=0;iembeds.size();e++) + s->embeds[e].embed(ids[e], emb.data()+b + (size_t)e*s->embeds[e].nO); + } + + + // ---- Step 2: Post-embed Maxout (576→96) ---- + std::vector pe((size_t)n*D); + for(int i=0;ipoW.data(),s->poB.data(),s->po_nO,s->po_nP,s->po_nI); + + // ---- Step 3: Post-embed LayerNorm ---- + std::vector pln((size_t)n*D,0); + if(s->has_poLN){for(int i=0;ipoG.data(),s->poB2.data(),1e-6f);} + else pln=pe; + + // ---- Step 4: Residual encoder blocks ---- + std::vector enc=pln; + for(int ri=0;rin_res;ri++){ + auto& blk=s->res[ri]; if(!blk.has)continue; + int wd=D*3; + std::vector exp((size_t)n*wd); + for(int i=0;i mx((size_t)n*D); + for(int i=0;i ln((size_t)n*D); + if(!blk.lnG.empty()){for(int i=0;i hid((size_t)n*s->hO); + if(s->has_hid){for(int i=0;ihO,enc.data()+(size_t)i*D,s->hW.data(),s->hB.data(),s->hO,D);}} + else{for(int i=0;ihO);memcpy(hid.data()+i*s->hO,enc.data()+i*D,c*4);}} + + // ---- Step 6: PrecomputableAffine → Maxout → Classifier → constrained decoding ---- + // Matches the spaCy ParserStepModel's predict_states formula: + // cached[t][f][o*nP+p] = W[f,o,p,:] @ hid[t] + b[o,p] (f=0..nF-1, W=[nF,nO,nP,nI]) + // unmaxed[o,nP+p] = sum_f cached[t][f][o*nP+p] (sum over nF features) + // unmaxed += bias[o,p] (add bias once, not nF times) + // hid_vec[o] = max(unmaxed[o*nP+0], unmaxed[o*nP+1]) + // scores[a] = cW[a][:] @ hid_vec + cB[a] + // + // Feature token indices match spaCy's BiluoPushDown transition system: + // f=0: B(0) = buffer front = current token index + // f=1: S(0) = stack top = entity_start if in entity, else back-off to B(0) + // f=2: S(1) = stack second = back-off to B(0) (stack has ≤1 item in simple case) + auto label_type = [](const std::string& lbl) -> char { return lbl.empty() ? 'O' : lbl[0]; }; + auto label_etype = [](const std::string& lbl) -> std::string { return lbl.size()<3?"":lbl.substr(2); }; + + std::vector tl(n, "O"); + if(s->has_cls && s->has_pre){ + int nF=s->p_nP, nO=s->p_nO, nP=s->p_nI, nD=s->p_nD; // W: [nF, nO, nP, nI] + std::vector unmaxed((size_t)nO * nP, 0); + std::vector hid_vec(nO, 0); + std::vector scores(s->nAct, 0); + int entity_start = -1; // token index of current B-entity start, -1 = no entity + + for(int i=0;i= 0) { + ft[1] = entity_start; // S(0) = entity start + ft[2] = i; // S(1) = back-off to B(0) + } else { + ft[1] = i; // S(0) = back-off to B(0) + ft[2] = i; // S(1) = back-off to B(0) + } + + // PrecomputableAffine: pre[f][o][p] = W[f][o][p][:] @ hid[ft[f]] + b[o][p] + memset(unmaxed.data(), 0, (size_t)nO * nP * sizeof(float)); + for(int f=0;fpW_full[base + d] * hf[d]; + } + unmaxed[(size_t)o * nP + p] += val; + } + } + } + + // Add bias ONCE (not nF times) + for(int o=0;opB_full[(size_t)o * nP + p]; + } + } + + // Maxout: hid_vec[o] = max_p unmaxed[o*nP + p] + for(int o=0;o best) best = v; + } + hid_vec[o] = best; + } + + // Classifier: scores = cW @ hid_vec + cB + linear(scores.data(), hid_vec.data(), s->cW.data(), s->cB.data(), s->nAct, nO); + + // Constrained greedy decoding + char prev_type = i>0 ? label_type(tl[i-1]) : 'O'; + std::string prev_etype = i>0 ? label_etype(tl[i-1]) : ""; + int bst=-1; float bv=-1e30f; + for(int a=0;anAct;a++){ + const std::string& lbl = (a<(int)s->actLbl.size()) ? s->actLbl[a] : "O"; + if(lbl.empty()) continue; + char ct = label_type(lbl); + std::string ce = label_etype(lbl); + bool valid=false; + if(prev_type=='O'||prev_type=='L'||prev_type=='U') + valid = (ct=='O'||ct=='B'||ct=='U'); + else if(prev_type=='B'||prev_type=='I'){ + if(ct=='O') valid=true; + else if((ct=='I'||ct=='L')&&ce==prev_etype) valid=true; + } + if(!valid) continue; + if(scores[a]>bv){bv=scores[a];bst=a;} + } + if(bst>=0) { + tl[i] = s->actLbl[bst]; + // Update entity_start for next token (BiluoPushDown stack tracking) + char ct = label_type(tl[i]); + if(ct == 'B') entity_start = i; + else if(ct == 'I' || ct == 'L') { /* entity continues, keep entity_start */ } + else entity_start = -1; // O or U → no entity open + } + } + } + + // ---- Step 8: BILUO decode ---- + auto ents=decode_biluo(tok,tl); + std::string r="["; for(size_t i=0;i en tokenizer; zh, ja: CJK -> zh tokenizer + bool is_cjk = (lang == "zh" || lang == "ja"); + auto tok = is_cjk ? tokenize_zh(t) : tokenize_en(t); + std::string r="["; for(size_t i=0;i +#include + +// --------------------------------------------------------------------------- +// C API for spaCy model inference (en_core_web_sm / zh_core_web_sm) +// +// Loads model.ckpt + model.bin directly from a spaCy model directory. +// --------------------------------------------------------------------------- + +typedef void* ThincNERHandle; + +// Create / destroy an inference handle for a single spaCy model. +// model_dir: path to the model component directory, e.g. +// "models/en_core_web_sm-3.7.1/ner/" +// "models/zh_core_web_sm-3.7.1/ner/" +// Returns NULL on failure. +ThincNERHandle ThincNER_Create(const char* model_ner_dir, const char* model_vocab_dir); +void ThincNER_Destroy(ThincNERHandle handle); + +// Run NER on pre-tokenized text. +// tokens_json: JSON array of token strings, e.g. ["Apple", "Inc.", "was", ...] +// Returns JSON array of entities: +// [{"text":"Apple Inc.","label":"ORG","start":0,"end":10,"confidence":0.85}, ...] +// Caller must free with ThincNER_FreeString. +char* ThincNER_Predict(ThincNERHandle handle, const char* tokens_json); + +// Free a string returned by ThincNER_Predict. +void ThincNER_FreeString(char* ptr); + +// Utility: tokenize text using spaCy-compatible rules. +// Returns JSON array of token strings. +char* ThincNER_Tokenize(const char* text, const char* lang); + +#ifdef __cplusplus +} +#endif + +#endif // THINC_NER_H diff --git a/internal/cpp/thinc_parser.cpp b/internal/cpp/thinc_parser.cpp new file mode 100644 index 0000000000..c7b0082104 --- /dev/null +++ b/internal/cpp/thinc_parser.cpp @@ -0,0 +1,530 @@ +#include "thinc_parser.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// ========================================================================= +// Minimal JSON parser (replicated from thinc_ner.cpp) +// ========================================================================= +namespace { +struct JVal { + enum Type{NUL,OBJ,ARR,STR,NUM,BOOL}type=NUL; + std::string str; std::vector arr; std::unordered_map obj; double num=0; + const JVal* get(const std::string& k)const{auto it=obj.find(k);return it!=obj.end()?&it->second:nullptr;} + int as_int()const{return(int)num;}int64_t as_i64()const{return(int64_t)num;} +}; +struct JParser { + const char *p,*e; + char pk(){while(p='0'&&*p<='9'))++p; + if(p='0'&&*p<='9'))++p;} + if(p='0'&&*p<='9'))++p;} + if(s=4&&*p=='t'){v.str="true";p+=4;}else if(e-p>=5&&*p=='f'){v.str="false";p+=5;}return v;} + JVal parse(const std::string& j){p=j.data();e=p+j.size();return pv();} +}; + +// ========================================================================= +// HashEmbed + MurmurHash (copied from thinc_ner.cpp) +// ========================================================================= +#define ROTL64(x,r) ((x << r) | (x >> (64 - r))) +static uint64_t getblock64(const uint64_t* p, size_t i) { uint64_t r; memcpy(&r, p+i, 8); return r; } +static uint64_t fmix64(uint64_t k) { + k ^= k >> 33; k *= 0xff51afd7ed558ccdULL; k ^= k >> 33; k *= 0xc4ceb9fe1a85ec53ULL; k ^= k >> 33; return k; +} +static void mmh3_x64_128(const void* key, int len, uint32_t seed, uint32_t out[4]) { + const uint8_t* data=(const uint8_t*)key; int nblocks=len/16; + uint64_t h1=seed,h2=seed,c1=0x87c37b91114253d5ULL,c2=0x4cf5ad432745937fULL; + const uint64_t* blocks=(const uint64_t*)data; + for(int i=0;i>32);out[2]=(uint32_t)h2;out[3]=(uint32_t)(h2>>32); +} + +static uint64_t mh2_64a(const void* key, int len, uint64_t seed) { + const uint64_t m=0xc6a4a7935bd1e995ULL;const int r=47; + uint64_t h=seed^(uint64_t(len)*m);auto d=(const uint8_t*)key;int rm=len; + while(rm>=8){uint64_t k;memcpy(&k,d,8);k*=m;k^=k>>r;k*=m;h^=k;h*=m;d+=8;rm-=8;} + switch(rm){case 7:h^=uint64_t(d[6])<<48;case 6:h^=uint64_t(d[5])<<40;case 5:h^=uint64_t(d[4])<<32; + case 4:h^=uint64_t(d[3])<<24;case 3:h^=uint64_t(d[2])<<16;case 2:h^=uint64_t(d[1])<<8;case 1:h^=d[0];h*=m;break;} + h^=h>>r;h*=m;h^=h>>r;return h; +} +static uint64_t hash_feat(const std::string& s){return s.empty()?0:mh2_64a(s.data(),(int)s.size(),0);} + +struct HashEmbed{ + int n_rows=0,nO=0;uint32_t seed=0;std::vector table; + bool load(int r,int o,const float*d){n_rows=r;nO=o;table.assign(d,d+(size_t)r*o);return!table.empty();} + void embed(uint64_t fid,float* out)const{ + uint8_t in[8];for(int i=0;i<8;i++)in[i]=(uint8_t)(fid>>(i*8)); + uint32_t keys[4];mmh3_x64_128(in,8,seed,keys); + for(int v=0;v<4;v++){int idx=(int)(keys[v]%(uint32_t)n_rows);for(int i=0;ibest)best=s;}out[i]=best;} +} +static void layernorm(float* out, const float* in, int d, const float* G, const float* b, float eps) { + float mn=0,vr=0;for(int i=0;i0)memcpy(out,all+(idx-1)*dim,dim*sizeof(float));else memset(out,0,dim*sizeof(float)); + memcpy(out+dim,all+off,dim*sizeof(float));if(idx extract_features(const std::string& t, int n_embed){ + auto fn=[&](const std::string& s){return hash_feat(s);}; + auto fp=[&](const std::string& s){std::string p=s.empty()?"":u8_first(s);std::transform(p.begin(),p.end(),p.begin(),::tolower);return hash_feat(p);}; + auto fs=[&](const std::string& s){std::string su=u8_len(s)>=3?u8_last(s,3):s;std::transform(su.begin(),su.end(),su.begin(),::tolower);return hash_feat(su);}; + auto fsh=[&](const std::string& t2){std::string sh;for(unsigned char c:t2){if(c>0x7F)sh+='x';else if(std::isupper(c))sh+='X';else if(std::islower(c))sh+='x';else if(std::isdigit(c))sh+='d';else sh+=c;}return hash_feat(sh);}; + std::vector ids; + std::string lo=t;std::transform(lo.begin(),lo.end(),lo.begin(),::tolower); + ids.push_back(fn(lo));ids.push_back(fp(t));ids.push_back(fs(t));ids.push_back(fsh(t)); + if(n_embed==6){ids.push_back(1);ids.push_back(0);}else{ids.push_back(0);} + return ids; +} + +// ========================================================================= +// Tok2vec forward pass (shared with NER) +// ========================================================================= +struct Tok2vecModel { + std::vector embeds; + std::vector poW,poB,poG,poB2; bool has_poLN=false; + int po_nO=96,po_nP=3,po_nI=576; + struct ResBlk{bool has=false;std::vectorW,b,lnG,lnb;}; + ResBlk res[4]; int n_res=0; + + bool load(const std::string& dir) { + std::ifstream cf(dir+"/model.ckpt"); if(!cf)return false; + std::stringstream cb;cb< bin(bz/4); bf.read((char*)bin.data(),bz); + auto sl=[&](int64_t o,int64_t c)->std::vector{ + if(o+c>(int64_t)bin.size())return{}; return std::vector(bin.begin()+o,bin.begin()+o+c); + }; + auto ld=[&](const std::string& k, std::vector* v,int* r0=nullptr,int* r1=nullptr,int* r2=nullptr)->bool{ + auto* e=ck.get(k);if(!e)return false; + auto sv=e->get("shape"),ov=e->get("offset"),cv=e->get("count"); + if(!sv||!ov||!cv)return false; + *v=sl(ov->as_i64(),cv->as_i64()); + if(r0)*r0=sv->arr.size()>=1?sv->arr[0].as_int():1; + if(r1)*r1=sv->arr.size()>=2?sv->arr[1].as_int():1; + if(r2)*r2=sv->arr.size()>=3?sv->arr[2].as_int():1; + return!v->empty(); + }; + for(int ei=0;;ei++){ + auto* e=ck.get("embed_"+std::to_string(ei)+"_E");if(!e)break; + auto sv=e->get("shape"),ov=e->get("offset"),cv=e->get("count"); + if(!sv||!ov||!cv)break;int rs=sv->arr[0].as_int(),nO=sv->arr[1].as_int(); + int64_t exp=(int64_t)rs*nO;if(cv->as_i64()as_i64(),cv->as_i64());if(d.empty())break; + embeds.emplace_back();embeds.back().load(rs,nO,d.data()); + } + std::ifstream ff(dir+"/feature_config.json"); + if(ff){std::stringstream fb;fb<type==JVal::ARR)for(int i=0;i<(int)sa->arr.size()&&i<(int)embeds.size();i++)embeds[i].seed=(uint32_t)sa->arr[i].as_int();} + int r0=0,r1=0,r2=0; + if(!ld("poW",&poW,&r0,&r1,&r2))return false; + po_nO=r0;po_nP=r1;po_nI=r2; + if(!ld("poB",&poB))return false; + if(ld("poG",&poG)){ld("poB2",&poB2);has_poLN=true;} + for(int ri=0;ri<4;ri++){auto pk="res"+std::to_string(ri);auto& rb=res[ri]; + if(ld(pk+"W",&rb.W,&r0,&r1,&r2)){ld(pk+"B",&rb.b);ld(pk+"lnG",&rb.lnG);ld(pk+"lnb",&rb.lnb);rb.has=true;n_res++;}} + return!embeds.empty(); + } + + // Run tok2vec → (n_tokens, 96) + void forward(const std::vector& tokens, float* out) { + int n=(int)tokens.size(),D=96,NE=(int)embeds.size(),EC=NE*D; + std::vector emb((size_t)n*EC,0); + for(int i=0;i pe((size_t)n*D); + for(int i=0;i pln((size_t)n*D,0); + if(has_poLN)for(int i=0;i enc=pln; + for(int ri=0;ri exp((size_t)n*wd); + for(int i=0;i mx((size_t)n*D);for(int i=0;i ln((size_t)n*D);if(!res[ri].lnG.empty())for(int i=0;i pW_hid,pb_hid; // 96→64 + std::vector pW_pre,pb_pre,pad_pre; // preaffine + std::vector pW_cls,pb_cls; // classifier + std::vector move_names; + + bool load(const std::string& dir) { + std::ifstream cf(dir+"/model.ckpt"); if(!cf)return false; + std::stringstream cb;cb< bin(bz/4); bf.read((char*)bin.data(),bz); + auto sl=[&](int64_t o,int64_t c)->std::vector{ + if(o+c>(int64_t)bin.size())return{}; return std::vector(bin.begin()+o,bin.begin()+o+c); + }; + auto ld=[&](const std::string& k, std::vector* v,int* r0=nullptr,int* r1=nullptr,int* r2=nullptr)->bool{ + auto* e=ck.get(k);if(!e)return false; + auto sv=e->get("shape"),ov=e->get("offset"),cv=e->get("count"); + if(!sv||!ov||!cv)return false; + *v=sl(ov->as_i64(),cv->as_i64()); + if(r0)*r0=sv->arr.size()>=1?sv->arr[0].as_int():1; + if(r1)*r1=sv->arr.size()>=2?sv->arr[1].as_int():1; + if(r2)*r2=sv->arr.size()>=3?sv->arr[2].as_int():1; + return!v->empty(); + }; + int r0,r1,r2; + if(!ld("pW_hid",&pW_hid,&r0,&r1))return false; + nO=r0;ld("pb_hid",&pb_hid); + if(!ld("pW_pre",&pW_pre,&r0,&r1,&r2))return false; + nP=r0;nO=r1;nI=r2; + ld("pb_pre",&pb_pre);ld("pad_pre",&pad_pre); + if(!ld("pW_cls",&pW_cls,&r0,&r1))return false; + n_actions=r0;ld("pb_cls",&pb_cls); + std::ifstream mf(dir+"/meta.json"); + if(mf){std::stringstream mb;mb<type==JVal::ARR)for(auto& v:mn->arr)move_names.push_back(v.str);} + return!pW_hid.empty() && !pW_pre.empty() && !pW_cls.empty(); + } + + // Run parser forward + state machine → (heads, labels) + void parse(const float* tokvecs, int n_tokens, + std::vector& out_heads, std::vector& out_labels) { + // 1. Hidden layer: 96→64 + std::vector hidden((size_t)n_tokens*nO,0); + for(int i=0;i precomp((size_t)(n_tokens+1)*nP*nO*nI,0); + // Pad token (index 0) + memcpy(precomp.data(), pad_pre.data(), (size_t)nP*nO*nI*sizeof(float)); + // Real tokens + for(int i=0;i float { + int ri = (idx < 0 || idx >= n_tokens) ? 0 : idx + 1; + return precomp[(size_t)ri*nP*nO*nI + (size_t)p*nO*nI + (size_t)w + (size_t)o*nI]; + }; + + // 3. Arc-hybrid state machine + out_heads.assign(n_tokens, -1); + out_labels.assign(n_tokens, ""); + std::vector stack; + std::vector buffer(n_tokens); + for(int i=0;iint{ + for(int i=0;iint{ + int r=-1; for(int i=0;i scores(n_actions,0); + std::vector feats(nP*nI*nO,0); + + for(int step=0; step