// =========================================================================
// Overview (review notes added; the code text below is unchanged)
//
// What the visible code contains:
//  - JVal / JParser: a minimal JSON value type and parser. JVal::get()
//    returns nullptr when the key is absent; as_int()/as_i64() cast the
//    stored double `num`.
//  - getblock64 / fmix64 / mmh3_x64_128 / mh2_64a: hashing routines.
//    hash_feat() returns 0 for an empty string, otherwise mh2_64a over the
//    string bytes with seed 0. The tail `switch` in mh2_64a relies on case
//    fallthrough (no `break` between case 7 .. case 1).
//  - HashEmbed: holds n_rows, nO, seed and a table. embed() serialises the
//    feature id into 8 bytes (least-significant byte first), hashes them
//    with mmh3_x64_128 using `seed`, and reduces each of the four 32-bit
//    keys modulo n_rows.
//  - extract_features(t, n_embed): pushes, in order, the hash of the
//    lower-cased token, of the lower-cased u8_first(t), of the lower-cased
//    u8_last(t,3) (or the whole token when u8_len(t) < 3), and of a shape
//    string (upper -> 'X', lower or byte > 0x7F -> 'x', digit -> 'd', other
//    bytes copied). It then appends 1,0 when n_embed == 6, else a single 0.
//  - Tok2vecModel: load(dir) opens dir + "/model.ckpt" and
//    dir + "/feature_config.json"; it looks up "embed_N_E" entries until
//    one is missing, requires "poW" and "poB", optionally loads
//    "poG"/"poB2", and tries "resNW"/"resNB"/"resNlnG"/"resNlnb" for
//    N = 0..3. It returns whether at least one embedding was loaded.
//    forward() hard-codes the width D = 96.
//  - Parser model (its struct header is not visible here): load(dir)
//    requires "pW_hid", "pW_pre" and "pW_cls", loads "pb_hid", "pb_pre",
//    "pad_pre" and "pb_cls" without checking the result, and fills
//    move_names from dir + "/meta.json". parse() initialises out_heads to
//    -1 and out_labels to "" for every token.
//
// NOTE(review): this text looks extraction-damaged. Spans that began with
//   '<' and ran to a later '>' (include names, template arguments, many
//   loop bodies, some function/struct headers) are missing, and original
//   line breaks are gone, so each `//` banner now comments out the code
//   that follows it on the same physical line. Restore the file from its
//   original before making any code change.
// NOTE(review): both load() functions size `bin` as bz/4 elements but read
//   bz bytes into it. Presumably the elements are 4-byte floats (the
//   element type is not visible) - if bz is not a multiple of 4 the read
//   would run past the buffer. TODO confirm and round up or reject.
// NOTE(review): the embed_N_E loop indexes sv->arr[0] and sv->arr[1]
//   without the arr.size() checks that the `ld` lambda performs.
// NOTE(review): the `sl` lambda only rejects o + c > bin.size(); negative
//   offset/count values from the checkpoint are not rejected in the
//   visible code.
// NOTE(review): ld("pad_pre", ...) is unchecked, yet parse() memcpy's
//   nP*nO*nI floats from pad_pre.data(); an absent "pad_pre" entry would
//   make that read out of bounds. Verify against the checkpoint writer.
// NOTE(review): in the parser load(), nO is set from the pW_hid shape and
//   then overwritten from the pW_pre shape - confirm that is intended.
// NOTE(review): std::transform(..., ::tolower) is applied to plain `char`
//   elements; for bytes > 0x7F on signed-char platforms the argument is
//   negative, which tolower does not permit. The shape lambda already
//   iterates as `unsigned char`.
// NOTE(review): ROTL64(x,r) does not parenthesise its arguments.
// =========================================================================
#include "thinc_parser.h" #include #include #include #include #include #include #include #include #include #include // ========================================================================= // Minimal JSON parser (replicated from thinc_ner.cpp) // ========================================================================= namespace { struct JVal { enum Type{NUL,OBJ,ARR,STR,NUM,BOOL}type=NUL; std::string str; std::vector arr; std::unordered_map obj; double num=0; const JVal* get(const std::string& k)const{auto it=obj.find(k);return it!=obj.end()?&it->second:nullptr;} int as_int()const{return(int)num;}int64_t as_i64()const{return(int64_t)num;} }; struct JParser { const char *p,*e; char pk(){while(p='0'&&*p<='9'))++p; if(p='0'&&*p<='9'))++p;} if(p='0'&&*p<='9'))++p;} if(s=4&&*p=='t'){v.str="true";p+=4;}else if(e-p>=5&&*p=='f'){v.str="false";p+=5;}return v;} JVal parse(const std::string& j){p=j.data();e=p+j.size();return pv();} }; // ========================================================================= // HashEmbed + MurmurHash (copied from thinc_ner.cpp) // ========================================================================= #define ROTL64(x,r) ((x << r) | (x >> (64 - r))) static uint64_t getblock64(const uint64_t* p, size_t i) { uint64_t r; memcpy(&r, p+i, 8); return r; } static uint64_t fmix64(uint64_t k) { k ^= k >> 33; k *= 0xff51afd7ed558ccdULL; k ^= k >> 33; k *= 0xc4ceb9fe1a85ec53ULL; k ^= k >> 33; return k; } static void mmh3_x64_128(const void* key, int len, uint32_t seed, uint32_t out[4]) { const uint8_t* data=(const uint8_t*)key; int nblocks=len/16; uint64_t h1=seed,h2=seed,c1=0x87c37b91114253d5ULL,c2=0x4cf5ad432745937fULL; const uint64_t* blocks=(const uint64_t*)data; for(int i=0;i>32);out[2]=(uint32_t)h2;out[3]=(uint32_t)(h2>>32); } static uint64_t mh2_64a(const void* key, int len, uint64_t seed) { const uint64_t m=0xc6a4a7935bd1e995ULL;const int r=47; uint64_t h=seed^(uint64_t(len)*m);auto d=(const uint8_t*)key;int rm=len; while(rm>=8){uint64_t 
k;memcpy(&k,d,8);k*=m;k^=k>>r;k*=m;h^=k;h*=m;d+=8;rm-=8;} switch(rm){case 7:h^=uint64_t(d[6])<<48;case 6:h^=uint64_t(d[5])<<40;case 5:h^=uint64_t(d[4])<<32; case 4:h^=uint64_t(d[3])<<24;case 3:h^=uint64_t(d[2])<<16;case 2:h^=uint64_t(d[1])<<8;case 1:h^=d[0];h*=m;break;} h^=h>>r;h*=m;h^=h>>r;return h; } static uint64_t hash_feat(const std::string& s){return s.empty()?0:mh2_64a(s.data(),(int)s.size(),0);} struct HashEmbed{ int n_rows=0,nO=0;uint32_t seed=0;std::vector table; bool load(int r,int o,const float*d){n_rows=r;nO=o;table.assign(d,d+(size_t)r*o);return!table.empty();} void embed(uint64_t fid,float* out)const{ uint8_t in[8];for(int i=0;i<8;i++)in[i]=(uint8_t)(fid>>(i*8)); uint32_t keys[4];mmh3_x64_128(in,8,seed,keys); for(int v=0;v<4;v++){int idx=(int)(keys[v]%(uint32_t)n_rows);for(int i=0;ibest)best=s;}out[i]=best;} } static void layernorm(float* out, const float* in, int d, const float* G, const float* b, float eps) { float mn=0,vr=0;for(int i=0;i0)memcpy(out,all+(idx-1)*dim,dim*sizeof(float));else memset(out,0,dim*sizeof(float)); memcpy(out+dim,all+off,dim*sizeof(float));if(idx extract_features(const std::string& t, int n_embed){ auto fn=[&](const std::string& s){return hash_feat(s);}; auto fp=[&](const std::string& s){std::string p=s.empty()?"":u8_first(s);std::transform(p.begin(),p.end(),p.begin(),::tolower);return hash_feat(p);}; auto fs=[&](const std::string& s){std::string su=u8_len(s)>=3?u8_last(s,3):s;std::transform(su.begin(),su.end(),su.begin(),::tolower);return hash_feat(su);}; auto fsh=[&](const std::string& t2){std::string sh;for(unsigned char c:t2){if(c>0x7F)sh+='x';else if(std::isupper(c))sh+='X';else if(std::islower(c))sh+='x';else if(std::isdigit(c))sh+='d';else sh+=c;}return hash_feat(sh);}; std::vector ids; std::string lo=t;std::transform(lo.begin(),lo.end(),lo.begin(),::tolower); ids.push_back(fn(lo));ids.push_back(fp(t));ids.push_back(fs(t));ids.push_back(fsh(t)); if(n_embed==6){ids.push_back(1);ids.push_back(0);}else{ids.push_back(0);} 
return ids; } // ========================================================================= // Tok2vec forward pass (shared with NER) // ========================================================================= struct Tok2vecModel { std::vector embeds; std::vector poW,poB,poG,poB2; bool has_poLN=false; int po_nO=96,po_nP=3,po_nI=576; struct ResBlk{bool has=false;std::vectorW,b,lnG,lnb;}; ResBlk res[4]; int n_res=0; bool load(const std::string& dir) { std::ifstream cf(dir+"/model.ckpt"); if(!cf)return false; std::stringstream cb;cb< bin(bz/4); bf.read((char*)bin.data(),bz); auto sl=[&](int64_t o,int64_t c)->std::vector{ if(o+c>(int64_t)bin.size())return{}; return std::vector(bin.begin()+o,bin.begin()+o+c); }; auto ld=[&](const std::string& k, std::vector* v,int* r0=nullptr,int* r1=nullptr,int* r2=nullptr)->bool{ auto* e=ck.get(k);if(!e)return false; auto sv=e->get("shape"),ov=e->get("offset"),cv=e->get("count"); if(!sv||!ov||!cv)return false; *v=sl(ov->as_i64(),cv->as_i64()); if(r0)*r0=sv->arr.size()>=1?sv->arr[0].as_int():1; if(r1)*r1=sv->arr.size()>=2?sv->arr[1].as_int():1; if(r2)*r2=sv->arr.size()>=3?sv->arr[2].as_int():1; return!v->empty(); }; for(int ei=0;;ei++){ auto* e=ck.get("embed_"+std::to_string(ei)+"_E");if(!e)break; auto sv=e->get("shape"),ov=e->get("offset"),cv=e->get("count"); if(!sv||!ov||!cv)break;int rs=sv->arr[0].as_int(),nO=sv->arr[1].as_int(); int64_t exp=(int64_t)rs*nO;if(cv->as_i64()as_i64(),cv->as_i64());if(d.empty())break; embeds.emplace_back();embeds.back().load(rs,nO,d.data()); } std::ifstream ff(dir+"/feature_config.json"); if(ff){std::stringstream fb;fb<type==JVal::ARR)for(int i=0;i<(int)sa->arr.size()&&i<(int)embeds.size();i++)embeds[i].seed=(uint32_t)sa->arr[i].as_int();} int r0=0,r1=0,r2=0; if(!ld("poW",&poW,&r0,&r1,&r2))return false; po_nO=r0;po_nP=r1;po_nI=r2; if(!ld("poB",&poB))return false; if(ld("poG",&poG)){ld("poB2",&poB2);has_poLN=true;} for(int ri=0;ri<4;ri++){auto pk="res"+std::to_string(ri);auto& rb=res[ri]; 
if(ld(pk+"W",&rb.W,&r0,&r1,&r2)){ld(pk+"B",&rb.b);ld(pk+"lnG",&rb.lnG);ld(pk+"lnb",&rb.lnb);rb.has=true;n_res++;}} return!embeds.empty(); } // Run tok2vec → (n_tokens, 96) void forward(const std::vector& tokens, float* out) { int n=(int)tokens.size(),D=96,NE=(int)embeds.size(),EC=NE*D; std::vector emb((size_t)n*EC,0); for(int i=0;i pe((size_t)n*D); for(int i=0;i pln((size_t)n*D,0); if(has_poLN)for(int i=0;i enc=pln; for(int ri=0;ri exp((size_t)n*wd); for(int i=0;i mx((size_t)n*D);for(int i=0;i ln((size_t)n*D);if(!res[ri].lnG.empty())for(int i=0;i pW_hid,pb_hid; // 96→64 std::vector pW_pre,pb_pre,pad_pre; // preaffine std::vector pW_cls,pb_cls; // classifier std::vector move_names; bool load(const std::string& dir) { std::ifstream cf(dir+"/model.ckpt"); if(!cf)return false; std::stringstream cb;cb< bin(bz/4); bf.read((char*)bin.data(),bz); auto sl=[&](int64_t o,int64_t c)->std::vector{ if(o+c>(int64_t)bin.size())return{}; return std::vector(bin.begin()+o,bin.begin()+o+c); }; auto ld=[&](const std::string& k, std::vector* v,int* r0=nullptr,int* r1=nullptr,int* r2=nullptr)->bool{ auto* e=ck.get(k);if(!e)return false; auto sv=e->get("shape"),ov=e->get("offset"),cv=e->get("count"); if(!sv||!ov||!cv)return false; *v=sl(ov->as_i64(),cv->as_i64()); if(r0)*r0=sv->arr.size()>=1?sv->arr[0].as_int():1; if(r1)*r1=sv->arr.size()>=2?sv->arr[1].as_int():1; if(r2)*r2=sv->arr.size()>=3?sv->arr[2].as_int():1; return!v->empty(); }; int r0,r1,r2; if(!ld("pW_hid",&pW_hid,&r0,&r1))return false; nO=r0;ld("pb_hid",&pb_hid); if(!ld("pW_pre",&pW_pre,&r0,&r1,&r2))return false; nP=r0;nO=r1;nI=r2; ld("pb_pre",&pb_pre);ld("pad_pre",&pad_pre); if(!ld("pW_cls",&pW_cls,&r0,&r1))return false; n_actions=r0;ld("pb_cls",&pb_cls); std::ifstream mf(dir+"/meta.json"); if(mf){std::stringstream mb;mb<type==JVal::ARR)for(auto& v:mn->arr)move_names.push_back(v.str);} return!pW_hid.empty() && !pW_pre.empty() && !pW_cls.empty(); } // Run parser forward + state machine → (heads, labels) void parse(const float* 
tokvecs, int n_tokens, std::vector& out_heads, std::vector& out_labels) { // 1. Hidden layer: 96→64 std::vector hidden((size_t)n_tokens*nO,0); for(int i=0;i precomp((size_t)(n_tokens+1)*nP*nO*nI,0); // Pad token (index 0) memcpy(precomp.data(), pad_pre.data(), (size_t)nP*nO*nI*sizeof(float)); // Real tokens for(int i=0;i float { int ri = (idx < 0 || idx >= n_tokens) ? 0 : idx + 1; return precomp[(size_t)ri*nP*nO*nI + (size_t)p*nO*nI + (size_t)w + (size_t)o*nI]; }; // 3. Arc-hybrid state machine out_heads.assign(n_tokens, -1); out_labels.assign(n_tokens, ""); std::vector stack; std::vector buffer(n_tokens); for(int i=0;iint{ for(int i=0;iint{ int r=-1; for(int i=0;i scores(n_actions,0); std::vector feats(nP*nI*nO,0); for(int step=0; step