diff --git a/rag/flow/tokenizer/tokenizer.py b/rag/flow/tokenizer/tokenizer.py index 467594a312..d38554aec4 100644 --- a/rag/flow/tokenizer/tokenizer.py +++ b/rag/flow/tokenizer/tokenizer.py @@ -90,24 +90,22 @@ class Tokenizer(ProcessBase): vts, c = embedding_model.encode([name]) token_count += c - tts = np.concatenate([vts[0] for _ in range(len(texts))], axis=0) + tts = np.tile(vts[0], (len(texts), 1)) @timeout(60) def batch_encode(txts): nonlocal embedding_model return embedding_model.encode([truncate(c, embedding_model.max_length - 10) for c in txts]) - cnts_ = np.array([]) + cnts_batches = [] for i in range(0, len(texts), settings.EMBEDDING_BATCH_SIZE): async with embed_limiter: vts, c = await thread_pool_exec(batch_encode,texts[i : i + settings.EMBEDDING_BATCH_SIZE],) - if len(cnts_) == 0: - cnts_ = vts - else: - cnts_ = np.concatenate((cnts_, vts), axis=0) + cnts_batches.append(vts) token_count += c if i % 33 == 32: self.callback(i * 1.0 / len(texts) / parts / settings.EMBEDDING_BATCH_SIZE + 0.5 * (parts - 1)) + cnts_ = np.vstack(cnts_batches) if cnts_batches else np.array([]) cnts = cnts_ title_w = float(self._param.filename_embd_weight) diff --git a/rag/llm/embedding_model.py b/rag/llm/embedding_model.py index 2d9f0b322c..9fe1095527 100644 --- a/rag/llm/embedding_model.py +++ b/rag/llm/embedding_model.py @@ -73,14 +73,12 @@ class BuiltinEmbed(Base): batch_size = 16 # TEI is able to auto truncate inputs according to https://github.com/huggingface/text-embeddings-inference. token_count = 0 - ress = None + batches = [] for i in range(0, len(texts), batch_size): embeddings, token_count_delta = self._model.encode(texts[i : i + batch_size]) token_count += token_count_delta - if ress is None: - ress = embeddings - else: - ress = np.concatenate((ress, embeddings), axis=0) + batches.append(embeddings) + ress = np.vstack(batches) if batches else np.array([]) return ress, token_count def encode_queries(self, text: str): diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py index f1edd45f7a..4d56327842 100644 --- a/rag/svr/task_executor.py +++ b/rag/svr/task_executor.py @@ -627,17 +627,14 @@ async def embedding(docs, mdl, parser_config=None, callback=None): nonlocal mdl return mdl.encode([truncate(c, mdl.max_length - 10) for c in txts]) - cnts_ = np.array([]) + cnts_batches = [] for i in range(0, len(cnts), settings.EMBEDDING_BATCH_SIZE): async with embed_limiter: vts, c = await thread_pool_exec(batch_encode, cnts[i: i + settings.EMBEDDING_BATCH_SIZE]) - if len(cnts_) == 0: - cnts_ = vts - else: - cnts_ = np.concatenate((cnts_, vts), axis=0) + cnts_batches.append(vts) tk_count += c callback(prog=0.7 + 0.2 * (i + 1) / len(cnts), msg="") - cnts = cnts_ + cnts = np.vstack(cnts_batches) if cnts_batches else np.array([]) filename_embd_weight = parser_config.get("filename_embd_weight", 0.1) # due to the db support none value if not filename_embd_weight: filename_embd_weight = 0.1 @@ -720,21 +717,19 @@ async def run_dataflow(task: dict): nonlocal embedding_model return embedding_model.encode([truncate(c, embedding_model.max_length - 10) for c in txts]) - vects = np.array([]) + vects_batches = [] texts = [o.get("questions", o.get("summary", o["text"])) for o in chunks] delta = 0.20 / (len(texts) // settings.EMBEDDING_BATCH_SIZE + 1) prog = 0.8 for i in range(0, len(texts), settings.EMBEDDING_BATCH_SIZE): async with embed_limiter: vts, c = await thread_pool_exec(batch_encode, texts[i: i + settings.EMBEDDING_BATCH_SIZE]) - if len(vects) == 0: - vects = vts - else: - vects = np.concatenate((vects, vts), axis=0) + vects_batches.append(vts) embedding_token_consumption += c prog += delta if i % (len(texts) // settings.EMBEDDING_BATCH_SIZE / 100 + 1) == 1: set_progress(task_id, prog=prog, msg=f"{i + 1} / {len(texts) // settings.EMBEDDING_BATCH_SIZE}") + vects = np.vstack(vects_batches) if vects_batches else np.array([]) assert len(vects) == len(chunks) for i, ck in enumerate(chunks):