Add a Rust `batch_encode` method as a faster alternative to calling `encode` on each text

This commit is contained in:
Barış Özmen
2025-12-18 19:17:59 +03:00
parent d5759400f9
commit 790f3be65c
2 changed files with 97 additions and 0 deletions
+16
View File
@@ -465,6 +465,22 @@ impl Tokenizer {
all_ids
}
/// Tokenize a batch of texts concurrently on the rayon thread pool.
///
/// The Python GIL is released while the encoding work runs. Each input
/// text is passed to `encode`, and the result holds one vector of token
/// IDs per input text.
#[pyo3(signature = (texts))]
#[pyo3(text_signature = "(self, texts)")]
pub fn batch_encode(&self, py: Python<'_>, texts: Vec<String>) -> PyResult<Vec<Vec<u32>>> {
    // Run the whole batch without holding the GIL; rayon fans the texts
    // out across its worker threads inside the closure.
    let token_ids: Vec<Vec<u32>> = py.allow_threads(|| {
        texts
            .par_iter()
            .map(|text| self.encode(text))
            .collect()
    });
    Ok(token_ids)
}
}
#[pymodule]