from typing import List, Optional, Tuple
class TextSplitter:
    """Split text into chunks of at most ``chunk_size`` characters.

    The logic:

    - Take the first ``chunk_size`` characters of the remaining text and look
      for the last occurrence of a separator near the end of that piece.
    - Separators are tried in priority order. The separator with zero-based
      index ``num`` is accepted only if its distance from the end of the
      piece is smaller than::

          k_range * chunk_size / (num * k_ratio + 1)

      otherwise the next separator is tried.
    - If the accepted separator is "weak" (empty, or made only of spaces) the
      cut is probably in the middle of a sentence, so overlapping is used:

      - the text before the weak separator is the current chunk;
      - the search is repeated once with a relaxed range
        (``k_range * 1.5`` and ``k_ratio = 0``);
      - if that finds a "strong" separator less than ``chunk_overlap``
        characters before the cut, the next chunk starts right after the
        strong separator, so the tail of the current chunk is repeated.
    """

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        separators: Optional[List[str]] = None,
        k_range: float = 0.5,
        k_ratio: float = 1,
    ):
        """
        :param chunk_size: size of the chunk, which must not be exceeded
        :param chunk_overlap: maximum number of characters that the next chunk
            may repeat from the end of the current one
        :param separators: list of separators in order of priority; defaults
            to ``["\\n\\n", "\\n", ". ", " ", ""]``
        :param k_range: defines the range to look for the separator
        :param k_ratio: defines how much to shrink the range for the next separator
        """
        if separators is None:
            separators = ["\n\n", "\n", ". ", " ", ""]
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.separators = separators
        self.k_range = k_range
        self.k_ratio = k_ratio

    def split_text(self, text: str) -> List[str]:
        """Split ``text`` into a list of chunks.

        Text that already fits into ``chunk_size`` is returned as a single
        chunk. Empty chunks are never emitted, so empty input gives ``[]``.

        :param text: the text to split
        :return: the chunks in document order
        :raises RuntimeError: if no separator can be found for some piece
        """
        chunks: List[str] = []
        # Only text that actually exceeds chunk_size needs to be cut.
        while len(text) > self.chunk_size:
            _, chunk, shift = self.get_next_chunk(text, self.k_range, self.k_ratio)
            if shift <= 0:
                # Defensive: without forward progress this loop never ends.
                raise RuntimeError("Cannot split text")
            if chunk:
                chunks.append(chunk)
            text = text[shift:]
        # The remainder is empty when the text ended exactly on a split point.
        if text:
            chunks.append(text)
        return chunks

    def get_next_chunk(
        self, text: str, k_range: float, k_ratio: float
    ) -> Tuple[str, str, int]:
        """Cut the next chunk from the beginning of ``text``.

        :param text: the remaining text
        :param k_range: defines the range to look for the separator
        :param k_ratio: defines how much to shrink the range for the next separator
        :return: tuple ``(sep, chunk, shift)``: the separator the chunk was cut
            at, the chunk itself (separator excluded) and the offset in
            ``text`` at which the next chunk starts
        :raises RuntimeError: if no separator is found inside its allowed range
        """
        piece = text[: self.chunk_size]
        found = self._find_separator(piece, k_range, k_ratio)
        if found is None:
            raise RuntimeError("Cannot split text")
        sep, pos = found
        shift = pos + len(sep)

        if sep.strip(" ") == "":
            # Weak separator: search once more with a relaxed range for a
            # strong separator to start the next chunk at.
            relaxed = self._find_separator(piece, k_range * 1.5, 0)
            if relaxed is not None:
                strong_sep, strong_pos = relaxed
                strong_shift = strong_pos + len(strong_sep)
                # The lower bound keeps the next chunk from starting after
                # the cut, which would silently drop text.
                if (
                    strong_sep.strip(" ") != ""
                    and 0 <= shift - strong_shift < self.chunk_overlap
                ):
                    shift = strong_shift
        return sep, piece[:pos], shift

    def _find_separator(
        self, piece: str, k_range: float, k_ratio: float
    ) -> Optional[Tuple[str, int]]:
        """Return the first separator (by priority) lying inside its allowed range.

        :return: ``(separator, position of its last occurrence in piece)`` or
            ``None`` when no separator qualifies
        """
        for num, sep in enumerate(self.separators):
            pos = piece.rfind(sep)
            if pos == -1:
                # str.rfind signals "not found" with -1; it is not a position.
                continue
            distance_from_end = self.chunk_size - pos
            if distance_from_end < k_range * self.chunk_size / (num * k_ratio + 1):
                return sep, pos
        return None