import mindspeed.megatron_adaptor
from mindspeed_mm.data.data_utils.utils import DataFileReader, TextProcesser
from tests.ut.utils import judge_expression
class TestDataFileReader:
    """Smoke tests for constructing and calling ``DataFileReader``."""

    def test_init_datafilereader(self):
        """A reader built with no arguments is a ``DataFileReader`` instance."""
        reader = DataFileReader()
        judge_expression(isinstance(reader, DataFileReader))

    def test_combine_mode(self):
        """Calling a "combine"-mode reader on the data path yields a list."""
        config = {
            "data_path": "/home/ci_resource/data/OpenSoraPlan/v1.2/pixabay_v2/train_data.txt",
            "data_storage_mode": "combine"
        }
        storage_mode = config["data_storage_mode"]
        reader = DataFileReader(data_storage_mode=storage_mode)
        samples = reader(config["data_path"])
        judge_expression(isinstance(samples, list))

    def test_standard_mode(self):
        """Calling a "standard"-mode reader on the data path yields a list."""
        # NOTE(review): "data_folder" is declared but never read by this test;
        # confirm whether the reader is expected to receive it.
        config = {
            "data_path": "/home/ci_resource/data/cogvideox1_0/data.jsonl",
            "data_folder": "/home/ci_resource/data/cogvideox1_0/",
            "data_storage_mode": "standard"
        }
        storage_mode = config["data_storage_mode"]
        reader = DataFileReader(data_storage_mode=storage_mode)
        samples = reader(config["data_path"])
        judge_expression(isinstance(samples, list))
class TestTextProcesser:
    """Smoke test for ``TextProcesser.clean_caption``."""

    def test_clean_caption(self):
        """``clean_caption`` returns a ``str`` for every sample caption."""
        # Each entry pairs a raw caption with a second string that looks like
        # the intended cleaned form.
        # NOTE(review): the second element is never compared with the actual
        # result; only the return type is checked. Confirm the real output of
        # clean_caption before turning this into an equality assertion.
        caption_cases = [
            (" Hello World ", "Hello World"),
            ("Hello\tWorld", "Hello World"),
            ("Hello\nWorld", "Hello World"),
            (" Hello \t World \n ", "Hello World"),
        ]
        for raw_caption, _unchecked_expected in caption_cases:
            cleaned = TextProcesser.clean_caption(raw_caption)
            is_text = isinstance(cleaned, str)
            judge_expression(is_text)