ezdata/api/web_apps/rag/cleaner/clean_processor.py-代码预览-ezdata:多源数据管理、AI交互分析与低代码任务调度平台 - AtomGit

40c47aee创建于 2025年11月24日历史提交

import re





class CleanProcessor:
    """Clean raw text: strip invalid symbols, then apply optional rules."""

    @classmethod
    def clean(cls, text: str, process_rule: dict) -> str:
        """Return *text* with invalid symbols removed and rules applied.

        The default clean always runs: the token-like markers ``<|`` and
        ``|>`` are collapsed to ``<`` and ``>``, ASCII control characters
        (except tab, newline and carriage return) are dropped, and the
        U+FFFE noncharacter is dropped.

        Args:
            text: The text to clean.
            process_rule: Optional mapping shaped like
                ``{"rules": {"pre_processing_rules": [{"id": ..., "enabled": ...}]}}``.
                ``None``, an empty mapping, or a mapping without
                pre-processing rules means only the default clean is applied.
                Recognised rule ids are ``"remove_extra_spaces"`` and
                ``"remove_urls_emails"``; a rule runs only when its
                ``"enabled"`` value is exactly ``True``.

        Returns:
            The cleaned text.
        """
        # Default clean: neutralise "<|" / "|>" marker delimiters.
        text = re.sub(r'<\|', '<', text)
        text = re.sub(r'\|>', '>', text)
        # Drop control characters. The class deliberately does NOT contain
        # \xEF, \xBF or \xBE: on a str those are the characters "ï", "¿" and
        # "¾" (not the UTF-8 bytes of U+FFFE), and removing them corrupts
        # ordinary Latin-1 text.
        text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', text)
        # Unicode U+FFFE
        text = re.sub('\uFFFE', '', text)

        # A missing/empty process_rule means "no optional rules" rather than
        # an error; previously this path raised TypeError on `in None`.
        rules = (process_rule or {}).get('rules') or {}
        pre_processing_rules = rules.get('pre_processing_rules') or []

        for pre_processing_rule in pre_processing_rules:
            # Only an explicit boolean True enables a rule.
            if pre_processing_rule.get("enabled") is not True:
                continue

            rule_id = pre_processing_rule.get("id")
            if rule_id == "remove_extra_spaces":
                # Collapse runs of 3+ newlines to a single blank line.
                text = re.sub(r'\n{3,}', '\n\n', text)
                # Collapse runs of 2+ horizontal whitespace characters
                # (ASCII and Unicode spaces) to one regular space.
                pattern = r'[\t\f\r\x20\u00a0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]{2,}'
                text = re.sub(pattern, ' ', text)
            elif rule_id == "remove_urls_emails":
                # Remove email addresses first, then http(s) URLs.
                pattern = r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)'
                text = re.sub(pattern, '', text)
                pattern = r'https?://[^\s]+'
                text = re.sub(pattern, '', text)

        return text

    def filter_string(self, text):
        """Return *text* unchanged; no filtering is implemented here."""
        return text