b8e9a80d创建于 2025年9月18日历史提交
import mimetypes
import os
from datetime import datetime
from pathlib import Path
from typing import List, Optional


def extract_file_metadata(file_path) -> dict:
    """Extracts metadata from a single file."""
    if not os.path.exists(file_path):
        raise FileNotFoundError(file_path)

    file_metadata = {
        "file_name": os.path.basename(file_path),
        "file_path": file_path,
        "file_type": mimetypes.guess_type(file_path)[0] or "unknown",
        "file_size": os.path.getsize(file_path),
        "file_creation_date": datetime.fromtimestamp(os.path.getctime(file_path)).strftime("%Y-%m-%d"),
        "file_last_modified_date": datetime.fromtimestamp(os.path.getmtime(file_path)).strftime("%Y-%m-%d"),
    }
    return file_metadata


def extract_metadata_from_files(file_list):
    """Extracts metadata for a list of files."""
    metadata = []
    for file_path in file_list:
        file_metadata = extract_file_metadata(file_path)
        if file_metadata:
            metadata.append(file_metadata)
    return metadata


def get_filenames_in_dir(
    input_dir: str, recursive: bool = True, required_exts: Optional[List[str]] = None, exclude: Optional[List[str]] = None
):
    """
    Recursively reads files from the directory, applying required_exts and exclude filters.
    Ensures that required_exts and exclude do not overlap.

    Args:
        input_dir (str): The directory to scan for files.
        recursive (bool): Whether to scan directories recursively.
        required_exts (list): List of file extensions to include (e.g., ['pdf', 'txt']).
                             If None or empty, matches any file extension.
        exclude (list): List of file patterns to exclude (e.g., ['*png', '*jpg']).

    Returns:
        list: A list of matching file paths.
    """
    required_exts = required_exts or []
    exclude = exclude or []

    # Ensure required_exts and exclude do not overlap
    ext_set = set(required_exts)
    exclude_set = set(exclude)
    overlap = ext_set & exclude_set
    if overlap:
        raise ValueError(f"Extensions in required_exts and exclude overlap: {overlap}")

    def is_excluded(file_name):
        """Check if a file matches any pattern in the exclude list."""
        for pattern in exclude:
            if Path(file_name).match(pattern):
                return True
        return False

    files = []
    search_pattern = "**/*" if recursive else "*"

    for file_path in Path(input_dir).glob(search_pattern):
        if file_path.is_file() and not is_excluded(file_path.name):
            ext = file_path.suffix.lstrip(".")
            # If required_exts is empty, match any file
            if not required_exts or ext in required_exts:
                files.append(str(file_path))

    return files


def assert_all_files_exist_locally(file_paths: List[str]) -> bool:
    """
    Checks if all file paths in the provided list exist locally.
    Raises a FileNotFoundError with a list of missing files if any do not exist.

    Args:
        file_paths (List[str]): List of file paths to check.

    Returns:
        bool: True if all files exist, raises FileNotFoundError if any file is missing.
    """
    missing_files = [file_path for file_path in file_paths if not Path(file_path).exists()]

    if missing_files:
        raise FileNotFoundError(missing_files)

    return True