diff --git a/.gitignore b/.gitignore index 5f3278ac9ba3ba2cd7ebc5a68204a44efe8670a9..7be8801a6799b03436074622d439ea3111019d6f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,34 +1,7 @@ - -# CSV test data (can be generated by create_sample_data.py) -*.csv - -# Great Expectations uncommitted (local execution results) -uncommitted/ -gx/uncommitted/ - -# Python __pycache__/ -*.py[cod] -*$py.class -*.so -.Python -*.egg-info/ -dist/ -build/ - -# Jupyter Notebook -.ipynb_checkpoints - -# Environment +*.pyc .env -.venv -venv/ - -# IDE -.vscode/ -.idea/ -*.swp -*.swo - -# macOS +*.xlsx~ +~$* .DS_Store +data/ diff --git a/README.md b/README.md index 3c29b33ce80f99966c5f4b5bbdec93f2d94146e2..7a14670e4d77bac78f9dfa37bf031863806a1009 100644 --- a/README.md +++ b/README.md @@ -1,238 +1,21 @@ -# ETSI Data Quality Assessment Project +# ETSI Data Quality Assessment System -Implementation of data quality metrics based on ETSI TR 104 180 standard using Great Expectations framework. +Web-based data quality validation tool based on the ETSI data quality framework. -## Overview +## Structure -This project provides a comprehensive implementation of data quality metrics defined in ETSI TR 104 180, integrating Great Expectations validation capabilities with precise ETSI formula calculations. - -## Implemented Metrics - -### 1. Completeness - ETSI 5.1 -- **Formula**: `C_D = (Non-null cells / Total cells) × 100` -- Measures the proportion of non-missing values in the dataset - -### 2. Accuracy - ETSI 5.4 -- **Formula**: `Accuracy = (Correct values / Total values) × 100` -- Validates data against specified ranges, formats, or reference values - -### 3. 
Consistency - ETSI 5.6 -- **Formula**: `Consistency = (Consistent values / Total values) × 100` -- Checks adherence to predefined value sets or business rules - -## Key Features - -- **Dual Validation Approach**: Combines Great Expectations validation with ETSI formula calculations -- **Comprehensive Reporting**: Detailed output showing both GX validation results and ETSI metric scores -- **Null-Inclusive Calculation**: Follows ETSI standard by including null values in total count -- **Tolerance Support**: Accuracy checks support configurable tolerance levels for numerical comparisons - -## Installation - -### Prerequisites -- Python 3.9 or higher -- Conda or virtualenv - -### Setup -```bash -# Create virtual environment -conda create -n ge_test python=3.9 -conda activate ge_test - -# Install required packages -pip install great-expectations pandas -``` - -## Usage - -### 1. Generate Test Data -```bash -python create_sample_data.py -``` - -This creates three CSV files for testing: -- `pump_data.csv` - Completeness test (with missing values) -- `products.csv` - Accuracy test (weight comparison) -- `product_categories.csv` - Consistency test (category validation) - -### 2. 
Run Data Quality Assessment -```bash -python data_quality_assessment.py -``` - -## Project Structure -``` -gx/ -├── data_quality_assessment.py # Main assessment script -├── create_sample_data.py # Test data generator -├── init_gx_project.py # GX project initialization -├── gx/ # Great Expectations configuration -│ ├── great_expectations.yml -│ ├── checkpoints/ -│ ├── expectations/ -│ └── plugins/ -├── README.md -└── .gitignore -``` - -## Example Output -``` -Test 1: Completeness (Pump Performance Data) -============================================================ -Dataset: pump_data.csv -Size: 4 rows × 5 columns - -=== Completeness === -[GX Validation] - Timestamp : Pass - PumpID : Pass - Temperature_C : Fail - Vibration_mm_s : Fail - Pressure_kPa : Pass - -[ETSI Calculation] - Total cells: 20 (4 rows × 5 cols) - Non-null values: 18 - Missing values: 2 - Completeness score: 90.00% - Formula: C_D = (Non-null / Total) × 100 -============================================================ - -Test 2: Accuracy (Product Weight Data) -============================================================ -=== Accuracy: ListedWeight_kg vs TrueWeight_kg === - -[ETSI Calculation] - Total products: 4 - Accurate weight: 3 - Inaccurate weight: 1 - Tolerance: ±0.1 kg - Accuracy score: 75.00% - - [Inaccurate Products Details] - PROD-B: Listed=10.2kg, True=10.5kg, Diff=0.30kg -============================================================ - -Test 3: Consistency (Product Category Data) -============================================================ -=== Consistency: Category === - -[GX Validation] - Result: Fail - Allowed values: ['Electronics', 'Furniture', 'Clothing'] - -[ETSI Calculation] - Total values: 4 - Consistent values: 3 - Inconsistent values: 1 - Consistency score: 75.00% -============================================================ -``` - -## Implementation Details - -### GX vs ETSI Calculation - -| Aspect | GX Approach | ETSI Approach | Implementation | 
-|:-------|:-----------|:-------------|:--------------| -| Denominator | Non-null values only | All values (including nulls) | ETSI formula applied | -| Null Handling | Excluded from calculation | Counted as missing/incorrect | Nulls included | -| Result Format | Pass/Fail + count | Percentage score | Both provided | - -### Code Architecture -```python -# Each metric function follows this pattern: - -def etsi_metric(validator, df, ...): - # 1. GX validation for Pass/Fail - result = validator.expect_column_values_to_...() - - # 2. ETSI formula calculation - total_values = len(df[column]) # Include nulls - correct_values = ... - etsi_score = (correct_values / total_values) * 100 - - # 3. Return both results - return { - "etsi_score": round(etsi_score, 2), - "gx_validation": result.success - } -``` - -## Technical References - -- **ETSI TR 104 180**: Data Quality Metrics and Scoring Methods - - Section 5.1: Completeness - - Section 5.4: Accuracy - - Section 5.6: Consistency -- **Great Expectations**: [Official Documentation](https://docs.greatexpectations.io/) - -## Configuration - -### Customize Metrics - -Edit `data_quality_assessment.py` to modify assessment parameters: -```python -# Completeness -pump_config = { - 'required_columns': ['Timestamp', 'PumpID', 'Temperature_C'] -} - -# Accuracy -products_config = { - 'accuracy_weight_comparison_check': { - 'listed_col': 'ListedWeight_kg', - 'true_col': 'TrueWeight_kg', - 'tolerance': 0.1 # Adjust tolerance - } -} - -# Consistency -category_config = { - 'consistency_checks': { - 'Category': ['Electronics', 'Furniture', 'Clothing'] - } -} -``` - -## Testing - -The project includes automated test data generation: -```bash -# Generate fresh test data -python create_sample_data.py - -# Run assessments -python data_quality_assessment.py -``` - -## Data Quality Metrics Summary - -| Metric | ETSI Formula | GX Support | Custom Implementation | -|:-------|:------------|:-----------|:---------------------| -| Completeness | `C = 
Non-null / Total × 100` | Partial | Full | -| Accuracy | `Acc = Correct / Total × 100` | Partial | Full | -| Consistency | `CS = Consistent / Total × 100` | Partial | Full | - -**Legend**: -- Full: Complete ETSI formula implementation -- Partial: GX provides validation, custom code adds ETSI calculation - -## Contributing - -Contributions are welcome. Please submit a Pull Request with detailed description of changes. - -## Author - -Jayeon Pyo, JaeSeung Song - -## Acknowledgments - -- ETSI for defining comprehensive data quality standards -- Great Expectations team for the data validation framework - ---- - -**Note**: CSV files are excluded from version control. Generate test data using `create_sample_data.py` before running assessments. +- `etsi_dq/` — Core analysis library (6 metrics) +- `api/` — FastAPI backend (14 REST API endpoints) +- `dashboard.py` — Streamlit frontend +## Run +Backend +uvicorn api.main:app --reload --port 8000 +Frontend +streamlit run dashboard.py +## Tech Stack +- Frontend: Streamlit +- Backend: FastAPI + Uvicorn +- Analysis: Python, pandas, numpy +- API Docs: http://localhost:8000/docs diff --git a/plugins/__init__.py b/api/__init__.py similarity index 100% rename from plugins/__init__.py rename to api/__init__.py diff --git a/api/main.py b/api/main.py new file mode 100644 index 0000000000000000000000000000000000000000..172a4bd256fa3cd9831ad95157589ceb5836042a --- /dev/null +++ b/api/main.py @@ -0,0 +1,86 @@ +""" +ETSI DQ - FastAPI Backend +========================= + +프로젝트 구조: + +etsi_dq/ ← 기존 코드 (그대로 유지) +├── metrics/ +│ ├── completeness.py +│ ├── accuracy.py +│ ├── consistency.py +│ └── ... +├── pipeline.py +├── schemas.py +├── profiler.py +├── io.py +└── ... 
# Browser origins allowed to call the API: the Streamlit dashboard (default
# port 8501) and a React dev server (port 3000, planned for later).
_CORS_ORIGINS = [
    "http://localhost:8501",
    "http://localhost:3000",
]

app = FastAPI(
    title="ETSI Data Quality API",
    description="ETSI 데이터 품질 평가 시스템 백엔드 API",
    version="0.1.0",
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=_CORS_ORIGINS,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Mount each feature router under /api/<name>, tagged with the same name.
for _name, _module in (
    ("datasets", datasets),
    ("metrics", metrics),
    ("analysis", analysis),
    ("results", results),
):
    app.include_router(_module.router, prefix=f"/api/{_name}", tags=[_name])


@app.get("/")
def root():
    """Return the API name and version."""
    info = {"message": "ETSI Data Quality API", "version": "0.1.0"}
    return info


@app.get("/health")
def health_check():
    """Liveness probe; always answers with status ``ok``."""
    return {"status": "ok"}
router = APIRouter()
service = analysis_service


@router.post("/run", response_model=AnalysisRunResponse)
async def run_analysis(request: AnalysisRunRequest):
    """
    Run a quality analysis.

    Delegates to ``service.execute`` with the selected metrics and their
    settings and returns its response unchanged.

    Example request:
    {
        "dataset_id": "abc123",
        "metrics": ["completeness", "accuracy"],
        "config": {
            "completeness": {
                "required_columns": ["Na", "Al", "K", "Si"],
                "missing_values": ["?", "N/A", "null"],
                "granularity": "dataset"
            },
            "accuracy": {
                "rules": [
                    {
                        "measured_column": "RI",
                        "method": "statistical_bounds",
                        "k_sigma": 3.0,
                        "center": "mean"
                    }
                ]
            }
        }
    }

    Raises:
        HTTPException: 400 when the service rejects the request with a
            ``ValueError`` (e.g. an unknown ``dataset_id``); 500 for any
            other failure.
    """
    try:
        return await service.execute(request)
    except HTTPException:
        # Already an HTTP error: pass it through instead of re-wrapping as 500.
        raise
    except ValueError as e:
        # The service signals bad client input (unknown dataset) with
        # ValueError; that is a client error, not a server fault.
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
router = APIRouter()
service = dataset_service


@router.post("/upload", response_model=DatasetUploadResponse)
async def upload_dataset(file: UploadFile = File(...)):
    """
    Upload a dataset.

    Parsing is delegated to ``service.upload_and_parse``. The original note
    lists CSV, XLSX, XLS and ZIP as supported formats; the actual set is
    decided by the loader, which is not visible here.

    Raises:
        HTTPException: 400 when the upload cannot be read or parsed.
    """
    try:
        return await service.upload_and_parse(file)
    except HTTPException:
        # Keep an explicit HTTP error's own status code.
        raise
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e)) from e


@router.post("/reference/upload", response_model=DatasetUploadResponse)
async def upload_reference_dataset(file: UploadFile = File(...)):
    """
    Upload a reference dataset (used by Accuracy and Consistency).

    Raises:
        HTTPException: 400 when the upload cannot be read or parsed.
    """
    try:
        return await service.upload_and_parse(file, is_reference=True)
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e)) from e


@router.get("/", response_model=list[DatasetInfo])
async def list_datasets():
    """List the uploaded datasets."""
    return service.list_datasets()


@router.get("/{dataset_id}", response_model=DatasetInfo)
async def get_dataset(dataset_id: str):
    """Return the metadata of one dataset, or 404 if the id is unknown."""
    result = service.get_dataset(dataset_id)
    if result is None:
        raise HTTPException(status_code=404, detail="Dataset not found")
    return result


@router.get("/{dataset_id}/preview", response_model=DatasetPreview)
async def preview_dataset(dataset_id: str, rows: int = 10):
    """
    Preview the first ``rows`` rows of a dataset.

    Raises:
        HTTPException: 422 when ``rows`` is negative; 404 when the dataset
            id is unknown.
    """
    # DataFrame.head(-n) returns everything except the last n rows, so a
    # negative value would expose almost the whole dataset as a "preview".
    if rows < 0:
        raise HTTPException(status_code=422, detail="rows must be >= 0")
    result = service.get_preview(dataset_id, rows)
    if result is None:
        raise HTTPException(status_code=404, detail="Dataset not found")
    return result
router = APIRouter()
service = metric_service


@router.post("/precheck", response_model=PrecheckResponse)
async def run_precheck(request: PrecheckRequest):
    """
    Run the per-metric pre-check for a dataset.

    For each metric the result states whether it can be evaluated on the
    current dataset:
    - Satisfied: can be computed right away
    - Configuration Required: additional settings are needed
    - Missing Required Input: required input is missing

    The work is delegated to ``service.run_precheck``; any failure is
    reported as HTTP 400.
    """
    try:
        outcome = service.run_precheck(request.dataset_id)
    except Exception as exc:
        raise HTTPException(status_code=400, detail=str(exc))
    return outcome


@router.post("/configure")
async def configure_metrics(config: MetricConfigRequest):
    """
    Store the per-metric configuration.

    Saves the metrics and settings chosen by the user via
    ``service.save_config``; any failure is reported as HTTP 400.
    """
    try:
        saved = service.save_config(config)
    except Exception as exc:
        raise HTTPException(status_code=400, detail=str(exc))
    return saved


@router.get("/")
async def list_metrics():
    """List the supported metrics with a description and a config flag."""
    # (name, description, requires_config) — one row per supported metric.
    catalog = (
        ("completeness", "필요한 정보가 충분히 존재하는지 평가", False),
        ("accuracy", "데이터 값이 실제 값과 얼마나 일치하는지 평가", True),
        ("consistency", "데이터가 정의된 규칙이나 형식을 준수하는지 평가", True),
        ("timeliness", "데이터가 적시에 수집/처리되었는지 평가", True),
        ("reliability", "데이터의 신뢰성 및 이상치 여부 평가", False),
        ("uniqueness", "데이터 중복 여부 평가", False),
    )
    entries = [
        {"name": name, "description": description, "requires_config": needs_config}
        for name, description, needs_config in catalog
    ]
    return {"metrics": entries}


@router.get("/{metric_name}/config-schema")
async def get_metric_config_schema(metric_name: str):
    """Return the configuration schema of one metric, or 404 if unknown."""
    schema = service.get_config_schema(metric_name)
    if schema:
        return schema
    raise HTTPException(status_code=404, detail=f"Unknown metric: {metric_name}")
router = APIRouter()
service = analysis_service


@router.get("/{analysis_id}/summary", response_model=AnalysisSummaryResponse)
async def get_analysis_summary(analysis_id: str):
    """
    Return the summary of an analysis.

    Contains score, status and summary data for every evaluated metric;
    metrics reported as N/A carry the reason.

    Raises:
        HTTPException: 404 when the analysis id is unknown.
    """
    result = service.get_summary(analysis_id)
    if result is None:
        raise HTTPException(status_code=404, detail="Analysis not found")
    return result


@router.get("/{analysis_id}/details/{metric}", response_model=MetricDetailResponse)
async def get_metric_detail(analysis_id: str, metric: str):
    """
    Return the detailed result of a single metric.

    Raises:
        HTTPException: 404 when the analysis or the metric is unknown.
    """
    result = service.get_metric_detail(analysis_id, metric)
    if result is None:
        raise HTTPException(status_code=404, detail="Result not found")
    return result


@router.get("/{analysis_id}/rules/{metric}")
async def get_metric_rules(analysis_id: str, metric: str):
    """
    Return the per-rule results of a metric (Accuracy, Consistency).

    Raises:
        HTTPException: 404 when the service has no rule results for this
            analysis/metric (it returns ``None``).
    """
    result = service.get_rule_results(analysis_id, metric)
    # Only None means "not found". An empty list is a valid answer (no rule
    # produced a result) and must not be turned into a 404.
    if result is None:
        raise HTTPException(status_code=404, detail="Rule results not found")
    return result


@router.get("/{analysis_id}/samples/{metric}")
async def get_metric_samples(analysis_id: str, metric: str, limit: int = 50):
    """
    Return sample rows of a metric.

    Per the original note these are valid/invalid or matching/violating
    samples; the content is whatever ``service.get_samples`` returns.

    Raises:
        HTTPException: 422 when ``limit`` is negative; 404 when the
            analysis or the metric is unknown.
    """
    # A negative limit would slice from the end of the sample list.
    if limit < 0:
        raise HTTPException(status_code=422, detail="limit must be >= 0")
    result = service.get_samples(analysis_id, metric, limit)
    if result is None:
        raise HTTPException(status_code=404, detail="Samples not found")
    return result
+ """ + result = service.get_samples(analysis_id, metric, limit) + if not result: + raise HTTPException(status_code=404, detail="Samples not found") + return result diff --git a/api/schemas/__init__.py b/api/schemas/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/api/schemas/analysis.py b/api/schemas/analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..1521e86ecd4ef22b61120363c38b0765e3c2160d --- /dev/null +++ b/api/schemas/analysis.py @@ -0,0 +1,50 @@ +"""분석 실행 및 결과 관련 스키마""" + +from pydantic import BaseModel +from typing import Optional, Any + + +# ── 분석 요청/응답 ── + +class AnalysisRunRequest(BaseModel): + """분석 실행 요청 - MetricConfigRequest와 동일한 구조""" + dataset_id: str + metrics: list[str] # ["completeness", "accuracy", ...] + config: dict[str, Any] # 지표별 설정 (자유 형식) + + +class AnalysisRunResponse(BaseModel): + """분석 실행 응답""" + analysis_id: str + dataset_id: str + status: str = "completed" + metrics_evaluated: list[str] + message: str = "Analysis completed successfully" + + +# ── 결과 조회 ── + +class MetricSummary(BaseModel): + """개별 지표 요약 결과""" + metric: str + score: Optional[float] = None + status: str # "evaluated", "n/a", "error" + reason: Optional[str] = None # N/A인 경우 사유 + summary: dict[str, Any] = {} # 지표별 요약 데이터 + + +class AnalysisSummaryResponse(BaseModel): + """분석 결과 전체 요약""" + analysis_id: str + dataset_id: str + results: list[MetricSummary] + + +class MetricDetailResponse(BaseModel): + """개별 지표 상세 결과""" + analysis_id: str + metric: str + score: Optional[float] = None + detail: dict[str, Any] = {} # 지표별 상세 데이터 + rules: Optional[list[dict]] = None # 규칙별 결과 (accuracy, consistency) + samples: Optional[list[dict]] = None # 샘플 데이터 diff --git a/api/schemas/datasets.py b/api/schemas/datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..f72bf7429042c6095415a9df2dca48aac551bc3a --- /dev/null +++ b/api/schemas/datasets.py @@ -0,0 
class DatasetInfo(BaseModel):
    """Basic metadata of an uploaded dataset."""
    dataset_id: str
    filename: str
    rows: int
    columns: int
    column_names: list[str]
    dtypes: dict[str, str]


class DatasetPreview(BaseModel):
    """Preview of a dataset: its metadata plus the first rows."""
    dataset_id: str
    rows: int
    columns: int
    head: list[dict]                  # first N rows of data
    column_names: list[str]
    dtypes: dict[str, str]


class DatasetUploadResponse(BaseModel):
    """Response returned after an upload."""
    dataset_id: str
    filename: str
    rows: int
    columns: int
    message: str = "Dataset uploaded successfully"


# ── Pre-check ──

class PrecheckStatus(str, Enum):
    """Possible outcomes of a metric pre-check."""
    SATISFIED = "satisfied"
    CONFIGURATION_REQUIRED = "configuration_required"
    MISSING_REQUIRED_INPUT = "missing_required_input"


class MetricPrecheck(BaseModel):
    """Pre-check result of a single metric."""
    metric: str
    status: PrecheckStatus
    message: str


class PrecheckRequest(BaseModel):
    """Pre-check request."""
    dataset_id: str


class PrecheckResponse(BaseModel):
    """Pre-check response: the status of every metric."""
    dataset_id: str
    results: list[MetricPrecheck]


# ── Completeness settings ──

class CompletenessConfig(BaseModel):
    """Settings of the Completeness metric."""
    required_columns: list[str] = []         # empty list = all columns
    missing_values: list[str] = ["?", "N/A", "NA", "null", "None", "-200", "-999", "9999"]
    granularity: Literal["dataset", "row", "column"] = "dataset"
# ── Accuracy settings ──

class AccuracyRule(BaseModel):
    """A single Accuracy rule.

    The optional fields are grouped by the ``method`` they belong to, as
    marked by the section comments below.
    """
    measured_column: str
    method: Literal["reference_mapping", "threshold", "statistical_bounds"]
    # Reference Mapping
    reference_source: Optional[Literal["column_in_dataset", "external_file"]] = None
    reference_column: Optional[str] = None
    join_key: Optional[str] = None
    compare_mode: Optional[Literal["direct", "tolerance"]] = "direct"
    tolerance_type: Optional[Literal["absolute", "relative"]] = "absolute"
    tolerance: Optional[float] = 0.5
    # Threshold
    min_value: Optional[float] = None
    max_value: Optional[float] = None
    inclusive: bool = True
    # Statistical Bounds
    k_sigma: Optional[float] = 3.0
    center: Optional[Literal["mean", "median"]] = "mean"


class AccuracyConfig(BaseModel):
    """Settings of the Accuracy metric."""
    rules: list[AccuracyRule]
    reference_dataset_id: Optional[str] = None    # external reference dataset


# ── Consistency settings ──

class ConsistencyRule(BaseModel):
    """A single Consistency rule.

    The optional fields are grouped by the ``mode`` they belong to, as
    marked by the section comments below.
    """
    mode: Literal["expression", "rule_builder", "format_regex", "referential_integrity"]
    target_column: Optional[str] = None
    # Expression
    expression: Optional[str] = None
    # Rule Builder
    left_column: Optional[str] = None
    operator: Optional[str] = None
    comparison_target: Optional[str] = None
    # Format/Regex
    preset: Optional[str] = None             # e.g. email, date, numeric_only
    custom_regex: Optional[str] = None
    allow_empty: bool = True
    # Referential Integrity
    source_key: Optional[str] = None
    reference_key: Optional[str] = None


class ConsistencyConfig(BaseModel):
    """Settings of the Consistency metric."""
    rules: list[ConsistencyRule]
    reference_dataset_id: Optional[str] = None


# ── Timeliness settings ──

class TimelinessConfig(BaseModel):
    """Settings of the Timeliness metric."""
    event_time_column: str
    system_time_column: Optional[str] = None
    sla_threshold: Optional[float] = None    # in seconds


# ── Combined settings ──

class MetricConfigRequest(BaseModel):
    """All metric settings to use for an analysis of one dataset."""
    dataset_id: str
    completeness: Optional[CompletenessConfig] = None
    accuracy: Optional[AccuracyConfig] = None
    consistency: Optional[ConsistencyConfig] = None
    timeliness: Optional[TimelinessConfig] = None
# Metric names as the API spells them (lowercase), in declaration order.
_API_METRIC_NAMES = (
    "completeness",
    "accuracy",
    "consistency",
    "timeliness",
    "reliability",
    "uniqueness",
)

# API lowercase name <-> capitalized name used by the existing etsi_dq code.
NAME_TO_LEGACY = {api_name: api_name.capitalize() for api_name in _API_METRIC_NAMES}
LEGACY_TO_NAME = {legacy: api_name for api_name, legacy in NAME_TO_LEGACY.items()}

# Key prefix that marks each metric's entries in the flattened detail dict.
METRIC_PREFIX = dict(
    completeness="comp_",
    accuracy="acc_",
    consistency="cons_",
    timeliness="time_",
    reliability="rel_",
    uniqueness="uniq_",
)
"rel_", + "uniqueness": "uniq_", +} + + +class AnalysisService: + """분석 실행 및 결과 관리 서비스""" + + def __init__(self, dataset_service): + self._results: dict[str, dict] = {} + self._dataset_service = dataset_service + + async def execute(self, request: AnalysisRunRequest) -> AnalysisRunResponse: + analysis_id = str(uuid.uuid4())[:8] + + # 1. DataFrame 가져오기 + df = self._dataset_service.get_dataframe(request.dataset_id) + if df is None: + raise ValueError("Dataset not found") + + # 2. 지표 이름 변환 (소문자 → 대문자) + selected = [NAME_TO_LEGACY[m] for m in request.metrics if m in NAME_TO_LEGACY] + + # 3. 기존 AnalysisRequest 형식으로 변환 + analysis_request = AnalysisRequest( + df=df, + selected_metrics=selected, + config=request.config, + ) + + # 4. 기존 함수 실행 + response = run_analysis(analysis_request) + + # 5. 결과를 API 형식으로 변환하여 저장 + converted = self._convert_response(response, request.metrics) + self._results[analysis_id] = { + "dataset_id": request.dataset_id, + "results": converted, + } + + return AnalysisRunResponse( + analysis_id=analysis_id, + dataset_id=request.dataset_id, + status="completed", + metrics_evaluated=request.metrics, + ) + + def _convert_response(self, response: AnalysisResponse, metrics: list[str]) -> dict: + """ + 기존 AnalysisResponse → API용 per-metric dict 변환 + + 기존 구조: + response.results = {"Completeness": 100.0} (점수만) + response.config = {comp_status: "ok", comp_dataset_score: 100.0, ...} (details 합침) + + 변환 후: + {"completeness": {"score": 100.0, "status": "evaluated", "detail": {...}}} + """ + converted = {} + + for metric in metrics: + legacy_name = NAME_TO_LEGACY.get(metric) + if not legacy_name: + continue + + score = response.results.get(legacy_name) + prefix = METRIC_PREFIX.get(metric, "") + + # response.config에서 이 지표에 해당하는 키만 추출 + detail = {} + for key, value in response.config.items(): + if key.startswith(prefix): + detail[key] = value + + # status 판단 + status_key = f"{prefix}status" + raw_status = detail.get(status_key, "ok") + if raw_status == "n/a" or 
# Detail key that holds the per-rule results of each rule-based metric.
_RULE_RESULT_KEYS = {
    "accuracy": "acc_rule_results",
    "consistency": "cons_rule_results",
}


def get_summary(self, analysis_id: str) -> "Optional[AnalysisSummaryResponse]":
    """Return the summary of a stored analysis, or ``None`` if unknown."""
    stored = self._results.get(analysis_id)
    if not stored:
        return None

    summaries = [
        MetricSummary(
            metric=metric,
            score=data.get("score"),
            status=data.get("status", "evaluated"),
            reason=data.get("reason"),
            summary=data.get("detail", {}),
        )
        for metric, data in stored["results"].items()
    ]

    return AnalysisSummaryResponse(
        analysis_id=analysis_id,
        dataset_id=stored["dataset_id"],
        results=summaries,
    )


def get_metric_detail(
    self, analysis_id: str, metric: str
) -> "Optional[MetricDetailResponse]":
    """Return the detailed result of one metric.

    Returns ``None`` when the analysis or the metric is not stored.
    ``rules`` is filled only for accuracy and consistency.
    """
    stored = self._results.get(analysis_id)
    if not stored or metric not in stored["results"]:
        return None

    data = stored["results"][metric]
    detail = data.get("detail", {})

    rule_key = _RULE_RESULT_KEYS.get(metric)
    rules = detail.get(rule_key) if rule_key else None

    return MetricDetailResponse(
        analysis_id=analysis_id,
        metric=metric,
        score=data.get("score"),
        detail=detail,
        rules=rules,
    )


def get_rule_results(self, analysis_id: str, metric: str) -> Optional[list[dict]]:
    """Return the per-rule results of a metric.

    Returns ``None`` when the analysis or metric is not stored, when the
    metric is not rule-based, or when no rule results were recorded.
    """
    stored = self._results.get(analysis_id)
    if not stored or metric not in stored["results"]:
        return None
    rule_key = _RULE_RESULT_KEYS.get(metric)
    if rule_key is None:
        return None
    detail = stored["results"][metric].get("detail", {})
    return detail.get(rule_key)


def get_samples(
    self, analysis_id: str, metric: str, limit: int = 50
) -> Optional[dict]:
    """Return up to ``limit`` sample entries of a metric.

    Consistency samples come from ``cons_violation_sample``; for accuracy
    the rule results are returned as samples; other metrics have none.

    Returns:
        ``{"metric": ..., "samples": [...]}``, or ``None`` when the
        analysis or the metric is not stored.
    """
    stored = self._results.get(analysis_id)
    if not stored or metric not in stored["results"]:
        return None
    detail = stored["results"][metric].get("detail", {})

    if metric == "consistency":
        samples = detail.get("cons_violation_sample")
    elif metric == "accuracy":
        samples = detail.get("acc_rule_results")
    else:
        samples = None
    # The key may be absent or explicitly None; both mean "no samples".
    if samples is None:
        samples = []

    # Clamp so that a negative limit cannot slice from the end.
    return {"metric": metric, "samples": samples[:max(limit, 0)]}
"""샘플 데이터 조회""" + stored = self._results.get(analysis_id) + if not stored or metric not in stored["results"]: + return None + detail = stored["results"][metric].get("detail", {}) + + samples = [] + if metric == "consistency": + samples = detail.get("cons_violation_sample", []) + elif metric == "accuracy": + samples = detail.get("acc_rule_results", []) + + return {"metric": metric, "samples": samples[:limit]} \ No newline at end of file diff --git a/api/services/dataset_service.py b/api/services/dataset_service.py new file mode 100644 index 0000000000000000000000000000000000000000..84be84dc0b13aebac6515fa8d1b04015dd5ce783 --- /dev/null +++ b/api/services/dataset_service.py @@ -0,0 +1,116 @@ +""" +데이터셋 서비스 - 기존 etsi_dq/io.py 함수를 감싸는 계층 + +기존 load_dataset() 함수를 여기서 호출하고, +결과를 API 응답 형식으로 변환 + +""" + +import uuid +import io +import pandas as pd +from typing import Optional +from fastapi import UploadFile + +from api.schemas.datasets import DatasetUploadResponse, DatasetInfo, DatasetPreview + +# TODO: 기존 코드 연결 +from etsi_dq.io import load_dataset + + +class DatasetService: + """데이터셋 업로드, 파싱, 관리 서비스""" + + def __init__(self): + # 메모리 저장소 (나중에 DB나 파일 저장소로 교체 가능) + self._datasets: dict[str, dict] = {} + + async def upload_and_parse( + self, + file: UploadFile, + is_reference: bool = False + ) -> DatasetUploadResponse: + """ + 파일 업로드 후 파싱 + + 기존 코드 흐름: + Streamlit file_uploader → load_any_dataset(file) + 새 코드 흐름: + FastAPI UploadFile → load_any_dataset(file) → 응답 반환 + """ + dataset_id = str(uuid.uuid4())[:8] + + # 파일 내용 읽기 + contents = await file.read() + filename = file.filename + + # TODO: 기존 함수 연결 + df = load_dataset(io.BytesIO(contents), filename) + if df is None: + raise ValueError("파일을 읽을 수 없습니다. 
def _column_schema(df):
    """Return ``(column_names, dtypes)`` of *df* with string labels."""
    # Column labels are not always strings (e.g. integer labels), but the
    # response models declare list[str] / dict[str, str].
    names = [str(col) for col in df.columns]
    dtypes = {str(col): str(dtype) for col, dtype in df.dtypes.items()}
    return names, dtypes


def list_datasets(self) -> "list[DatasetInfo]":
    """Return the metadata of all stored datasets except reference ones."""
    results = []
    for ds_id, ds in self._datasets.items():
        if ds.get("is_reference"):
            continue
        df = ds["df"]
        names, dtypes = _column_schema(df)
        results.append(DatasetInfo(
            dataset_id=ds_id,
            filename=ds["filename"],
            rows=len(df),
            columns=len(df.columns),
            column_names=names,
            dtypes=dtypes,
        ))
    return results


def get_dataset(self, dataset_id: str) -> "Optional[DatasetInfo]":
    """Return the metadata of one dataset, or ``None`` if unknown."""
    ds = self._datasets.get(dataset_id)
    if not ds:
        return None
    df = ds["df"]
    names, dtypes = _column_schema(df)
    return DatasetInfo(
        dataset_id=dataset_id,
        filename=ds["filename"],
        rows=len(df),
        columns=len(df.columns),
        column_names=names,
        dtypes=dtypes,
    )


def get_preview(self, dataset_id: str, rows: int = 10) -> "Optional[DatasetPreview]":
    """Return the first ``rows`` rows of a dataset with its metadata.

    Missing cells are returned as ``None``. A negative ``rows`` is treated
    as 0. Returns ``None`` when the dataset id is unknown.
    """
    ds = self._datasets.get(dataset_id)
    if not ds:
        return None
    df = ds["df"]
    # head(-n) would return all rows except the last n; clamp instead.
    head_df = df.head(max(rows, 0))
    # NaN/NaT are not valid JSON values; cast to object so they can be
    # replaced by None before the rows are turned into dicts.
    records = (
        head_df.astype(object)
        .where(head_df.notna(), None)
        .to_dict(orient="records")
    )
    names, dtypes = _column_schema(df)
    return DatasetPreview(
        dataset_id=dataset_id,
        rows=len(df),
        columns=len(df.columns),
        head=records,
        column_names=names,
        dtypes=dtypes,
    )


def get_dataframe(self, dataset_id: str) -> Optional[pd.DataFrame]:
    """Return the stored DataFrame itself (for use by other services)."""
    ds = self._datasets.get(dataset_id)
    return ds["df"] if ds else None
class MetricService:
    """Run metric pre-checks and keep per-dataset metric configuration in memory."""

    # Config attributes checked by save_config, in the order they are reported.
    _METRIC_FIELDS = ("completeness", "accuracy", "consistency", "timeliness")

    def __init__(self, dataset_service):
        # Saved configurations keyed by dataset id.
        self._configs: dict[str, MetricConfigRequest] = {}
        self._dataset_service = dataset_service

    def run_precheck(self, dataset_id: str) -> PrecheckResponse:
        """Run ``build_metric_precheck`` on a stored dataset.

        Args:
            dataset_id: Id of a dataset known to the dataset service.

        Returns:
            One ``MetricPrecheck`` per entry of the pre-check summary.

        Raises:
            ValueError: If the dataset service has no DataFrame for the id.
            KeyError: If a summary entry carries a status with no API mapping.
        """
        df = self._dataset_service.get_dataframe(dataset_id)
        if df is None:
            raise ValueError("Dataset not found")

        summary = build_metric_precheck(df)

        # Legacy status strings -> API PrecheckStatus values.
        status_map = {
            "satisfied": PrecheckStatus.SATISFIED,
            "config_required": PrecheckStatus.CONFIGURATION_REQUIRED,
            "missing_required_input": PrecheckStatus.MISSING_REQUIRED_INPUT,
        }

        results = []
        for metric_name, info in summary.items():
            raw_status = info["status"]
            if raw_status not in status_map:
                # Same exception type as a plain lookup would raise, but the
                # message says which metric and status caused it.
                raise KeyError(
                    f"Unknown precheck status {raw_status!r} for metric {metric_name!r}"
                )
            results.append(MetricPrecheck(
                metric=metric_name.lower(),      # "Completeness" -> "completeness"
                status=status_map[raw_status],
                message=info["note"],            # legacy "note" becomes "message"
            ))

        return PrecheckResponse(
            dataset_id=dataset_id,
            results=results,
        )

    def save_config(self, config: MetricConfigRequest) -> dict:
        """Store the configuration and report which metrics it configures.

        A metric counts as configured when its attribute on ``config`` is
        truthy.
        """
        self._configs[config.dataset_id] = config
        configured = [
            name for name in self._METRIC_FIELDS if getattr(config, name)
        ]
        return {
            "dataset_id": config.dataset_id,
            "configured_metrics": configured,
            "message": "Configuration saved",
        }

    def get_config(self, dataset_id: str) -> Optional[MetricConfigRequest]:
        """Return the saved configuration for a dataset, or ``None``."""
        return self._configs.get(dataset_id)

    def get_config_schema(self, metric_name: str) -> Optional[dict]:
        """Return the JSON schema of one metric's config model.

        Returns ``None`` for an unknown metric name. Only the requested
        model's schema is generated.
        """
        config_models = {
            "completeness": CompletenessConfig,
            "accuracy": AccuracyConfig,
            "consistency": ConsistencyConfig,
            "timeliness": TimelinessConfig,
        }
        model = config_models.get(metric_name)
        if model is None:
            return None
        return model.model_json_schema()
# create_sample_data.py
"""Generate sample CSV datasets based on examples from ETSI TR 104 180.

Run as a script to write pump_data.csv, products.csv,
product_categories.csv and customers.csv into the current directory.
The ``build_*`` functions return the DataFrames without writing anything.
"""

import pandas as pd


def build_pump_data() -> pd.DataFrame:
    """Return pump operation data with three missing values (Completeness, ETSI 5.1.3)."""
    pump_data = [
        [1, "2024-06-25 10:00:00", "P001", 45.2, 1.5, 500],
        [2, "2024-06-25 10:05:00", "P001", 45.8, 1.7, 505],
        [3, "2024-06-25 10:10:00", "P001", 46.1, None, 510],  # Missing Vibration
        [4, "2024-06-25 10:15:00", "P001", None, 1.6, 508],   # Missing Temperature
        [5, "2024-06-25 10:20:00", "P001", 47.0, 1.8, None],  # Missing Pressure
    ]
    return pd.DataFrame(
        pump_data,
        columns=['Index', 'Timestamp', 'PumpID',
                 'Temperature_C', 'Vibration_mm_s', 'Pressure_kPa'],
    )


def build_product_data() -> pd.DataFrame:
    """Return listed-vs-true product weights (Accuracy, ETSI 5.4.3)."""
    product_data = [
        ["P-1001", "Laptop ABC", 1.5, 1.5, True],
        ["P-1002", "Mouse XYZ", 2.3, 2.1, False],      # Weight difference
        ["P-1003", "Keyboard Pro", 0.5, 0.5, True],
        ["P-1004", "Monitor Ultra", 8.7, 8.7, True],
        ["P-1005", "Tablet Max", 1.2, 1.0, False],     # Weight difference
        ["P-1006", "Speaker Set", 15.0, 15.5, False],  # Weight difference
        ["P-1007", "Webcam HD", 3.4, 3.4, True],
        ["P-1008", "Headset Pro", 0.8, 0.8, True],
        ["P-1009", "Router Fast", 5.5, 5.0, False],    # Weight difference
        ["P-1010", "Charger Quick", 2.0, 2.0, True],
    ]
    return pd.DataFrame(
        product_data,
        columns=['ProductID', 'ProductName', 'ListedWeight_kg',
                 'TrueWeight_kg', 'IsAccurate'],
    )


def build_category_data() -> pd.DataFrame:
    """Return product category data with inconsistencies (Consistency, ETSI 5.6.3)."""
    category_data = [
        ["P-001", "Laptop ABC", "Electronics", 999.00, 899.00, 50],
        ["P-002", "Mouse XYZ", "Electronics", 25.00, 30.00, 100],       # DiscountPrice > Price
        ["P-003", "Keyboard Pro", "Electronics", 75.00, 65.00, 75],
        ["P-004", "Monitor Ultra", "ELECTRONICS", 300.00, 280.00, 25],  # Uppercase error
        ["P-005", "Desk Chair", "Furniture", 250.00, 225.00, 30],
        ["P-006", "Office Desk", "furniture", 450.00, 400.00, 15],      # Lowercase error
        ["P-007", "T-Shirt", "Clothing", 29.99, 24.99, 200],
        ["P-008", "Jeans", "Apparel", 79.99, 69.99, 150],               # Incorrect category
        ["P-009", "Sneakers", "Clothing", 89.99, 79.99, 100],
        ["P-010", "Backpack", "Accessories", 59.99, 49.99, 80],         # Incorrect category
    ]
    return pd.DataFrame(
        category_data,
        columns=['ProductID', 'ProductName', 'Category',
                 'Price', 'DiscountPrice', 'StockQuantity'],
    )


def build_customer_data() -> pd.DataFrame:
    """Return customer data that mixes several kinds of quality issue."""
    customer_data = [
        [1, "John Doe", 35, "john@example.com", "2023-05-15", "Active"],
        [2, "Jane Smith", None, "jane@example.com", "2023-06-20", "Active"],   # Missing Age
        [3, "Bob Johnson", 28, "bob@invalid", "2023-07-10", "Active"],         # Email format error
        # This row used to have only five values, which pushed "Inactive" into
        # PurchaseDate and left Status empty; the missing date is now explicit.
        [4, "Alice Lee", 45, "alice@example.com", None, "Inactive"],           # Missing PurchaseDate
        [5, "John Doe", 35, "john@example.com", "2023-05-15", "Active"],       # Duplicate row
        [6, "Eve Adams", 150, "eve@example.com", "2023-09-12", "Active"],      # Age out of range
        [7, "Mike Brown", 22, "mike@example.com", "2023-10-01", "Pending"],
        [8, "Sara White", None, None, "2023-11-18", "Active"],                 # Multiple missing values
        [9, "Tom Green", 31, "tom@example.com", "2023-12-22", "active"],       # Status inconsistency
        [10, "Lily Black", 40, "lily@invalidcom", "invalid-date", "Unknown"],  # Multiple errors
    ]
    return pd.DataFrame(
        customer_data,
        columns=['CustomerID', 'Name', 'Age', 'Email',
                 'PurchaseDate', 'Status'],
    )


def main() -> None:
    """Write the four sample CSV files and print progress messages."""
    print("Generating sample data for ETSI Data Quality Metrics...\n")

    # 1. Completeness Example (ETSI 5.1.3)
    print("1. Completeness - Generating pump performance data...")
    build_pump_data().to_csv('pump_data.csv', index=False)
    print(" ✓ Created pump_data.csv (5 rows, 3 missing values)")

    # 2. Accuracy Example (ETSI 5.4.3)
    print("\n2. Accuracy - Generating product weight data...")
    build_product_data().to_csv('products.csv', index=False)
    print(" ✓ Created products.csv (10 rows, 4 inaccurate values)")

    # 3. Consistency Example (ETSI 5.6.3)
    print("\n3. Consistency - Generating product category data...")
    build_category_data().to_csv('product_categories.csv', index=False)
    print(" ✓ Created product_categories.csv (10 rows, 4 inconsistent values)")

    # 4. Comprehensive Test Data (all issues)
    print("\n4. Comprehensive - Generating customer data (with all issues)...")
    build_customer_data().to_csv('customers.csv', index=False)
    print(" ✓ Created customers.csv (10 rows, various quality issues)")

    print("\n" + "="*60)
    print("All datasets created successfully!")
    print("="*60)
    print("\nGenerated files:")
    print(" 1. pump_data.csv - For Completeness test")
    print(" 2. products.csv - For Accuracy test")
    print(" 3. product_categories.csv - For Consistency test")
    print(" 4. customers.csv - For Comprehensive test")
    print("\nRun the test with the following command:")
    print(" python data_quality_assessment.py")


if __name__ == "__main__":
    main()
--------------------------------------------------------- +st.markdown(""" + +""", unsafe_allow_html=True) + +# --------------------------------------------------------- +# [Standard-aligned N/A guidance messages] +# --------------------------------------------------------- +ACC_NA_TITLE = "Accuracy: Not Available (N/A)" +ACC_NA_BODY = ( + "Accuracy requires BOTH a Measured (Target) column and a Ground Truth (Reference) column. " + "If the dataset does not contain a reference/ground-truth field, Accuracy must be reported as N/A." +) +ACC_NA_CTA = "To enable Accuracy, select a valid Ground Truth (Reference) column in Accuracy Settings." + +TIME_NA_TITLE = "Timeliness: Not Available (N/A)" +TIME_NA_BODY = ( + "Timeliness requires BOTH an Event Time column (when the event occurred) and a System/Ingestion Time column " + "(when it was ingested/processed). If either column is missing, Timeliness must be reported as N/A." +) +TIME_NA_CTA = "To enable Timeliness, select both Event Time and System/Ingestion Time columns in Timeliness Settings." + +INGESTED_AT_WARN_TITLE = "Timeliness note: using '__ingested_at' as System Time" +INGESTED_AT_WARN_BODY = ( + "'__ingested_at' is added by this dashboard at upload time. Using it measures the delay between the event time and " + "when the file was uploaded to the dashboard (not the original system ingestion latency)." 
+) + +# --------------------------------------------------------- +# [State] +# --------------------------------------------------------- +if 'dataset' not in st.session_state: st.session_state['dataset'] = None +if 'results' not in st.session_state: st.session_state['results'] = {} +if 'config' not in st.session_state: st.session_state['config'] = {} +if 'filename' not in st.session_state: st.session_state['filename'] = "No File" +if 'datasets' not in st.session_state: st.session_state['datasets'] = {} +if 'selected_filename' not in st.session_state: st.session_state['selected_filename'] = None +if 'report_history' not in st.session_state: + st.session_state['report_history'] = pd.DataFrame( + columns=["Report ID", "Report Name", "Type", "Date", "Status"] + ) +if 'acc_ref_file' not in st.session_state: + st.session_state['acc_ref_file'] = None +if 'acc_ref_df' not in st.session_state: + st.session_state['acc_ref_df'] = None +if 'selected_metrics' not in st.session_state: + st.session_state['selected_metrics'] = [] + +if 'api_dataset_id' not in st.session_state: + st.session_state['api_dataset_id'] = None +if 'api_analysis_id' not in st.session_state: + st.session_state['api_analysis_id'] = None + +# --------------------------------------------------------- +# [Main Layout] +# --------------------------------------------------------- +st.title("Data Quality Assessment Dashboard") +st.caption("ETSI TR 104 180 Standard Compliance Check") + +tab1, tab2, tab3 = st.tabs(["📂 Data Upload", "📊 Analysis", "📑 Reports"]) + +# ========================================================= +# TAB 1: Data Upload & Dynamic Configuration (Opt-in Ver.) +# ========================================================= +with tab1: + st.markdown("### 📤 Data Upload & Configuration") + + # 1. 
파일 업로드 섹션 (단일 파일 업로드) + uploaded_file = st.file_uploader( + "Upload your dataset (CSV, XLSX, ZIP)", + type=['csv', 'xlsx', 'xls', 'zip'], + accept_multiple_files=False + ) + + if uploaded_file: + # Load a single uploaded file using the existing core loader + try: + uploaded_file.seek(0) + file_bytes = uploaded_file.read() + df = load_dataset(io.BytesIO(file_bytes), uploaded_file.name) + if df is None: + raise ValueError("Unsupported or unreadable file format.") + except Exception as e: + st.error(f"Error loading file: {e}") + st.stop() + + st.session_state['datasets'] = {uploaded_file.name: df} + st.session_state['selected_filename'] = uploaded_file.name + st.session_state['filename'] = uploaded_file.name + st.session_state['dataset'] = df + + # 데이터 미리보기 + with st.expander("👀 Data Preview (First 5 rows)", expanded=False): + st.dataframe(df.head()) + st.caption(f"Total Rows: {len(df):,} | Total Columns: {len(df.columns)}") + + # ----------------------------------------------------- + # Pre-check: what will likely be N/A vs needs config? + # ----------------------------------------------------- + render_precheck_section(df) + st.markdown("
", unsafe_allow_html=True) + + # 2. 동적 설정(Configuration) 섹션 + st.subheader("Analysis Configuration") + st.caption("Select the quality metrics you want to evaluate. Settings will appear upon selection.") + + with st.container(border=True): + selected_metrics = render_metric_selector() + st.session_state['selected_metrics'] = selected_metrics + st.divider() + + config = {} + cols = df.columns.tolist() + config.update(render_basic_metric_settings(df, selected_metrics)) + + config.update(render_accuracy_settings(df, selected_metrics, config)) + + config.update(render_consistency_settings(df, selected_metrics, config)) + + config.update(render_timeliness_settings(df, selected_metrics)) + + st.markdown("
", unsafe_allow_html=True) + run_btn = st.button("Run Quality Analysis", type="primary", use_container_width=True, disabled=not selected_metrics) + if run_btn: + with st.spinner("Processing data based on your configuration..."): + + # External reference datasets for backend pipeline + config['acc_ref_df'] = st.session_state.get('acc_ref_df') + config['cons_ref_df'] = st.session_state.get('cons_ref_df') + + api_dataset_id = None + api_analysis_id = None + api_error = None + + # ------------------------------------------------- + # Step 1: Exercise backend API (upload -> analysis) + # ------------------------------------------------- + try: + upload_resp = requests.post( + f"{API_BASE_URL}/api/datasets/upload", + files={ + "file": ( + uploaded_file.name, + file_bytes, + getattr(uploaded_file, "type", None) or "application/octet-stream", + ) + }, + timeout=60, + ) + upload_resp.raise_for_status() + upload_json = upload_resp.json() + api_dataset_id = upload_json.get("dataset_id") + st.session_state['api_dataset_id'] = api_dataset_id + + api_payload = { + "dataset_id": api_dataset_id, + "metrics": [m.lower() for m in selected_metrics], + "config": config, + } + analysis_resp = requests.post( + f"{API_BASE_URL}/api/analysis/run", + json=api_payload, + timeout=300, + ) + analysis_resp.raise_for_status() + analysis_json = analysis_resp.json() + api_analysis_id = analysis_json.get("analysis_id") + st.session_state['api_analysis_id'] = api_analysis_id + except requests.RequestException as e: + api_error = str(e) + + # ------------------------------------------------- + # Step 2: Temporary compatibility bridge + # Keep local execution so the current Analysis tab + # can continue using st.session_state['results']. 
+ # ------------------------------------------------- + request = AnalysisRequest( + df=df, + selected_metrics=selected_metrics, + config=config, + ) + response = run_analysis(request) + + results = response.results or {} + config.update(response.config or {}) + + for msg in (response.messages or []): + msg_text = str(msg) + if 'execution error' in msg_text.lower() or 'failed' in msg_text.lower(): + st.warning(msg_text) + elif 'n/a' in msg_text.lower(): + st.info(msg_text) + else: + st.info(msg_text) + + st.session_state['results'] = results + st.session_state['config'] = config + + if api_error: + st.warning(f"Backend API call failed. Current Analysis tab is showing locally computed results instead. API error: {api_error}") + else: + st.success("Analysis Complete! Backend API execution succeeded.") + if api_dataset_id: + st.caption(f"API dataset_id: `{api_dataset_id}`") + if api_analysis_id: + st.caption(f"API analysis_id: `{api_analysis_id}`") + + st.info("👈 Go to 'Analysis' tab to view results.") + + else: + st.info("👆 Please upload a dataset or load one from Kaggle to configure the analysis parameters.") + with st.container(border=True): + st.markdown(""" + **Supported Data Formats:** + * 📄 **CSV** (Comma Separated Values) + * 📊 **Excel** (.xlsx, .xls) + * 🗜️ **ZIP** (containing CSV/XLSX/XLS) + """) + +# ========================================================= +# TAB 2: Analysis Results (Full Deep-dive & Error Fixed) +# ========================================================= +with tab2: + # 1. 
화면 전환 상태 관리 (summary / detail_comp / detail_rel / detail_uniq / detail_acc / detail_cons / detail_time) + if 'view_mode' not in st.session_state: + st.session_state['view_mode'] = 'summary' + + res = dict(st.session_state.get('results', {}) or {}) + df = st.session_state['dataset'] + cfg = st.session_state['config'] + selected_metrics = st.session_state.get('selected_metrics', []) + + api_analysis_id = st.session_state.get('api_analysis_id') + api_summary_error = None + + # --------------------------------------------------------- + # Summary-first API bridge + # If backend analysis exists, pull summary scores/status from API. + # Keep local cfg/detail data for now so the current summary cards + # can continue rendering without rewriting all detail logic yet. + # --------------------------------------------------------- + if api_analysis_id: + try: + summary_resp = requests.get( + f"{API_BASE_URL}/api/results/{api_analysis_id}/summary", + timeout=60, + ) + summary_resp.raise_for_status() + summary_json = summary_resp.json() + + api_to_legacy = { + 'completeness': 'Completeness', + 'accuracy': 'Accuracy', + 'consistency': 'Consistency', + 'timeliness': 'Timeliness', + 'reliability': 'Reliability', + 'uniqueness': 'Uniqueness', + } + na_reason_map = { + 'Completeness': 'comp_na_reason', + 'Accuracy': 'acc_na_reason', + 'Consistency': 'cons_na_reason', + 'Timeliness': 'time_na_reason', + 'Reliability': 'rel_na_reason', + 'Uniqueness': 'uniq_na_reason', + } + + api_results = {} + api_selected_metrics = [] + + for item in (summary_json.get('results') or []): + metric_name = str(item.get('metric', '')).lower() + legacy_name = api_to_legacy.get(metric_name) + if not legacy_name: + continue + + api_selected_metrics.append(legacy_name) + status = item.get('status') + score = item.get('score') + reason = item.get('reason') + + api_results[legacy_name] = None if status == 'n/a' else score + + if reason: + cfg[na_reason_map[legacy_name]] = reason + + if api_results: + 
res.update(api_results) + if api_selected_metrics: + selected_metrics = api_selected_metrics + + except requests.RequestException as e: + api_summary_error = str(e) + + if not res: + st.info("👈 Please run analysis in 'Data Upload' tab first.") + if api_summary_error: + st.caption(f"Summary API fetch failed: {api_summary_error}") + else: + if api_summary_error and api_analysis_id: + st.caption(f"Summary API fetch failed. Falling back to locally stored results. ({api_summary_error})") + + current_view_mode = st.session_state.get('view_mode', 'summary') + api_detail_error = None + + # --------------------------------------------------------- + # Detail-stage API bridge + # If backend analysis exists and the user is on a detail view, + # fetch the metric detail payload from the API and merge it into + # the existing local cfg/res objects. This lets the current detail + # UIs keep working while the backend becomes the source of truth. + # --------------------------------------------------------- + if api_analysis_id and current_view_mode != 'summary': + view_to_metric = { + 'detail_comp': 'completeness', + 'detail_acc': 'accuracy', + 'detail_cons': 'consistency', + 'detail_time': 'timeliness', + 'detail_rel': 'reliability', + 'detail_uniq': 'uniqueness', + } + api_to_legacy = { + 'completeness': 'Completeness', + 'accuracy': 'Accuracy', + 'consistency': 'Consistency', + 'timeliness': 'Timeliness', + 'reliability': 'Reliability', + 'uniqueness': 'Uniqueness', + } + + metric_for_view = view_to_metric.get(current_view_mode) + if metric_for_view: + try: + detail_resp = requests.get( + f"{API_BASE_URL}/api/results/{api_analysis_id}/details/{metric_for_view}", + timeout=60, + ) + detail_resp.raise_for_status() + detail_json = detail_resp.json() + + legacy_name = api_to_legacy.get(metric_for_view) + if legacy_name: + res[legacy_name] = detail_json.get('score') + + detail_payload = detail_json.get('detail') or {} + if isinstance(detail_payload, dict): + 
cfg.update(detail_payload) + + rules_payload = detail_json.get('rules') + samples_payload = detail_json.get('samples') + + if metric_for_view == 'accuracy' and rules_payload is not None: + cfg['acc_rule_results'] = rules_payload + if metric_for_view == 'consistency' and rules_payload is not None: + cfg['cons_rule_results'] = rules_payload + if metric_for_view == 'accuracy' and samples_payload is not None: + cfg['acc_violation_sample'] = samples_payload + if metric_for_view == 'consistency' and samples_payload is not None: + cfg['cons_violation_sample'] = samples_payload + + except requests.RequestException as e: + api_detail_error = str(e) + + if api_detail_error and current_view_mode != 'summary' and api_analysis_id: + st.caption(f"Detail API fetch failed. Falling back to locally stored detail data. ({api_detail_error})") + # --------------------------------------------------------- + # [CASE 0] Analysis Summary View (including Completeness summary card) + # --------------------------------------------------------- + if st.session_state['view_mode'] == 'summary': + # Main summary cards/expanders for all metrics + # ... 
(other summary cards) + + # --- Completeness Summary Card / Expander (Analysis tab summary view) --- + if 'Completeness' in res and res.get('Completeness') is not None: + sum_left, sum_right = st.columns([1.2, 1], gap="medium") + with sum_left: + with st.expander("Completeness", expanded=True): + comp_summary_table = cfg.get('comp_summary_table') or [] + used_cols = cfg.get('comp_used_cols') or [] + + comp_df = pd.DataFrame(comp_summary_table) + + # Optionally limit number of columns shown (e.g., lowest 10) + max_show = 10 + shown_df = comp_df.head(max_show).copy() if not comp_df.empty else pd.DataFrame(columns=['Column', 'Score', 'Score Label']) + + if shown_df.empty: + st.info("No completeness-by-column summary is available.") + else: + fig = px.bar( + shown_df, + x='Score', + y='Column', + orientation='h', + text=shown_df['Score Label'] if 'Score Label' in shown_df.columns else shown_df['Score'].apply(lambda x: f'{x:.1f}%'), + color='Score', + color_continuous_scale=[[0, '#ef4444'], [0.8, '#f59e0b'], [1, '#10b981']] + ) + fig.update_layout( + height=350, + showlegend=False, + coloraxis_showscale=False, + margin=dict(l=0, r=0, t=0, b=0), + xaxis_range=[0, 110] + ) + fig.update_traces(width=0.45) + st.plotly_chart(fig, use_container_width=True) + + scope_n = int(cfg.get('comp_scope_count', len(used_cols)) or 0) + shown_n = len(shown_df) + st.caption(f"Showing {shown_n} selected column(s) included in the completeness evaluation (out of {scope_n} selected column(s)).") + + st.caption("Completeness is calculated using only the columns and missing-value rules selected during configuration.") + + # View Detailed Completeness button + if st.button("View Detailed Completeness", use_container_width=True, key="btn_detail_comp_summary"): + st.session_state['view_mode'] = 'detail_comp' + st.rerun() + with sum_right: + with st.container(border=True): + st.subheader("Data Profile") + st.markdown("**Data Types**") + + dtype_counts = 
df.dtypes.astype(str).value_counts().reset_index() + dtype_counts.columns = ["Data Type", "Count"] + + if dtype_counts.empty: + st.info("No datatype profile is available.") + else: + fig_dtype = px.pie( + dtype_counts, + names="Data Type", + values="Count", + hole=0.6, + ) + fig_dtype.update_layout(height=360, margin=dict(l=0, r=0, t=10, b=0)) + st.plotly_chart(fig_dtype, use_container_width=True) + + # --- Consistency Summary Card / Expander (Analysis tab summary view) --- + if 'Consistency' in selected_metrics: + cons_left, cons_right = st.columns([1.2, 1], gap="medium") + if res.get('Consistency') is None: + cons_left, cons_right = st.columns([1.2, 1], gap="medium") + with cons_left: + with st.expander("Consistency", expanded=True): + na_reason = cfg.get('cons_na_reason') or 'No valid consistency rule could be evaluated.' + st.markdown("### Consistency") + k1, k2, k3 = st.columns(3) + with k1: + st.container(border=True).metric("Consistency Score", "N/A") + with k2: + st.container(border=True).metric("Passed", "N/A") + with k3: + st.container(border=True).metric("Violations", "N/A") + st.info(f"Consistency could not be computed. 
Reason: {na_reason}") + st.caption("This metric was selected for analysis, but the current configuration did not produce any valid consistency result.") + with cons_right: + with st.container(border=True): + st.subheader("Consistency Profile") + st.info( + f"**Status:** `N/A`\n\n" + f"**Reason:** `{na_reason}`\n\n" + f"**Aggregation:** `Micro-average`" + ) + # Skip the normal Consistency summary rendering below + pass + else: + with cons_left: + with st.expander("Consistency", expanded=True): + cons_score = float(res.get('Consistency', 0.0)) + cons_rule_results = cfg.get('cons_rule_results') or [] + overall_cons_passed = int(cfg.get('cons_passed_total', 0) or 0) + overall_cons_violations = int(cfg.get('cons_violations_total', 0) or 0) + overall_cons_evaluated = int(cfg.get('cons_evaluated_total', overall_cons_passed + overall_cons_violations) or 0) + + k1, k2, k3 = st.columns(3) + with k1: + st.container(border=True).metric("Consistency Score", f"{cons_score:.2f}%") + with k2: + st.container(border=True).metric("Passed", f"{overall_cons_passed:,}") + with k3: + st.container(border=True).metric( + "Violations", + f"{overall_cons_violations:,}", + f"{(overall_cons_violations / overall_cons_evaluated) * 100:.1f}%" if overall_cons_evaluated else "0.0%", + delta_color="inverse" + ) + + fig_cons = px.pie( + values=[overall_cons_passed, overall_cons_violations], + names=['Passed', 'Violated'], + color=['Passed', 'Violated'], + color_discrete_map={'Passed': '#10b981', 'Violated': '#ef4444'}, + hole=0.6, + ) + fig_cons.update_layout(height=320, margin=dict(l=0, r=0, t=10, b=0)) + st.plotly_chart(fig_cons, use_container_width=True) + + cons_columns = cfg.get('cons_columns_used') or [] + cons_modes = cfg.get('cons_modes_used') or [] + + cols_txt = ', '.join(cons_columns) if cons_columns else 'N/A' + modes_txt = ', '.join(cons_modes) if cons_modes else 'N/A' + st.caption( + f"Overall consistency summary across {len(cons_rule_results)} configured rule(s). 
" + f"Columns: {cols_txt} | Modes: {modes_txt} | Aggregation: micro-average." + ) + + if cons_rule_results: + st.markdown("---") + st.subheader("Consistency Rule Results") + cons_summary_table = cfg.get('cons_summary_table') or [] + cons_rules_df = pd.DataFrame(cons_summary_table) + st.dataframe(cons_rules_df, use_container_width=True) + st.caption("This table summarizes the result of each configured consistency rule.") + + if st.button("View Detailed Consistency", use_container_width=True, key="btn_detail_cons_summary"): + st.session_state['view_mode'] = 'detail_cons' + st.rerun() + + with cons_right: + with st.container(border=True): + st.subheader("Consistency Profile") + st.markdown("**Rule Summary**") + + cons_rule_count = int(cfg.get('cons_rule_count', 0) or 0) + cons_columns = cfg.get('cons_columns_used') or [] + cons_modes = cfg.get('cons_modes_used') or [] + cons_rule_lines = cfg.get('cons_rules_applied') or [] + + cols_txt = ", ".join([f"`{c}`" for c in cons_columns]) if cons_columns else "N/A" + modes_txt = ", ".join([f"`{m}`" for m in cons_modes]) if cons_modes else "N/A" + + st.info( + f"**Configured Columns:** {cols_txt}\n\n" + f"**Configured Modes:** {modes_txt}\n\n" + f"**Rule Count:** `{cons_rule_count}`\n\n" + f"**Aggregation:** `Micro-average`" + ) + + if cons_rule_lines: + st.markdown("**Applied Rules**") + for idx, line in enumerate(cons_rule_lines, start=1): + st.caption(f"Rule {idx}. {line}") + + # --- Reliability Summary Card / Expander (Analysis tab summary view) --- + if 'Reliability' in selected_metrics: + rel_left, rel_right = st.columns([1.2, 1], gap="medium") + + if res.get('Reliability') is None: + with rel_left: + with st.expander("Reliability", expanded=True): + na_reason = cfg.get('rel_na_reason') or 'No valid reliability result could be evaluated.' 
+ st.markdown("### Reliability") + k1, k2, k3 = st.columns(3) + with k1: + st.container(border=True).metric("Overall CV", "N/A") + with k2: + st.container(border=True).metric("Selected Columns", "N/A") + with k3: + st.container(border=True).metric("Valid Reliability Columns", "N/A") + st.info(f"Reliability could not be computed. Reason: {na_reason}") + st.caption("This metric was selected for analysis, but the current configuration did not produce any valid reliability result.") + with rel_right: + with st.container(border=True): + st.subheader("Reliability Profile") + st.info( + f"**Status:** `N/A`\n\n" + f"**Reason:** `{na_reason}`\n\n" + f"**Aggregation:** `Average CV across valid selected columns`" + ) + else: + with rel_left: + with st.expander("Reliability", expanded=True): + rel_score = float(res.get('Reliability', 0.0)) + rel_results = cfg.get('rel_results') or [] + rel_selected_columns = cfg.get('rel_selected_columns') or [] + rel_overall_cv = float(cfg.get('rel_overall_cv', rel_score) or rel_score) + rel_valid_column_count = int(cfg.get('rel_valid_column_count', 0) or 0) + + representative_col = cfg.get('rel_representative_column') or 'N/A' + representative_cv = cfg.get('rel_representative_cv', None) + representative_level = cfg.get('rel_representative_level') or 'N/A' + representative_warning = cfg.get('rel_representative_warning') + + k1, k2, k3 = st.columns(3) + with k1: + st.container(border=True).metric("Overall CV", f"{rel_overall_cv:.2f}%") + with k2: + st.container(border=True).metric("Selected Columns", f"{len(rel_selected_columns):,}") + with k3: + st.container(border=True).metric("Valid Reliability Columns", f"{rel_valid_column_count:,}") + + rel_summary_table = cfg.get('rel_summary_table') or [] + if rel_summary_table: + show_df = pd.DataFrame(rel_summary_table) + st.dataframe(show_df, use_container_width=True) + st.caption("Reliability is summarized using the average CV across selected numeric columns that produced valid results.") + else: + 
st.info("No valid numeric columns produced a reliability result.") + + if st.button("View Detailed Reliability", use_container_width=True, key="btn_detail_rel_summary"): + st.session_state['view_mode'] = 'detail_rel' + st.rerun() + + with rel_right: + with st.container(border=True): + st.subheader("Reliability Profile") + + if representative_cv is None: + st.info( + "**Representative Column:** `N/A`\n\n" + "**Representative CV:** `N/A`\n\n" + "**Reliability Level:** `N/A`" + ) + else: + profile_text = ( + f"**Representative Column:** `{representative_col}`\n\n" + f"**Representative CV:** `{float(representative_cv):.2f}%`\n\n" + f"**Reliability Level:** `{representative_level}`" + ) + if representative_warning: + profile_text += f"\n\n**Warning:** `{representative_warning}`" + st.info(profile_text) + + # --- Uniqueness Summary Card / Expander (Analysis tab summary view) --- + if 'Uniqueness' in selected_metrics: + uniq_left, uniq_right = st.columns([1.2, 1], gap="medium") + + if res.get('Uniqueness') is None: + with uniq_left: + with st.expander("Uniqueness", expanded=True): + na_reason = cfg.get('uniq_na_reason') or 'No valid uniqueness result could be evaluated.' + st.markdown("### Uniqueness") + k1, k2, k3 = st.columns(3) + with k1: + st.container(border=True).metric("Uniqueness Score", "N/A") + with k2: + st.container(border=True).metric("Unique Rows", "N/A") + with k3: + st.container(border=True).metric("Duplicate Rows", "N/A") + st.info(f"Uniqueness could not be computed. 
Reason: {na_reason}") + st.caption("This metric was selected for analysis, but the current configuration did not produce any valid uniqueness result.") + with uniq_right: + with st.container(border=True): + st.subheader("Uniqueness Profile") + st.info( + f"**Status:** `N/A`\n\n" + f"**Reason:** `{na_reason}`" + ) + else: + uniq_score = float(res.get('Uniqueness', 0.0)) + total_rows = int(cfg.get('uniq_total_rows', 0) or 0) + duplicate_rows = int(cfg.get('uniq_duplicate_rows', 0) or 0) + unique_rows = int(cfg.get('uniq_unique_rows', 0) or 0) + duplicate_rate = float(cfg.get('uniq_duplicate_rate', 0.0) or 0.0) + uniq_key_cols = cfg.get('uniq_key_cols') or [] + uniq_profile_mode = cfg.get('uniq_profile_mode') or 'Current Dataset Only' + uniq_profile_basis = cfg.get('uniq_profile_basis') or 'Full-row duplicate check across all columns' + + with uniq_left: + with st.expander("Uniqueness", expanded=True): + k1, k2, k3 = st.columns(3) + with k1: + st.container(border=True).metric("Uniqueness Score", f"{uniq_score:.2f}%") + with k2: + st.container(border=True).metric("Unique Rows", f"{unique_rows:,}") + with k3: + st.container(border=True).metric("Duplicate Rows", f"{duplicate_rows:,}", f"{duplicate_rate:.2f}%" if total_rows else "0%", delta_color="inverse") + + fig_uniq = px.pie( + values=[unique_rows, duplicate_rows], + names=['Unique', 'Duplicate'], + hole=0.6, + ) + fig_uniq.update_layout(height=320, margin=dict(l=0, r=0, t=10, b=0)) + st.plotly_chart(fig_uniq, use_container_width=True) + st.caption("Current-dataset uniqueness is summarized as the proportion of non-duplicate rows under the selected basis.") + + if st.button("View Detailed Uniqueness", use_container_width=True, key="btn_detail_uniq_summary"): + st.session_state['view_mode'] = 'detail_uniq' + st.rerun() + + with uniq_right: + with st.container(border=True): + st.subheader("Uniqueness Profile") + if uniq_key_cols: + st.info( + f"**Mode:** `{uniq_profile_mode}`\n\n" + f"**Basis:** 
`{uniq_profile_basis}`\n\n" + f"**Keys:** {', '.join([f'`{c}`' for c in uniq_key_cols])}" + ) + else: + st.info( + f"**Mode:** `{uniq_profile_mode}`\n\n" + f"**Basis:** `{uniq_profile_basis}`" + ) + + # --- Accuracy Summary Card / Expander (Analysis tab summary view) --- + if 'Accuracy' in selected_metrics: + acc_left, acc_right = st.columns([1.2, 1], gap="medium") + if res.get('Accuracy') is None: + acc_left, acc_right = st.columns([1.2, 1], gap="medium") + with acc_left: + with st.expander("Accuracy", expanded=True): + na_reason = cfg.get('acc_na_reason') or 'No valid accuracy rule could be evaluated.' + st.markdown("### Accuracy") + k1, k2, k3, k4 = st.columns(4) + with k1: + st.container(border=True).metric("Accuracy Score", "N/A") + with k2: + st.container(border=True).metric("Checked Cases", "N/A") + with k3: + st.container(border=True).metric("Accurate / In Rule", "N/A") + with k4: + st.container(border=True).metric("Coverage", "N/A") + st.info(f"Accuracy could not be computed. Reason: {na_reason}") + st.caption("This metric was selected for analysis, but the current configuration did not produce any valid accuracy result.") + with acc_right: + with st.container(border=True): + st.subheader("Accuracy Profile") + st.info( + f"**Status:** `N/A`\n\n" + f"**Reason:** `{na_reason}`\n\n" + f"**Aggregation:** `Micro-average`" + ) + # Skip the normal Accuracy summary rendering below + pass + else: + with acc_left: + with st.expander("Accuracy", expanded=True): + acc_score = float(res.get('Accuracy', 0.0)) + acc_rule_results = cfg.get('acc_rule_results') or [] + overall_acc_checked = int(cfg.get('acc_checked_total', 0) or 0) + overall_acc_accurate = int(cfg.get('acc_accurate_total', 0) or 0) + overall_acc_inaccurate = int(cfg.get('acc_inaccurate_total', 0) or 0) + overall_acc_coverage = float(cfg.get('acc_coverage', 0.0) or 0.0) + acc_methods = cfg.get('acc_methods_used') or [] + acc_columns = cfg.get('acc_columns_used') or [] + task_label = cfg.get('acc_task_label') 
or 'N/A' + + k1, k2, k3, k4 = st.columns(4) + with k1: + st.container(border=True).metric("Accuracy Score", f"{acc_score:.2f}%") + with k2: + st.container(border=True).metric("Checked Cases", f"{overall_acc_checked:,}") + with k3: + st.container(border=True).metric("Accurate / In Rule", f"{overall_acc_accurate:,}") + with k4: + st.container(border=True).metric("Coverage", f"{overall_acc_coverage*100:.2f}%") + + st.caption( + f"Task: {task_label} | Checked cases: {overall_acc_checked:,} | Accurate / Within rule: {overall_acc_accurate:,} | " + f"Inaccurate / Outside rule: {overall_acc_inaccurate:,} | Aggregation: micro-average" + ) + + fig_acc = px.pie( + values=[overall_acc_accurate, overall_acc_inaccurate], + names=['Within Rule', 'Outside Rule'], + hole=0.6, + ) + fig_acc.update_layout(height=320, margin=dict(l=0, r=0, t=10, b=0)) + st.plotly_chart(fig_acc, use_container_width=True) + + cols_txt = ', '.join(acc_columns) if acc_columns else 'N/A' + methods_txt = ', '.join(acc_methods) if acc_methods else 'N/A' + st.caption( + f"Overall accuracy summary across {len(acc_rule_results)} configured rule(s). " + f"Columns: {cols_txt} | Methods: {methods_txt} | Aggregation: micro-average." 
+ ) + + if acc_rule_results: + st.markdown("---") + st.subheader("Accuracy Rule Results") + acc_summary_table = cfg.get('acc_summary_table') or [] + acc_rules_df = pd.DataFrame(acc_summary_table) + st.dataframe(acc_rules_df, use_container_width=True) + st.caption("This table summarizes the result of each configured accuracy rule.") + + if st.button("View Detailed Accuracy", use_container_width=True, key="btn_detail_acc_summary"): + st.session_state['view_mode'] = 'detail_acc' + st.rerun() + + with acc_right: + with st.container(border=True): + st.subheader("Accuracy Profile") + + acc_rule_count = int(cfg.get('acc_rule_count', 0) or 0) + acc_columns = cfg.get('acc_columns_used') or [] + acc_methods = cfg.get('acc_methods_used') or [] + acc_rule_lines = cfg.get('acc_rules_applied') or [] + + cols_txt = ", ".join([f"`{c}`" for c in acc_columns]) if acc_columns else "N/A" + methods_txt = ", ".join([f"`{m}`" for m in acc_methods]) if acc_methods else "N/A" + + st.info( + f"**Configured Columns:** {cols_txt}\n\n" + f"**Configured Methods:** {methods_txt}\n\n" + f"**Rule Count:** `{acc_rule_count}`\n\n" + f"**Aggregation:** `Micro-average`" + ) + + if acc_rule_lines: + st.markdown("**Applied Rules**") + for idx, line in enumerate(acc_rule_lines, start=1): + st.caption(f"Rule {idx}. {line}") + + # --- Timeliness Summary Card / Expander (Analysis tab summary view) --- + if 'Timeliness' in selected_metrics: + time_left, time_right = st.columns([1.2, 1], gap="medium") + + if res.get('Timeliness') is None: + with time_left: + with st.expander("Timeliness", expanded=True): + na_reason = cfg.get('time_na_reason') or 'No valid timeliness result could be evaluated.' + st.markdown("### Timeliness") + k1, k2, k3 = st.columns(3) + with k1: + st.container(border=True).metric("Timeliness Score", "N/A") + with k2: + st.container(border=True).metric("Avg Latency (s)", "N/A") + with k3: + st.container(border=True).metric("SLA (s)", "N/A") + st.info(f"Timeliness could not be computed. 
Reason: {na_reason}") + st.caption("This metric was selected for analysis, but the current configuration did not produce any valid timeliness result.") + with time_right: + with st.container(border=True): + st.subheader("Timeliness Profile") + st.info( + f"**Status:** `N/A`\n\n" + f"**Reason:** `{na_reason}`" + ) + else: + with time_left: + with st.expander("Timeliness", expanded=True): + time_score = float(res.get('Timeliness', 0.0)) + avg_lat = cfg.get('t_avg_latency', None) + p95_lat = cfg.get('t_p95_latency', None) + sla = float(cfg.get('t_sla_seconds', 0.0) or 0.0) + event_col = cfg.get('t_event', 'N/A') + valid_pairs = int(cfg.get('t_pairs_valid', 0) or 0) + negative_pairs = int(cfg.get('t_negative_latency', 0) or 0) + profile_mode = cfg.get('t_profile_mode') or 'System Time Column' + profile_reference_label = cfg.get('t_profile_reference_label') or 'System Time' + profile_reference_value = cfg.get('t_profile_reference_value') or 'N/A' + profile_event_col = cfg.get('t_profile_event_col') or event_col + profile_p95_latency = cfg.get('t_profile_p95_latency', p95_lat) + + k1, k2, k3 = st.columns(3) + with k1: + st.container(border=True).metric("Timeliness Score", f"{time_score:.2f}%") + with k2: + st.container(border=True).metric("Avg Latency (s)", f"{float(avg_lat):.4f}" if avg_lat is not None else 'N/A') + with k3: + st.container(border=True).metric("SLA (s)", f"{sla:.4f}") + + k4, k5 = st.columns(2) + with k4: + st.container(border=True).metric("Valid Pairs", f"{valid_pairs:,}") + with k5: + st.container(border=True).metric("Negative Pairs", f"{negative_pairs:,}") + + vals = [] + names = [] + if avg_lat is not None: + vals.append(float(avg_lat)) + names.append('Average') + if p95_lat is not None: + vals.append(float(p95_lat)) + names.append('P95') + vals.append(float(sla)) + names.append('SLA') + + fig_time = px.bar(x=names, y=vals, labels={'x': 'Metric', 'y': 'Seconds'}) + fig_time.update_layout(height=320, margin=dict(l=0, r=0, t=10, b=0)) + 
st.plotly_chart(fig_time, use_container_width=True) + st.caption("Timeliness summary compares average / p95 latency against the configured SLA.") + + if st.button("View Detailed Timeliness", use_container_width=True, key="btn_detail_time_summary"): + st.session_state['view_mode'] = 'detail_time' + st.rerun() + + with time_right: + with st.container(border=True): + st.subheader("Timeliness Profile") + p95_text = f"{float(profile_p95_latency):.4f}" if profile_p95_latency is not None else 'N/A' + st.info( + f"**Reference Mode:** `{profile_mode}`\n\n" + f"**Event Time:** `{profile_event_col}`\n\n" + f"**{profile_reference_label}:** `{profile_reference_value}`\n\n" + f"**P95 Latency:** `{p95_text}`" + ) + + if not any(metric in selected_metrics for metric in ['Completeness', 'Reliability', 'Uniqueness', 'Accuracy', 'Consistency', 'Timeliness']): + st.info("No summary panels are available for the currently selected metric(s) yet. Please open a detailed view if available.") + # --------------------------------------------------------- + # [CASE 1] Completeness 상세 분석 (고도화 버전) + # --------------------------------------------------------- + elif st.session_state['view_mode'] == 'detail_comp': + # 상단 네비게이션 + col_nav, col_empty = st.columns([1, 5]) + with col_nav: + if st.button("⬅️ Back to Analysis", use_container_width=True): + st.session_state['view_mode'] = 'summary'; st.rerun() + + # 헤더 섹션 (이미지 42079e 스타일 반영) + comp_score = res.get('Completeness', 0) + st.markdown(f""" +
+

Completeness Analysis

+ {comp_score:.1f}% + +
+

Completeness measures the degree to which all required data values are present, based on the selected required columns, missing-value definitions, and granularity.

+ """, unsafe_allow_html=True) + + gauge_col, gauge_info_col = st.columns([1.2, 1], gap="medium") + with gauge_col: + fig_gauge = go.Figure(go.Indicator( + mode="gauge+number", + value=float(comp_score), + number={'suffix': "%", 'valueformat': '.1f'}, + title={'text': "Overall Completeness Score"}, + gauge={ + 'axis': {'range': [0, 100]}, + 'bar': {'color': "#10b981"}, + 'steps': [ + {'range': [0, 60], 'color': "#fee2e2"}, + {'range': [60, 85], 'color': "#fef3c7"}, + {'range': [85, 100], 'color': "#dcfce7"}, + ], + 'threshold': { + 'line': {'color': "#111827", 'width': 4}, + 'thickness': 0.75, + 'value': float(comp_score) + } + } + )) + fig_gauge.update_layout(height=280, margin=dict(l=20, r=20, t=60, b=20)) + st.plotly_chart(fig_gauge, use_container_width=True) + + with gauge_info_col: + st.subheader("Score Interpretation") + if float(comp_score) >= 95: + st.success("Excellent completeness: most required values are present.") + elif float(comp_score) >= 80: + st.warning("Moderate completeness: some required values are missing and should be reviewed.") + else: + st.error("Low completeness: missing required values are materially affecting dataset quality.") + + st.caption( + "This overall score reflects the completeness result calculated from the selected required columns, missing-value definitions, and granularity." + ) + + st.markdown("---") + + # 1. 
KPI 카드 섹션 (이미지 42079e 상단 카드 레이아웃) + kpi1, kpi2, kpi3, kpi4 = st.columns(4) + total_recs = int(cfg.get('comp_total_records', len(df)) or 0) + used_cols = cfg.get('comp_used_cols') or [] + scope_count = int(cfg.get('comp_scope_count', len(used_cols)) or 0) + + missing_cells = int(cfg.get('comp_missing_cells', 0) or 0) + complete_recs = int(cfg.get('comp_complete_records', cfg.get('comp_complete_rows', 0)) or 0) + incomplete_recs = int(cfg.get('comp_incomplete_records', cfg.get('comp_incomplete_rows', max(total_recs - complete_recs, 0))) or 0) + comp_pct = float(cfg.get('comp_complete_pct', (complete_recs / total_recs * 100.0) if total_recs else 0.0) or 0.0) + incomp_pct = float(cfg.get('comp_incomplete_pct', (incomplete_recs / total_recs * 100.0) if total_recs else 0.0) or 0.0) + + with kpi1: + st.container(border=True).metric("Total Records", f"{total_recs:,}") + with kpi2: + st.container(border=True).metric("Complete Records", f"{complete_recs:,}", f"{comp_pct:.1f}%") + with kpi3: + st.container(border=True).metric("Incomplete Records", f"{incomplete_recs:,}", f"-{incomp_pct:.1f}%", delta_color="inverse") + with kpi4: + st.container(border=True).metric("Missing Cells", f"{missing_cells:,}") + + # 2. 
메인 분석 영역 (좌: 컬럼별/트렌드, 우: 이슈/액션) + col_left, col_right = st.columns([2, 1], gap="medium") + + with col_left: + st.subheader("Missing Values Heatmap") + + heatmap_columns = cfg.get('comp_heatmap_columns') or [] + heatmap_data = cfg.get('comp_heatmap_data') or [] + heatmap_df = pd.DataFrame(heatmap_data, columns=heatmap_columns) if heatmap_columns and heatmap_data else pd.DataFrame() + + if heatmap_df.shape[0] == 0 or heatmap_df.shape[1] == 0: + st.info("No data available to render the missing-values heatmap.") + else: + fig_heat = px.imshow( + heatmap_df, + aspect="auto", + color_continuous_scale=[[0, "#10b981"], [1, "#ef4444"]], + labels={"x": "Columns", "y": "Row Index", "color": "Missing"}, + title="Missingness Pattern (Top 100 Rows)", + ) + fig_heat.update_layout(height=420, margin=dict(l=0, r=0, t=50, b=0)) + st.plotly_chart(fig_heat, use_container_width=True) + st.caption("Green = present, Red = missing. Displaying the first 100 rows only for readability.") + st.caption("The heatmap reflects the selected completeness scope only, using the required columns and missing-value rules defined during configuration.") + + # 컬럼별 완전성 (이미지 42079e 좌측 바 차트) + st.subheader("Completeness by Column") + comp_detail_scores = cfg.get('comp_detail_column_scores') or [] + comp_df = pd.DataFrame(comp_detail_scores) + + if comp_df.empty: + st.info("No column-level completeness information is available.") + else: + fig_cols = px.bar( + comp_df, + x='Score', + y='Column', + orientation='h', + text=comp_df['Score Label'] if 'Score Label' in comp_df.columns else comp_df['Score'].apply(lambda x: f'{x:.1f}%'), + color='Score', + color_continuous_scale=[[0, '#ef4444'], [0.8, '#f59e0b'], [1, '#10b981']] + ) + fig_cols.update_layout(height=400, showlegend=False, coloraxis_showscale=False, + margin=dict(l=0, r=0, t=0, b=0), xaxis_range=[0, 110]) + st.plotly_chart(fig_cols, use_container_width=True) + st.caption("Columns with lower completeness scores appear first so high-risk fields are easier 
to identify.") + + st.subheader("Missing Count by Column") + miss_count_df = pd.DataFrame(cfg.get('comp_missing_count_table') or []) + + if miss_count_df.empty: + st.info("No column-level missing-count information is available.") + else: + fig_missing_cnt = px.bar( + miss_count_df, + x='Missing Count', + y='Column', + orientation='h', + text='Missing Count', + color='Missing Count', + color_continuous_scale=[[0, '#10b981'], [0.5, '#f59e0b'], [1, '#ef4444']] + ) + fig_missing_cnt.update_layout( + height=300, + margin=dict(l=0, r=0, t=10, b=0), + coloraxis_showscale=False + ) + st.plotly_chart(fig_missing_cnt, use_container_width=True) + st.caption("This chart shows how many missing cells were found in each selected column.") + + with col_right: + gran = cfg.get('comp_granularity_used', cfg.get('comp_granularity', 'Dataset')) + dataset_score = float(cfg.get('comp_dataset_score', comp_score)) + row_score = float(cfg.get('comp_row_score', comp_score)) + col_scores_dict = cfg.get('comp_col_scores', {}) or {} + column_score = float(np.mean(list(col_scores_dict.values()))) if len(col_scores_dict) > 0 else float(comp_score) + + if gran == 'Row': + primary_label = 'Primary Completeness Score (Row)' + primary_score = row_score + elif gran == 'Column': + primary_label = 'Primary Completeness Score (Column)' + primary_score = column_score + else: + primary_label = 'Primary Completeness Score (Dataset)' + primary_score = dataset_score + + st.container(border=True).metric(primary_label, f"{primary_score:.2f}%") + st.caption("This highlighted score reflects the completeness granularity selected during configuration.") + st.caption(f"Selected completeness scope: {scope_count} column(s) are included in this evaluation.") + st.markdown("---") + + st.subheader("Metric Details") + + total_rows = int(cfg.get('comp_total_records', total_recs) or 0) + used_cols = cfg.get('comp_used_cols') or [] + # gran, dataset_score, row_score, col_scores_dict, column_score already defined above + 
missing_cells = int(cfg.get('comp_missing_cells', 0) or 0) + complete_rows = int(cfg.get('comp_complete_records', cfg.get('comp_complete_rows', 0)) or 0) + incomplete_rows = int(cfg.get('comp_incomplete_records', cfg.get('comp_incomplete_rows', max(total_rows - complete_rows, 0))) or 0) + + d1, d2 = st.columns(2) + with d1: + st.container(border=True).metric("Dataset Score", f"{dataset_score:.2f}%") + with d2: + st.container(border=True).metric("Row Score", f"{row_score:.2f}%") + + d3, d4 = st.columns(2) + with d3: + st.container(border=True).metric("Column Score", f"{column_score:.2f}%") + with d4: + st.container(border=True).metric("Selected Granularity", str(gran)) + + d5, d6 = st.columns(2) + with d5: + st.container(border=True).metric("Complete Records", f"{complete_rows:,}") + with d6: + st.container(border=True).metric("Incomplete Records", f"{incomplete_rows:,}") + + st.container(border=True).metric("Missing Cells", f"{missing_cells:,}") + st.caption("These metrics summarize completeness using the selected required columns and missing-value definitions.") + + st.markdown("---") + st.subheader("Issue Reports & Actions") + + comp_issue_reports = cfg.get('comp_issue_reports') or [] + + if not comp_issue_reports: + st.success("No completeness issues detected in the selected scope.") + else: + for issue in comp_issue_reports: + col_name = issue.get('column', 'N/A') + miss_pct = float(issue.get('missing_rate', 0.0) or 0.0) + severity = issue.get('severity', 'Low') + action = issue.get('action', 'Monitor this column and validate whether the missingness is expected or structural.') + + st.warning( + f"[{severity}] Column `{col_name}` has {miss_pct:.1f}% missing values. 
" + f"Recommended action: {action}" + ) + st.caption(f"Issue summary: `{issue.get('summary', f'{col_name} currently shows a {miss_pct:.1f}% missing-value rate in the selected completeness scope.')}`") + + st.caption("Issue reports are based on the selected required columns and missing-value rules.") + + # --------------------------------------------------------- + # [CASE 1.5] Reliability 상세 분석 + # --------------------------------------------------------- + elif st.session_state['view_mode'] == 'detail_rel': + col_back, col_title = st.columns([1, 5]) + with col_back: + if st.button("⬅️ Back to Analysis", use_container_width=True): + st.session_state['view_mode'] = 'summary'; st.rerun() + + rel_score = res.get('Reliability', None) + rel_status = cfg.get('rel_status', 'ok') + rel_na_reason = cfg.get('rel_na_reason') + rel_results = cfg.get('rel_results') or [] + rel_selected_columns = cfg.get('rel_selected_columns') or [] + rel_overall_cv = cfg.get('rel_overall_cv', rel_score) + rel_valid_column_count = int(cfg.get('rel_valid_column_count', 0) or 0) + rel_configured_results_table = cfg.get('rel_configured_results_table') or [] + rel_detail_selector_options = cfg.get('rel_detail_selector_options') or [] + rel_detail_results = cfg.get('rel_detail_results') or [] + + if rel_score is None or rel_status == 'n/a' or not rel_results: + st.markdown(""" +
+

Reliability Analysis

+ N/A +
+

Reliability requires one or more numeric columns.

+ """, unsafe_allow_html=True) + st.info(f"Reliability = N/A. {rel_na_reason or 'Please select at least one numeric column in Reliability Settings.'}") + st.stop() + + st.markdown(f""" +
+

Reliability Analysis

+ {float(rel_overall_cv):.2f}% + +
+

Reliability is evaluated using the Coefficient of Variation (CV) = (std / mean) * 100 for each selected numeric column.

+ """, unsafe_allow_html=True) + st.markdown("---") + + k1, k2, k3 = st.columns(3) + with k1: + st.container(border=True).metric("Overall CV", f"{float(rel_overall_cv):.2f}%") + with k2: + st.container(border=True).metric("Selected Columns", f"{len(rel_selected_columns):,}") + with k3: + st.container(border=True).metric("Valid Reliability Columns", f"{rel_valid_column_count:,}") + + st.caption( + "Overall Reliability is summarized as the average CV across the selected columns that produced valid reliability results." + ) + st.markdown("---") + + rel_df = pd.DataFrame(rel_configured_results_table) + if not rel_df.empty: + st.subheader("Configured Reliability Results") + st.dataframe(rel_df, use_container_width=True) + st.markdown("---") + + if not rel_detail_results: + st.info("No valid numeric columns produced a Reliability result.") + st.stop() + + selected_label = st.selectbox( + "Select Reliability Result to View", + rel_detail_selector_options, + index=0, + key="detail_rel_selector", + help="Choose which configured reliability result to inspect in detail." 
+ ) + selected_rel = next((r for r in rel_detail_results if r.get('label') == selected_label), rel_detail_results[0]) + + rel_col = selected_rel.get('column') + rel_cv = float(selected_rel.get('cv', 0.0) or 0.0) + rel_valid = int(selected_rel.get('valid_count', 0) or 0) + rel_mean = selected_rel.get('mean', None) + rel_std = selected_rel.get('std', None) + rel_level = selected_rel.get('level', 'N/A') + rel_warning = selected_rel.get('warning') + rel_high_max = float(selected_rel.get('high_max', 5.0) or 5.0) + rel_moderate_max = float(selected_rel.get('moderate_max', 15.0) or 15.0) + histogram_values = selected_rel.get('histogram_values') or [] + rolling_window = selected_rel.get('rolling_window') + rolling_cv_points = selected_rel.get('rolling_cv_points') or [] + + st.caption( + f"Showing detail for column `{rel_col}` | CV {rel_cv:.2f}% | {rel_level}" + ) + st.markdown("---") + + + k1, k2, k3, k4 = st.columns(4) + with k1: + st.container(border=True).metric("Selected Column", f"{rel_col}") + with k2: + st.container(border=True).metric("Valid Values", f"{rel_valid:,}") + with k3: + st.container(border=True).metric("Mean", f"{float(rel_mean):.4f}" if rel_mean is not None and pd.notna(rel_mean) else "N/A") + with k4: + st.container(border=True).metric("Std Dev", f"{float(rel_std):.4f}" if rel_std is not None and pd.notna(rel_std) else "N/A") + + k5, k6, k7 = st.columns(3) + with k5: + st.container(border=True).metric("CV", f"{rel_cv:.2f}%") + with k6: + st.container(border=True).metric("Reliability Level", rel_level) + with k7: + st.container(border=True).metric("Thresholds", f"≤ {rel_high_max:.2f} / ≤ {rel_moderate_max:.2f}") + + col_left, col_right = st.columns([2, 1], gap="medium") + + with col_left: + st.subheader("Value Distribution") + if not histogram_values: + st.info("No valid numeric values are available for the selected reliability column.") + else: + fig_hist = px.histogram( + x=histogram_values, + nbins=40, + labels={'x': rel_col, 'y': 'count'}, + ) + 
fig_hist.update_layout(height=360, margin=dict(l=0, r=0, t=10, b=0)) + st.plotly_chart(fig_hist, use_container_width=True) + st.caption("Distribution of valid numeric values for the selected column.") + + st.subheader("Rolling Stability (Index Order)") + if rolling_cv_points: + roll_df = pd.DataFrame(rolling_cv_points) + fig_roll = go.Figure() + fig_roll.add_trace(go.Scatter( + x=roll_df['index'], + y=roll_df['CV%'], + mode='lines', + name='CV%' + )) + fig_roll.add_hline(y=rel_high_max, line_dash='dash') + fig_roll.add_hline(y=rel_moderate_max, line_dash='dot') + fig_roll.update_layout( + height=300, + margin=dict(l=0, r=0, t=10, b=0), + xaxis_title='index', + yaxis_title='CV%' + ) + st.plotly_chart(fig_roll, use_container_width=True) + st.caption( + f"Rolling window = {rolling_window} rows. Dashed line = High Reliability threshold, dotted line = Moderate Reliability threshold." + ) + else: + st.info("Not enough numeric samples to show rolling stability.") + + with col_right: + st.subheader("Interpretation") + if rel_level == 'High Reliability': + st.success("✅ High Reliability: The selected column shows low relative variability.") + elif rel_level == 'Moderate Reliability': + st.warning("⚠️ Moderate Reliability: The selected column shows noticeable variability and should be reviewed.") + else: + st.error("🚨 Low Reliability: The selected column shows high relative variability.") + + st.markdown("---") + st.subheader("Reliability Notes") + st.info( + f"**Column:** `{rel_col}`\n\n" + f"**CV:** `{rel_cv:.2f}%`\n\n" + f"**Rule:** `High ≤ {rel_high_max:.2f}% / Moderate ≤ {rel_moderate_max:.2f}% / Otherwise Low`" + ) + + if rel_warning: + st.markdown("---") + st.subheader("Warnings") + st.warning(rel_warning) + + st.markdown("---") + st.subheader("Recommended Actions") + with st.container(border=True): + st.markdown( + "✅ **Action 1**: Verify the selected field’s unit and scaling. 
CV can become unstable when the mean is close to zero.\n\n" + "🔧 **Action 2**: If Reliability is low, inspect outliers, sensor calibration, or mixed populations in the selected column." + ) + + # --------------------------------------------------------- + # [CASE 1.6] Uniqueness 상세 분석 + # --------------------------------------------------------- + elif st.session_state['view_mode'] == 'detail_uniq': + col_back, col_title = st.columns([1, 5]) + with col_back: + if st.button("⬅️ Back to Analysis", use_container_width=True): + st.session_state['view_mode'] = 'summary'; st.rerun() + + uniq_score = res.get('Uniqueness', None) + uniq_status = cfg.get('uniq_status', 'ok') + uniq_na_reason = cfg.get('uniq_na_reason') + uniq_key_cols = cfg.get('uniq_key_cols') or [] + uniq_mode_label = cfg.get('uniq_mode_label') or 'Current Dataset Only' + uniq_basis_label = cfg.get('uniq_basis_label') or ('1 key column' if len(uniq_key_cols) == 1 else (f'{len(uniq_key_cols)} key columns' if uniq_key_cols else 'All columns')) + uniq_basis_notes = cfg.get('uniq_basis_notes') or ('Duplicates are evaluated based on the selected key columns only.' if uniq_key_cols else 'Duplicates are evaluated using full-row duplicates across ALL columns.') + uniq_key_caption = cfg.get('uniq_key_caption') or '' + uniq_duplicate_preview_rows = cfg.get('uniq_duplicate_preview_rows') or [] + uniq_duplicate_preview_count = int(cfg.get('uniq_duplicate_preview_count', 0) or 0) + + if uniq_score is None or uniq_status == 'n/a': + st.markdown(""" +
+

Uniqueness Analysis

+ N/A +
+

Uniqueness could not be computed for the current dataset.

+ """, unsafe_allow_html=True) + st.info(f"Uniqueness = N/A. {uniq_na_reason or 'Please check the selected key columns and dataset.'}") + st.stop() + + total_rows = int(cfg.get('uniq_total_rows', len(df)) or 0) + duplicate_rows = int(cfg.get('uniq_duplicate_rows', 0) or 0) + unique_rows = int(cfg.get('uniq_unique_rows', max(total_rows - duplicate_rows, 0)) or 0) + duplicate_rate = float(cfg.get('uniq_duplicate_rate', 0.0) or 0.0) + mode_label = uniq_mode_label + + st.markdown(f""" +
+

Uniqueness Analysis

+ {float(uniq_score):.2f}% + +
+

Uniqueness evaluates whether records remain unique under the selected duplicate-detection basis.

+ """, unsafe_allow_html=True) + st.markdown("---") + + k1, k2, k3, k4 = st.columns(4) + with k1: + st.container(border=True).metric("Mode", mode_label) + with k2: + st.container(border=True).metric("Total Rows", f"{total_rows:,}") + with k3: + st.container(border=True).metric("Duplicate Rows", f"{duplicate_rows:,}", f"{duplicate_rate:.2f}%" if total_rows else "0%", delta_color="inverse") + with k4: + st.container(border=True).metric("Duplicate Basis", uniq_basis_label) + + k5, k6 = st.columns(2) + with k5: + st.container(border=True).metric("Unique Rows", f"{unique_rows:,}") + with k6: + st.container(border=True).metric("Uniqueness Score", f"{float(uniq_score):.2f}%") + + if uniq_key_cols: + st.caption("Selected uniqueness keys: " + ", ".join([f"`{c}`" for c in uniq_key_cols])) + else: + st.caption("No key columns selected. Full-row duplication across all columns was used.") + st.markdown("---") + + dup_rows_df = pd.DataFrame(uniq_duplicate_preview_rows) + + col_left, col_right = st.columns([2, 1], gap="medium") + with col_left: + st.subheader("Duplicate Records") + if dup_rows_df.empty: + st.success("✨ Excellent! No duplicates detected under the selected basis.") + else: + st.dataframe(dup_rows_df, use_container_width=True) + st.caption( + f"Showing the first 100 rows from {uniq_duplicate_preview_count:,} rows that belong to duplicate groups " + f"(including the first occurrence in each group)." 
+ ) + + st.subheader("Duplicate Rate Overview") + fig = px.pie( + values=[unique_rows, duplicate_rows], + names=['Unique', 'Duplicate'], + hole=0.6, + ) + fig.update_layout(height=300, margin=dict(l=0, r=0, t=10, b=0)) + st.plotly_chart(fig, use_container_width=True) + + with col_right: + st.subheader("Basis / Notes") + st.info(uniq_basis_notes) + if uniq_key_caption: + st.caption("Key columns: " + uniq_key_caption) + + st.markdown("---") + st.subheader("Recommended Actions") + with st.container(border=True): + st.markdown( + "🔍 **Action 1**: Choose stable identifier/key columns (e.g., IDs) to evaluate meaningful uniqueness.\n\n" + "🧹 **Action 2**: If duplicates are unexpected, trace the ingestion pipeline for replays/retries and consider deduplication rules." + ) + + # --------------------------------------------------------- + # [CASE 2] Accuracy 상세 분석 + # --------------------------------------------------------- + elif st.session_state['view_mode'] == 'detail_acc': + col_back, col_title = st.columns([1, 5]) + with col_back: + if st.button("⬅️ Back to Analysis", use_container_width=True): + st.session_state['view_mode'] = 'summary'; st.rerun() + + overall_acc_score = res.get('Accuracy', None) + acc_score = overall_acc_score + acc_rule_results = cfg.get('acc_rule_results') or [] + acc_configured_rules_table = cfg.get('acc_configured_rules_table') or [] + acc_detail_rule_options = cfg.get('acc_detail_rule_options') or [] + acc_detail_rule_payloads = cfg.get('acc_detail_rule_payloads') or [] + selected_acc_rule = None + + overall_acc_checked = int(cfg.get('acc_checked_total', 0) or 0) + overall_acc_accurate = int(cfg.get('acc_accurate_total', 0) or 0) + overall_acc_inaccurate = int(cfg.get('acc_inaccurate_total', 0) or 0) + + # If Accuracy is not applicable, show a clear message and stop here. + if overall_acc_score is None: + st.markdown(f""" +
+

Accuracy Analysis

+ N/A +
+

Accuracy requires a reference value or reference dataset. This dataset does not provide one.

+ """, unsafe_allow_html=True) + st.info("Accuracy = N/A (not applicable). Provide a reference column in the same dataset or use 'External Reference File' mode to upload a reference dataset and match by keys.") + st.stop() + + # Overall header first + st.markdown(f""" +
+

Accuracy Analysis

+ {overall_acc_score:.2f}% + +
+

Overall accuracy is aggregated across all configured accuracy rules using a micro-average based on evaluated cases.

+ """, unsafe_allow_html=True) + st.markdown("---") + + kpi1, kpi2, kpi3, kpi4 = st.columns(4) + with kpi1: + st.container(border=True).metric("Overall Accuracy", f"{overall_acc_score:.2f}%") + with kpi2: + st.container(border=True).metric("Total Checked", f"{overall_acc_checked:,}") + with kpi3: + st.container(border=True).metric("Total Accurate / In Rule", f"{overall_acc_accurate:,}") + with kpi4: + st.container(border=True).metric("Total Inaccurate / Out of Rule", f"{overall_acc_inaccurate:,}") + + st.caption("Overall accuracy is computed as Σ accurate / Σ checked × 100 across all valid configured accuracy rules.") + st.markdown("---") + + if acc_rule_results: + acc_rules_df = pd.DataFrame(acc_configured_rules_table) + st.subheader("Configured Accuracy Rules") + st.dataframe(acc_rules_df, use_container_width=True) + st.markdown("---") + + st.markdown("### Accuracy Rule Selection") + selected_label = st.selectbox( + "Select Accuracy Rule to View", + acc_detail_rule_options, + index=max(len(acc_detail_rule_options) - 1, 0), + key="detail_acc_rule_selector", + help="Choose which configured accuracy rule to inspect in detail." 
+ ) + selected_rule_payload = next( + (p for p in acc_detail_rule_payloads if p.get('label') == selected_label), + acc_detail_rule_payloads[-1] if acc_detail_rule_payloads else {} + ) + selected_acc_rule = next( + (r for r in acc_rule_results if r.get('rule_index') == selected_rule_payload.get('rule_index')), + acc_rule_results[-1] + ) + + selected_method = selected_rule_payload.get('selected_method', 'Reference Mapping') + selected_meta = selected_rule_payload.get('selected_meta') or {} + selected_column = selected_rule_payload.get('selected_column', cfg.get('target')) + selected_score = selected_rule_payload.get('selected_score', None) + selected_checked = int(selected_rule_payload.get('selected_checked', 0) or 0) + selected_accurate = int(selected_rule_payload.get('selected_accurate', 0) or 0) + selected_inaccurate = int(selected_rule_payload.get('selected_inaccurate', 0) or 0) + selected_coverage = float(selected_rule_payload.get('selected_coverage', 0.0) or 0.0) + selected_mae = selected_rule_payload.get('selected_mae', None) + selected_rmse = selected_rule_payload.get('selected_rmse', None) + + acc_score = float(selected_score) if selected_score is not None else None + + st.caption( + f"Showing detail for Rule {selected_rule_payload.get('rule_index', '?')} | " + f"Column: {selected_column or 'N/A'} | Method: {selected_method}" + ) + st.markdown("---") + else: + selected_rule_payload = {} + selected_method = 'Reference Mapping' + selected_meta = {} + selected_column = cfg.get('target') + selected_score = None + selected_checked = int(cfg.get('acc_checked', 0) or 0) + selected_accurate = int(cfg.get('acc_count', 0) or 0) + selected_inaccurate = int(cfg.get('inacc_count', 0) or 0) + selected_coverage = float(cfg.get('acc_coverage', 0.0) or 0.0) + selected_mae = cfg.get('acc_mae', None) + selected_rmse = cfg.get('acc_rmse', None) + + acc_meta_tmp = selected_meta if isinstance(selected_meta, dict) else {} + acc_task_tmp = selected_rule_payload.get('acc_task_type', 
acc_meta_tmp.get('task', cfg.get('acc_task_type', 'regression'))) + ref_mode_used = cfg.get('acc_ref_mode', 'Column in Dataset') + measured_label = selected_column + reference_label = cfg.get('true') if ref_mode_used == 'Column in Dataset' else cfg.get('acc_ref_value_col') + meas_keys_label = cfg.get('acc_meas_keys') or [] + ref_keys_label = cfg.get('acc_ref_keys') or [] + + if acc_task_tmp not in ('threshold', 'bounds') and acc_meta_tmp.get('task') not in ('threshold', 'bounds'): + with st.container(border=True): + st.subheader("Reference Mapping Context") + st.caption( + "This accuracy result compares the observed value against a mapped reference value, either from the same dataset or from an external reference dataset." + ) + c1, c2, c3 = st.columns(3) + with c1: + st.metric("Reference Mode", ref_mode_used) + with c2: + st.metric("Measured Column", measured_label if measured_label not in (None, '(None)') else 'N/A') + with c3: + st.metric("Reference Column", reference_label if reference_label not in (None, '(None)') else 'N/A') + c4, c5 = st.columns(2) + with c4: + st.metric("Accuracy Task", str(acc_task_tmp).replace('_', ' ').title()) + with c5: + st.metric("Comparison Basis", "Mapped Reference Value") + if ref_mode_used == 'External Reference File': + st.caption( + f"Measured keys: {', '.join(meas_keys_label) if meas_keys_label else 'N/A'} | " + f"Reference keys: {', '.join(ref_keys_label) if ref_keys_label else 'N/A'}" + ) + st.markdown("---") + + # --------------------------------------------------------- + # Threshold (Min/Max) Accuracy detail (proxy) + # --------------------------------------------------------- + if acc_task_tmp == 'threshold' or acc_meta_tmp.get('task') == 'threshold': + st.caption("Threshold method (proxy): this score is based on whether each observed value stays within the configured minimum/maximum range.") + + meas_col = selected_column + if meas_col in (None, '(None)') or meas_col not in df.columns: + st.warning("Measured Value 
Column is missing. Re-run analysis and select a valid measured value column.") + st.stop() + + vmin = float(acc_meta_tmp.get('min', 0.0) or 0.0) + vmax = float(acc_meta_tmp.get('max', 1.0) or 1.0) + inclusive = bool(acc_meta_tmp.get('inclusive', True)) + + y = pd.to_numeric(df[meas_col], errors='coerce') + try: + y = y.replace(list(SENTINEL_MISSING_VALUES), np.nan) + except Exception: + pass + + valid = y.notna() + checked = int(valid.sum()) + + if checked == 0: + st.warning("No valid numeric values found in the selected column. Threshold Accuracy is N/A.") + st.stop() + + if inclusive: + ok_mask = (y[valid] >= vmin) & (y[valid] <= vmax) + else: + ok_mask = (y[valid] > vmin) & (y[valid] < vmax) + + ok_cnt = int(ok_mask.sum()) + bad_cnt = int(checked - ok_cnt) + + coverage = cfg.get('acc_coverage', None) + if coverage is None: + coverage = float(checked / len(df)) if len(df) else 0.0 + + # KPIs + k1, k2, k3, k4, k5, k6 = st.columns(6) + with k1: + st.container(border=True).metric("Checked Values", f"{checked:,}") + with k2: + st.container(border=True).metric("In-Range", f"{ok_cnt:,}", f"{(ok_cnt/checked*100):.2f}%") + with k3: + st.container(border=True).metric("Out-of-Range", f"{bad_cnt:,}", f"-{(bad_cnt/checked*100):.2f}%", delta_color="inverse") + with k4: + st.container(border=True).metric("Min Threshold", f"{vmin:.4g}") + with k5: + st.container(border=True).metric("Max Threshold", f"{vmax:.4g}") + with k6: + st.container(border=True).metric("Coverage", f"{float(coverage)*100:.2f}%", help="Valid numeric values / total rows") + + st.caption( + "Threshold method (proxy): this mode does not compare against a reference value. " + "It evaluates whether measured values fall inside the configured acceptable range." + ) + st.write( + f"{ok_cnt:,} of {checked:,} checked values are inside the configured range, " + f"and {bad_cnt:,} values are outside it." 
+ ) + + donut_df = pd.DataFrame({ + "Status": ["In-Range", "Out-of-Range"], + "Count": [ok_cnt, bad_cnt], + }) + fig_donut = px.pie( + donut_df, + names="Status", + values="Count", + hole=0.6, + title="In-Range vs Out-of-Range", + ) + fig_donut.update_traces(textinfo="label+percent") + fig_donut.update_layout(height=340, margin=dict(l=0, r=0, t=40, b=0)) + st.plotly_chart(fig_donut, use_container_width=True) + + st.markdown("---") + + # Histogram + threshold lines + st.subheader("Value Distribution") + dist_df = pd.DataFrame({meas_col: y[valid].astype(float)}) + fig = px.histogram(dist_df, x=meas_col, nbins=40) + try: + fig.add_vline(x=vmin, line_dash='dash') + fig.add_vline(x=vmax, line_dash='dash') + except Exception: + pass + fig.update_layout(height=380, margin=dict(l=0, r=0, t=10, b=0)) + st.plotly_chart(fig, use_container_width=True) + st.caption("Dashed lines indicate the configured min/max thresholds.") + + st.markdown("---") + + # Out-of-range samples + bad_idx = y[valid].index[~ok_mask] + if len(bad_idx) == 0: + st.subheader("Violation Details") + st.success("No out-of-range values detected.") + st.subheader("Out-of-Range Samples") + st.success("No out-of-range values detected.") + else: + sample_df = df.loc[bad_idx].copy() + if meas_col in sample_df.columns: + cols_order = [meas_col] + [c for c in sample_df.columns if c != meas_col] + sample_df = sample_df[cols_order] + + st.subheader("Violation Details") + obs = pd.to_numeric(sample_df[meas_col], errors="coerce") + violation_detail_df = pd.DataFrame({ + "Row Index": sample_df.index, + "Observed Value": obs, + "Expected Range": [f"[{vmin:.4g}, {vmax:.4g}]"] * len(sample_df), + "Violation Type": np.where(obs < vmin, "Below Min", "Above Max"), + "Distance to Min": obs - vmin, + "Distance to Max": obs - vmax, + }) + st.dataframe(violation_detail_df.head(50), use_container_width=True) + st.caption("This table highlights why each value failed the configured threshold rule.") + + st.subheader("Out-of-Range 
Samples") + st.dataframe(sample_df.head(50), use_container_width=True) + st.caption("Showing up to 50 violating rows.") + + st.stop() + + # --------------------------------------------------------- + # Bounds (k-sigma) Accuracy detail (proxy) + # --------------------------------------------------------- + acc_meta2 = acc_meta_tmp + acc_task2 = acc_task_tmp + + if acc_task2 == 'bounds' or acc_meta2.get('task') == 'bounds': + st.caption( + "Bounds method (proxy): this mode does not compare against a reference value. " + "It evaluates whether measured values fall inside the statistically derived acceptable range." + ) + st.caption("Bounds mode reports the percentage of valid numeric values that fall inside the statistically derived range [center - k·std, center + k·std].") + + meas_col = selected_column + if meas_col in (None, '(None)') or meas_col not in df.columns: + st.warning("Measured Value Column is missing. Re-run analysis and select a valid measured value column.") + st.stop() + + y = pd.to_numeric(df[meas_col], errors='coerce') + try: + y = y.replace(list(SENTINEL_MISSING_VALUES), np.nan) + except Exception: + pass + + valid = y.notna() + checked = int(valid.sum()) + if checked == 0: + st.warning("No valid numeric values found in the selected column. 
Bounds Accuracy is N/A.") + st.stop() + + k = float(acc_meta2.get('k', 3.0) or 3.0) + center_mode = str(acc_meta2.get('center_mode', 'mean')) + center = float(acc_meta2.get('center', np.nan)) + std = float(acc_meta2.get('std', np.nan)) + low = float(acc_meta2.get('low', np.nan)) + high = float(acc_meta2.get('high', np.nan)) + + # If meta missing, recompute quickly + yv = y[valid].astype(float) + if not np.isfinite(center): + center = float(yv.median()) if center_mode == 'median' else float(yv.mean()) + if not np.isfinite(std): + std = float(yv.std(ddof=0)) + if not np.isfinite(low) or not np.isfinite(high): + low = center - k * std + high = center + k * std + + ok_mask = (yv >= low) & (yv <= high) + ok_cnt = int(ok_mask.sum()) + bad_cnt = int(checked - ok_cnt) + + coverage = cfg.get('acc_coverage', None) + if coverage is None: + coverage = float(checked / len(df)) if len(df) else 0.0 + + k1, k2, k3, k4, k5, k6, k7 = st.columns(7) + with k1: + st.container(border=True).metric("Checked Values", f"{checked:,}") + with k2: + st.container(border=True).metric("In-Range", f"{ok_cnt:,}", f"{(ok_cnt/checked*100):.2f}%") + with k3: + st.container(border=True).metric("Out-of-Range", f"{bad_cnt:,}", f"-{(bad_cnt/checked*100):.2f}%", delta_color="inverse") + with k4: + st.container(border=True).metric("Center", f"{center:.4g}") + with k5: + st.container(border=True).metric("Std Dev", f"{std:.4g}") + with k6: + st.container(border=True).metric("Lower Bound", f"{low:.4g}") + with k7: + st.container(border=True).metric("Upper Bound", f"{high:.4g}") + + k8, k9 = st.columns(2) + with k8: + st.container(border=True).metric("k (sigma)", f"{k:.4g}") + with k9: + st.container(border=True).metric("Coverage", f"{float(coverage)*100:.2f}%", help="Valid numeric values / total rows") + + st.caption( + f"Center mode: {center_mode} | Statistical range: [{low:.4g}, {high:.4g}]" + ) + st.write( + f"{ok_cnt:,} of {checked:,} checked values are inside the statistically derived range, " + f"and 
{bad_cnt:,} values are outside it." + ) + + donut_df = pd.DataFrame({ + "Status": ["In-Range", "Out-of-Range"], + "Count": [ok_cnt, bad_cnt], + }) + fig_donut = px.pie( + donut_df, + names="Status", + values="Count", + hole=0.6, + title="In-Range vs Out-of-Range", + ) + fig_donut.update_traces(textinfo="label+percent") + fig_donut.update_layout(height=340, margin=dict(l=0, r=0, t=40, b=0)) + st.plotly_chart(fig_donut, use_container_width=True) + + st.markdown("---") + st.subheader("Value Distribution") + dist_df = pd.DataFrame({meas_col: yv}) + fig = px.histogram(dist_df, x=meas_col, nbins=40) + try: + fig.add_vline(x=low, line_dash='dash') + fig.add_vline(x=high, line_dash='dash') + fig.add_vline(x=center, line_dash='dot') + except Exception: + pass + fig.update_layout(height=380, margin=dict(l=0, r=0, t=10, b=0)) + st.plotly_chart(fig, use_container_width=True) + st.caption("Dashed lines indicate the lower/upper statistical bounds. The dotted line indicates the selected center.") + + st.markdown("---") + bad_idx = yv.index[~ok_mask] + if len(bad_idx) == 0: + st.subheader("Violation Details") + st.success("No out-of-range values detected.") + st.subheader("Out-of-Range Samples") + st.success("No out-of-range values detected.") + else: + sample_df = df.loc[bad_idx].copy() + if meas_col in sample_df.columns: + cols_order = [meas_col] + [c for c in sample_df.columns if c != meas_col] + sample_df = sample_df[cols_order] + + st.subheader("Violation Details") + obs = pd.to_numeric(sample_df[meas_col], errors="coerce") + violation_detail_df = pd.DataFrame({ + "Row Index": sample_df.index, + "Observed Value": obs, + "Expected Range": [f"[{low:.4g}, {high:.4g}]"] * len(sample_df), + "Violation Type": np.where(obs < low, "Below Lower Bound", "Above Upper Bound"), + "Distance to Low": obs - low, + "Distance to High": obs - high, + }) + st.dataframe(violation_detail_df.head(50), use_container_width=True) + st.caption("This table highlights why each value failed the configured 
statistical-bounds rule.") + + st.subheader("Out-of-Range Samples") + st.dataframe(sample_df.head(50), use_container_width=True) + st.caption("Showing up to 50 violating rows.") + + st.stop() + + # Use results computed during Run (standard-aligned) + total_checked = selected_checked + acc_count = selected_accurate + inacc_count = selected_inaccurate + mae = selected_mae + rmse = selected_rmse + acc_meta = acc_meta_tmp + acc_task = acc_meta.get('task', acc_task_tmp) + + # Build comparison pairs for both in-dataset and external-reference modes. + + ref_df = st.session_state.get('acc_ref_df') + pair_payload = build_accuracy_rule_pair_payload( + df=df, + cfg=cfg, + selected_rule_payload=selected_rule_payload, + ref_df=ref_df, + ) + + if pair_payload.get('acc_pair_payload_status') != 'ok': + st.warning(pair_payload.get('acc_pair_payload_reason', 'Accuracy details could not be computed.')) + st.stop() + + key_cols_for_view = pair_payload.get('acc_pair_key_cols_for_view') or [] + + if acc_task == 'classification': + total_checked = int(pair_payload.get('acc_checked_pairs', 0) or 0) + acc_count = int(pair_payload.get('acc_accurate_pairs', 0) or 0) + inacc_count = int(pair_payload.get('acc_inaccurate_pairs', 0) or 0) + coverage = float(pair_payload.get('acc_pair_coverage', 0.0) or 0.0) + ref_distribution_df = pd.DataFrame(pair_payload.get('acc_reference_distribution') or []) + mismatch_df = pd.DataFrame(pair_payload.get('acc_mismatch_records') or []) + confusion_matrix = pair_payload.get('acc_confusion_matrix') + + k1, k2, k3, k4, k5 = st.columns(5) + with k1: + st.container(border=True).metric("Checked Pairs", f"{total_checked:,}") + with k2: + st.container(border=True).metric("Accuracy Rate", f"{(acc_count/total_checked*100):.2f}%" if total_checked else "0.00%") + with k3: + st.container(border=True).metric("Error Rate", f"{(inacc_count/total_checked*100):.2f}%" if total_checked else "0.00%", delta_color="inverse") + with k4: + st.container(border=True).metric("Accurate", 
f"{acc_count:,}") + with k5: + st.container(border=True).metric("Coverage", f"{coverage*100:.2f}%", help="Matched pairs / measured rows") + + st.caption( + f"Checked pairs: {total_checked:,} | Accurate: {acc_count:,} | Inaccurate: {inacc_count:,} | Coverage: {coverage*100:.2f}%" + ) + + st.markdown("---") + col_donut, col_bar = st.columns([1, 1], gap="medium") + with col_donut: + st.subheader("Accuracy Rate vs Error Rate") + fig_cls = px.pie( + values=[acc_count, inacc_count], + names=['Accurate', 'Inaccurate'], + hole=0.6, + ) + fig_cls.update_layout(height=340, margin=dict(l=0, r=0, t=10, b=0)) + st.plotly_chart(fig_cls, use_container_width=True) + st.caption("Proportion of records whose observed value matches the reference value.") + with col_bar: + st.subheader("Reference Value Distribution") + if ref_distribution_df.empty: + st.info("No reference-value distribution is available.") + else: + fig_bar = px.bar(ref_distribution_df.head(15), x='Reference', y='Count') + fig_bar.update_layout(height=340, margin=dict(l=0, r=0, t=10, b=0), xaxis_title='Reference', yaxis_title='Count') + st.plotly_chart(fig_bar, use_container_width=True) + st.caption("Top reference labels among checked pairs.") + + if confusion_matrix: + st.markdown("---") + st.subheader("Binary Classification Summary") + cm_df = pd.DataFrame( + confusion_matrix.get('data', []), + index=confusion_matrix.get('rows', []), + columns=confusion_matrix.get('columns', []), + ) + st.dataframe(cm_df, use_container_width=True) + + st.markdown("---") + st.subheader("Observed vs Reference Mismatch Records") + if mismatch_df.empty: + st.success("No mismatched records detected.") + else: + st.dataframe(mismatch_df, use_container_width=True) + st.caption("Showing up to 100 records where the observed value does not match the mapped reference value.") + st.stop() + + total_checked = int(pair_payload.get('acc_checked_pairs', 0) or 0) + pair_acc_count = int(pair_payload.get('acc_accurate_pairs', 0) or 0) + 
pair_inacc_count = int(pair_payload.get('acc_inaccurate_pairs', 0) or 0) + coverage = float(pair_payload.get('acc_pair_coverage', 0.0) or 0.0) + mae = pair_payload.get('acc_pair_mae', mae) + rmse = pair_payload.get('acc_pair_rmse', rmse) + tol_type = pair_payload.get('acc_pair_tol_type', cfg.get('acc_tol_type', 'absolute')) + tol_value = float(pair_payload.get('acc_pair_tol_value', cfg.get('acc_tol_value', 0.0)) or 0.0) + trace_df = pd.DataFrame(pair_payload.get('acc_trace_records') or []) + residual_values = pair_payload.get('acc_residual_values') or [] + agree_df = pd.DataFrame(pair_payload.get('acc_agreement_points') or []) + error_df = pd.DataFrame(pair_payload.get('acc_error_records') or []) + calibration_caption = pair_payload.get('acc_calibration_caption') + + if tol_value > 0: + acc_count = pair_acc_count + inacc_count = pair_inacc_count + + kpi1, kpi2, kpi3, kpi4, kpi5, kpi6 = st.columns(6) + with kpi1: + st.container(border=True).metric("Checked Pairs", f"{total_checked:,}") + with kpi2: + st.container(border=True).metric("Accurate", f"{acc_count:,}", f"{(acc_count/total_checked*100):.1f}%" if total_checked and tol_value > 0 else None) + with kpi3: + st.container(border=True).metric("Inaccurate", f"{inacc_count:,}", f"-{(inacc_count/total_checked*100):.1f}%" if total_checked and tol_value > 0 else None, delta_color="inverse") + with kpi4: + st.container(border=True).metric("Coverage", f"{coverage*100:.2f}%", help="Matched pairs / measured rows") + with kpi5: + st.container(border=True).metric("MAE", f"{float(mae):.4f}" if mae is not None else 'N/A') + with kpi6: + st.container(border=True).metric("RMSE", f"{float(rmse):.4f}" if rmse is not None else 'N/A') + + st.caption( + f"Checked pairs: {total_checked:,} | Accurate: {acc_count:,} | Inaccurate: {inacc_count:,} | Coverage: {coverage*100:.2f}%" + ) + + acc_mode = cfg.get('acc_mode', 'Direct Compare') + if tol_value > 0: + st.caption(f"Comparison Mode: {acc_mode} | Tolerance: {tol_type} = {tol_value} | 
Coverage: {coverage*100:.2f}%") + else: + st.caption(f"Comparison Mode: {acc_mode} | Coverage: {coverage*100:.2f}%") + + if tol_value > 0: + st.subheader("Accuracy Rate vs Error Rate") + fig_acc_rate = px.pie( + values=[acc_count, inacc_count], + names=['Within Tolerance', 'Outside Tolerance'], + hole=0.6, + ) + fig_acc_rate.update_layout(height=320, margin=dict(l=0, r=0, t=10, b=0)) + st.plotly_chart(fig_acc_rate, use_container_width=True) + st.caption("This donut chart summarizes how many checked pairs fall within the configured tolerance versus outside it.") + st.markdown("---") + else: + st.info("No tolerance is configured, so regression accuracy is summarized primarily through MAE, RMSE, residual distribution, and the observed vs reference error table below.") + st.markdown("---") + + if calibration_caption: + st.caption(calibration_caption) + + st.markdown("---") + st.subheader("Time-series Trace Analysis") + col_ts, col_hist = st.columns([2, 1], gap="medium") + + with col_ts: + if trace_df.empty: + st.info("No valid records to plot.") + else: + fig_ts = go.Figure() + fig_ts.add_trace(go.Scatter( + x=trace_df['x'], + y=trace_df['Measured_or_Calibrated'], + mode='lines', + name='Measured (Calibrated)' if acc_mode == 'Calibrated Compare (Linear)' else 'Measured', + connectgaps=False, + )) + fig_ts.add_trace(go.Scatter( + x=trace_df['x'], + y=trace_df['Reference'], + mode='lines', + name='Reference', + line=dict(dash='dash'), + connectgaps=False, + )) + fig_ts.update_layout( + height=380, + margin=dict(l=0, r=0, t=10, b=0), + legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1) + ) + st.plotly_chart(fig_ts, use_container_width=True) + st.caption(f"Detailed comparison of the first {len(trace_df):,} checked pairs, sorted by time or row order, so observed and reference traces can be compared directly.") + + with col_hist: + if not residual_values: + st.info("No residual distribution is available.") + else: + fig_res = px.histogram( + 
x=residual_values, + nbins=25, + labels={'x': 'Residual (Measured - Reference)', 'count': 'count'}, + ) + fig_res.update_layout(height=380, margin=dict(l=0, r=0, t=10, b=0)) + st.plotly_chart(fig_res, use_container_width=True) + st.caption("Distribution of residuals, where residual = measured (or calibrated) value minus reference value.") + + st.subheader("Agreement Plot") + st.caption("Scatter plot to visualize agreement between the measured value (or calibrated value) and the reference value.") + + if agree_df.empty: + st.info("Agreement plot is N/A (no valid numeric pairs).") + else: + fig_agree = go.Figure() + fig_agree.add_trace(go.Scatter( + x=agree_df['Reference'], + y=agree_df['Measured_or_Calibrated'], + mode='markers', + name='pairs', + marker=dict(size=6, opacity=0.5) + )) + + vmin = float(np.nanmin([agree_df['Reference'].min(), agree_df['Measured_or_Calibrated'].min()])) + vmax = float(np.nanmax([agree_df['Reference'].max(), agree_df['Measured_or_Calibrated'].max()])) + fig_agree.add_trace(go.Scatter( + x=[vmin, vmax], + y=[vmin, vmax], + mode='lines', + name='y = x', + line=dict(dash='dash') + )) + + fig_agree.update_layout( + height=520, + margin=dict(l=0, r=0, t=10, b=0), + xaxis_title='Reference', + yaxis_title='Measured (or Calibrated)' + ) + st.plotly_chart(fig_agree, use_container_width=True) + st.caption("The dashed line indicates perfect agreement (y = x); points farther from the line represent larger disagreement between observed and reference values.") + + st.markdown("---") + st.subheader("Observed vs Reference Error Records") + if error_df.empty: + st.success("No inaccurate records detected under the current tolerance setting.") + else: + st.dataframe(error_df, use_container_width=True) + st.caption("Showing up to 100 records with the largest disagreement between the observed value and the reference value, including residual and absolute error details.") + + + + # --------------------------------------------------------- + # [CASE 3] 
Consistency 상세 분석 (고도화 버전) + # --------------------------------------------------------- + elif st.session_state['view_mode'] == 'detail_cons': + # 상단 네비게이션 + col_nav, col_empty = st.columns([1, 5]) + with col_nav: + if st.button("⬅️ Back to Analysis", use_container_width=True): + st.session_state['view_mode'] = 'summary'; st.rerun() + + overall_cons_score = res.get('Consistency', None) + cons_score = overall_cons_score + cons_rule_results = cfg.get('cons_rule_results') or [] + cons_configured_rules_table = cfg.get('cons_configured_rules_table') or [] + cons_detail_rule_options = cfg.get('cons_detail_rule_options') or [] + cons_detail_rule_payloads = cfg.get('cons_detail_rule_payloads') or [] + selected_cons_rule = None + + overall_cons_passed = int(cfg.get('cons_passed_total', 0) or 0) + overall_cons_violations = int(cfg.get('cons_violations_total', 0) or 0) + overall_cons_evaluated = int(cfg.get('cons_checked_total', overall_cons_passed + overall_cons_violations) or 0) + + score_label = f"{float(overall_cons_score):.1f}%" if overall_cons_score is not None else "N/A" + st.markdown(f""" +
+

Consistency Analysis

+ {score_label} + +
+

Overall consistency is aggregated across all configured consistency rules using a micro-average based on evaluated cases.

+ """, unsafe_allow_html=True) + st.markdown("---") + + k1, k2, k3, k4 = st.columns(4) + with k1: + st.container(border=True).metric("Overall Consistency", score_label) + with k2: + st.container(border=True).metric("Total Evaluated", f"{overall_cons_evaluated:,}") + with k3: + st.container(border=True).metric("Total Passed", f"{overall_cons_passed:,}") + with k4: + st.container(border=True).metric("Total Violations", f"{overall_cons_violations:,}") + + st.caption("Overall consistency is computed as Σ passed / Σ evaluated × 100 across all valid configured consistency rules.") + st.markdown("---") + + if cons_rule_results: + cons_rules_df = pd.DataFrame(cons_configured_rules_table) + st.subheader("Configured Consistency Rules") + st.dataframe(cons_rules_df, use_container_width=True) + st.markdown("---") + + st.markdown("### Consistency Rule Selection") + selected_label = st.selectbox( + "Select Consistency Rule to View", + cons_detail_rule_options, + index=max(len(cons_detail_rule_options) - 1, 0), + key="detail_cons_rule_selector", + help="Choose which configured consistency rule to inspect in detail." 
+ ) + selected_rule_payload = next( + (p for p in cons_detail_rule_payloads if p.get('label') == selected_label), + cons_detail_rule_payloads[-1] if cons_detail_rule_payloads else {} + ) + selected_cons_rule = next( + (r for r in cons_rule_results if r.get('rule_index') == selected_rule_payload.get('rule_index')), + cons_rule_results[-1] + ) + + selected_mode = selected_rule_payload.get('selected_mode', cfg.get('cons_mode_used', cfg.get('cons_mode', 'Expression'))) + selected_column = selected_rule_payload.get('selected_column', cfg.get('cons_format_col_used', None)) + selected_score = selected_rule_payload.get('selected_score', None) + selected_violation_count = int(selected_rule_payload.get('selected_violation_count', 0) or 0) + selected_match_count = int(selected_rule_payload.get('selected_match_count', 0) or 0) + selected_rule_used = selected_rule_payload.get('selected_rule_used', cfg.get('cons_rule_used', cfg.get('rule', ''))) + selected_message = selected_rule_payload.get('selected_message', None) + selected_violation_sample = selected_rule_payload.get('selected_violation_sample', []) or [] + selected_match_sample = selected_rule_payload.get('selected_match_sample', []) or [] + selected_regex_pattern = selected_rule_payload.get('selected_regex_pattern', None) + selected_allow_empty = bool(selected_rule_payload.get('selected_allow_empty', True)) + selected_cons_mode_used = selected_rule_payload.get('cons_mode_used', cfg.get('cons_mode_used', cfg.get('cons_mode', 'expression'))) + selected_cons_format_used = selected_rule_payload.get('cons_format_used', None) + + cons_score = float(selected_score) if selected_score is not None else None + + if selected_message and selected_score is None: + st.warning(f"Selected rule status: {selected_message}") + + st.caption( + f"Showing detail for Rule {selected_rule_payload.get('rule_index', '?')} | " + f"Column: {selected_column or 'N/A'} | Mode: {selected_mode}" + ) + st.markdown("---") + else: + selected_rule_payload = {} 
+ selected_mode = cfg.get('cons_mode_used', cfg.get('cons_mode', 'Expression')) + selected_column = cfg.get('cons_format_col_used', None) + selected_score = None + selected_violation_count = int(cfg.get('cons_violation_count', 0) or 0) + selected_match_count = int(cfg.get('cons_match_count', 0) or 0) + selected_rule_used = cfg.get('cons_rule_used', cfg.get('rule', '')) + selected_message = None + selected_violation_sample = cfg.get('cons_violation_sample', []) or [] + selected_match_sample = cfg.get('cons_match_sample', []) or [] + selected_regex_pattern = cfg.get('cons_regex_pattern_used', None) + selected_allow_empty = bool(cfg.get('cons_allow_empty_used', True)) + selected_cons_mode_used = cfg.get('cons_mode_used', cfg.get('cons_mode', 'expression')) + selected_cons_format_used = cfg.get('cons_format_used', None) + + cons_mode_used = selected_cons_mode_used + total_recs = int(cfg.get('cons_total_rows', len(df)) or 0) + v_count = selected_violation_count + match_count = selected_match_count if selected_match_count > 0 else max(total_recs - v_count, 0) + violation_sample = pd.DataFrame(selected_violation_sample or []) + match_sample = pd.DataFrame(selected_match_sample or []) + regex_pattern_used = selected_regex_pattern + format_used = selected_cons_format_used + format_col_used = selected_column + allow_empty_used = selected_allow_empty + rule_used = selected_rule_used + passed_count = match_count if match_count > 0 else max(total_recs - v_count, 0) + if cons_mode_used == 'format': + mode_label = "Regex / Format Check" + elif cons_mode_used == 'referential_integrity': + mode_label = "Referential Integrity Check" + elif cons_mode_used == 'rule_builder': + mode_label = "Rule Builder Check" + else: + mode_label = "Logical Check" + rule_label = regex_pattern_used if cons_mode_used == 'format' else rule_used + if not rule_label: + rule_label = "N/A" + + kpi1, kpi2, kpi3, kpi4 = st.columns(4) + with kpi1: + st.container(border=True).metric("Applied Mode", mode_label) 
+ with kpi2: + st.container(border=True).metric("Passed Records", f"{passed_count:,}") + with kpi3: + st.container(border=True).metric( + "Violations", + f"{v_count:,}", + f"{(v_count/total_recs)*100:.1f}%" if total_recs else "0.0%", + delta_color="inverse" + ) + with kpi4: + if cons_mode_used == 'format': + st.container(border=True).metric("Validated Column", f"{format_col_used or 'N/A'}") + elif cons_mode_used == 'referential_integrity': + st.container(border=True).metric("Rule Type", "Referential Integrity") + elif cons_mode_used == 'rule_builder': + st.container(border=True).metric("Rule Type", "Rule Builder") + else: + st.container(border=True).metric("Rule Type", "Expression") + + st.caption( + "Consistency score is computed as the percentage of rows that satisfy the selected logical rule or format/regex validation." + ) + + col_left, col_right = st.columns([2, 1], gap="medium") + + with col_left: + st.subheader("Consistency Compliance Overview") + fig_pie = px.pie( + values=[passed_count, v_count], + names=['Passed', 'Violated'], + color=['Passed', 'Violated'], + color_discrete_map={'Passed': '#10b981', 'Violated': '#ef4444'}, + hole=0.6 + ) + fig_pie.update_layout(height=320, margin=dict(l=0, r=0, t=10, b=0)) + st.plotly_chart(fig_pie, use_container_width=True) + st.caption("Share of records that passed or violated the selected consistency rule.") + + st.subheader("Violation Rate by Applied Rule") + pass_rate = float((passed_count / total_recs) * 100) if total_recs else 0.0 + violation_rate = float((v_count / total_recs) * 100) if total_recs else 0.0 + rule_name = format_used if cons_mode_used == 'format' else rule_used + if not rule_name: + rule_name = mode_label + rule_label_short = str(rule_name) + if len(rule_label_short) > 40: + rule_label_short = rule_label_short[:40] + "..." 
+ + rule_rate_df = pd.DataFrame({ + 'Rule': [rule_label_short, rule_label_short], + 'Metric': ['Pass Rate', 'Violation Rate'], + 'Rate': [pass_rate, violation_rate], + }) + fig_rule_rate = px.bar( + rule_rate_df, + x='Rate', + y='Metric', + orientation='h', + text=rule_rate_df['Rate'].map(lambda v: f"{v:.1f}%"), + color='Metric', + color_discrete_map={'Pass Rate': '#10b981', 'Violation Rate': '#ef4444'} + ) + fig_rule_rate.update_layout( + height=260, + margin=dict(l=0, r=0, t=10, b=0), + xaxis_title='Rate (%)', + yaxis_title=None, + showlegend=False, + xaxis_range=[0, 100] + ) + fig_rule_rate.update_traces(textposition='outside', cliponaxis=False) + st.plotly_chart(fig_rule_rate, use_container_width=True) + st.caption("This chart shows the pass rate and violation rate for the currently applied consistency rule.") + + st.markdown("---") + + if cons_mode_used in ('format', 'referential_integrity'): + st.subheader("Matching / Valid Samples") + if match_sample.empty: + st.info("No matching samples are available for this format/regex validation.") + else: + match_display_df = match_sample.copy() + if format_col_used not in (None, '(None)', 'N/A') and format_col_used in match_display_df.columns: + match_display_df = match_display_df[[format_col_used]].copy() + st.dataframe(match_display_df.head(50), use_container_width=True) + st.caption("Showing up to 50 records that satisfy the currently applied consistency rule.") + + st.subheader("Violating / Inconsistent Samples") + if violation_sample.empty: + st.success("✨ Excellent! 
No records violate the currently applied consistency rule.") + else: + violation_display_df = violation_sample.copy() + if format_col_used not in (None, '(None)', 'N/A') and format_col_used in violation_display_df.columns: + violation_display_df = violation_display_df[[format_col_used]].copy() + st.dataframe(violation_display_df.head(50), use_container_width=True) + st.caption(f"Showing up to 50 records from {v_count:,} violating rows.") + else: + st.subheader("Logical Violation List") + if violation_sample.empty: + st.success("✨ Excellent! No records violate the defined consistency rule.") + else: + st.dataframe(violation_sample.head(100), use_container_width=True) + st.caption(f"Showing the first {min(100, len(violation_sample))} out of {v_count:,} violating records.") + + with col_right: + st.subheader("Rule Definition") + if cons_mode_used == 'format': + st.info( + f"**Validation Column:** `{format_col_used or 'N/A'}`\n\n" + f"**Preset:** `{format_used or 'N/A'}`\n\n" + f"**Regex Pattern:** `{regex_pattern_used or 'N/A'}`" + ) + st.caption( + "Allow empty values: enabled (empty/null → PASS)" + if allow_empty_used else + "Allow empty values: disabled (empty/null → violation)" + ) + st.markdown( + "Format/regex validation checks whether each value matches the selected preset or custom regular expression." + ) + elif cons_mode_used == 'referential_integrity': + st.info( + f"**Source Column:** `{format_col_used or 'N/A'}`\n\n" + f"**Rule Type:** `Referential Integrity`\n\n" + f"**Rule:** `{rule_used or 'N/A'}`" + ) + st.caption( + "Allow empty values: enabled (empty/null → PASS)" + if allow_empty_used else + "Allow empty values: disabled (empty/null → violation)" + ) + st.markdown( + "Referential integrity validation checks whether each source key exists in the uploaded reference dataset." 
+ ) + else: + st.info(f"**Current Rule:**\n\n`{rule_used}`") + st.markdown( + "Logical consistency rules validate business constraints such as range checks, cross-column logic, or conditional dependencies." + ) + + st.markdown("---") + st.markdown("**Validation Status**") + if cons_score is None: + st.info("Consistency could not be computed for the current configuration.") + elif float(cons_score) == 100: + st.success("✅ Perfect Consistency: The dataset fully complies with the selected rule.") + elif float(cons_score) >= 90: + st.warning("⚠️ Minor Inconsistency: Some records deviate from the selected rule.") + else: + st.error("🚨 Critical Violation: Significant consistency issues were detected.") + + st.markdown("---") + st.subheader("Metric Details") + d1, d2 = st.columns(2) + with d1: + st.container(border=True).metric("Total Rows", f"{total_recs:,}") + with d2: + st.container(border=True).metric("Match Count", f"{match_count:,}") + st.container(border=True).metric("Violation Count", f"{v_count:,}") + st.container(border=True).metric("Violation Rate", f"{((v_count / total_recs) * 100):.2f}%" if total_recs else "0.00%") + if cons_mode_used == 'format' and regex_pattern_used: + st.caption("The stored regex pattern below was used to determine pass/fail status for the selected column.") + st.code(str(regex_pattern_used), language='regex') + + st.markdown("---") + st.subheader("Recommended Actions") + with st.container(border=True): + if cons_mode_used == 'format': + st.markdown( + "🛡️ **Action 1**: Review source-system validation so values in the selected column conform to the intended regex/format rule.\n\n" + "🔍 **Action 2**: Inspect violating rows to determine whether the issue is caused by parsing, encoding, or manual entry errors." 
+ ) + elif cons_mode_used == 'referential_integrity': + st.markdown( + "🛡️ **Action 1**: Review foreign-key or identifier relationships so source keys are always registered in the reference dataset before ingestion.\n\n" + "🔍 **Action 2**: Inspect violating rows to identify missing master-data records, synchronization issues, or mapping mismatches between source and reference datasets." + ) + else: + st.markdown( + f"🛡️ **Action 1**: Strengthen source-level validation by enforcing the consistency rule: `{rule_used}` to prevent abnormal data entry.\n\n" + "🔍 **Action 2**: Investigate the records listed in the violation table to identify potential data corruption or rule violations during transmission/ingestion." + ) + + # --------------------------------------------------------- + # [CASE 4] Timeliness 상세 분석 (고도화 버전) + # --------------------------------------------------------- + elif st.session_state['view_mode'] == 'detail_time': + # 상단 네비게이션 + col_nav, col_empty = st.columns([1, 5]) + with col_nav: + if st.button("⬅️ Back to Analysis", use_container_width=True): + st.session_state['view_mode'] = 'summary'; st.rerun() + + time_val = res.get('Timeliness', None) + time_details = cfg if isinstance(cfg, dict) else {} + time_status = time_details.get('time_status', 'ok') + time_na_reason = time_details.get('time_na_reason') + reference_mode = time_details.get('t_reference_mode', 'column') + reference_time = time_details.get('t_reference_time', '') + event_col = time_details.get('t_event', cfg.get('t_event')) + system_col = time_details.get('t_system', cfg.get('t_system')) + mode_label = time_details.get('time_reference_mode_label') or ("System Time Column" if reference_mode == 'column' else "User-defined Reference Time") + basis_label = time_details.get('time_basis_label') or (system_col if reference_mode == 'column' else (reference_time or 'N/A')) + basis_desc = time_details.get('time_basis_desc') or ("System Time Column" if reference_mode == 'column' else "Reference 
Time") + avg_lat = float(time_details.get('time_avg_latency', 0.0) or 0.0) + p95_lat = float(time_details.get('time_p95_latency', 0.0) or 0.0) + max_lat = float(time_details.get('time_max_latency', 0.0) or 0.0) + min_lat = float(time_details.get('time_min_latency', 0.0) or 0.0) + sla = float(time_details.get('time_sla_seconds', 0.0) or 0.0) + valid_pairs = int(time_details.get('time_valid_pairs', 0) or 0) + negative_pairs = int(time_details.get('time_negative_pairs', 0) or 0) + latency_hist_values = time_details.get('time_latency_hist_values') or [] + latency_trend_df = pd.DataFrame(time_details.get('time_latency_trend_records') or []) + diagnostics = time_details.get('time_diagnostics') or {} + latency_describe = time_details.get('time_latency_describe') or {} + + if time_val is None or time_status == 'n/a': + st.markdown(""" +
+

Timeliness Analysis

+ N/A +
+

Timeliness requires valid time inputs for the selected mode.

+ """, unsafe_allow_html=True) + st.info(f"Timeliness = N/A: {time_na_reason or 'Required time inputs are missing or invalid.'}") + st.stop() + + + st.markdown(f""" +
+

Timeliness Analysis

+ {time_val:.2f}% + +
+

Timeliness measures the delay between Event Time and the selected time reference.

+ """, unsafe_allow_html=True) + + st.markdown("---") + + info1, info2, info3 = st.columns(3) + with info1: + st.container(border=True).metric("Reference Mode", mode_label) + with info2: + st.container(border=True).metric("Event Time Column", event_col or 'N/A') + with info3: + st.container(border=True).metric(basis_desc, basis_label if basis_label not in (None, '') else 'N/A') + + + with st.expander("🔎 Timeliness parsing diagnostics", expanded=False): + st.write(diagnostics) + + + kpi1, kpi2, kpi3, kpi4 = st.columns(4) + with kpi1: + st.container(border=True).metric("Average Latency (s)", f"{avg_lat:.4f}s") + with kpi2: + st.container(border=True).metric("SLA Threshold (s)", f"{sla:.4f}s") + with kpi3: + st.container(border=True).metric("95th Percentile (s)", f"{p95_lat:.4f}s") + with kpi4: + st.container(border=True).metric("Max Latency (s)", f"{max_lat:.4f}s", delta_color="inverse") + + kpi5, kpi6, kpi7 = st.columns(3) + with kpi5: + st.container(border=True).metric("Min Latency (s)", f"{min_lat:.4f}s") + with kpi6: + st.container(border=True).metric("Valid Pairs", f"{valid_pairs:,}") + with kpi7: + st.container(border=True).metric("Negative Pairs", f"{negative_pairs:,}") + + # 2. 
메인 분석 영역 + col_left, col_right = st.columns([2, 1], gap="medium") + + with col_left: + st.subheader("⏳ Latency Distribution") + if not latency_hist_values: + st.info("No valid non-negative latency values are available.") + else: + fig_hist = px.histogram( + x=latency_hist_values, + nbins=30, + color_discrete_sequence=['#27ae60'], + labels={'x': 'Latency (Seconds)', 'y': 'Frequency'} + ) + fig_hist.update_layout(height=350, margin=dict(l=0, r=0, t=10, b=0), bargap=0.1) + st.plotly_chart(fig_hist, use_container_width=True) + st.caption("Distribution of the time delay across all analyzed records.") + + st.subheader("Latency Trend over Time") + if latency_trend_df.empty: + st.info("No valid Event Time values are available for the latency trend chart.") + else: + fig_scatter = px.scatter( + latency_trend_df, + x='Event_Time', + y='Latency', + opacity=0.4, + color_discrete_sequence=['#27ae60'] + ) + fig_scatter.update_layout(height=350, margin=dict(l=0, r=0, t=10, b=0)) + st.plotly_chart(fig_scatter, use_container_width=True) + st.caption("Tracking how latency fluctuates throughout the data collection period.") + + with col_right: + st.subheader("Network Status Diagnostics") + if avg_lat < 0.1: + st.success("✅ Real-time: Excellent network performance with minimal delay.") + elif avg_lat < 0.5: + st.warning("⚠️ Near Real-time: Slight delay detected. Monitor network stability.") + else: + st.error("🚨 High Latency: Significant transmission delay. Check gateway load.") + + st.markdown("---") + + st.markdown("**Latency Statistics Summary**") + if not latency_describe: + st.info("No valid non-negative latency values are available.") + else: + st.write(pd.Series(latency_describe)) + + st.markdown("---") + + st.subheader("Recommended Actions") + with st.container(border=True): + st.markdown(""" + 🚀 **Action 1**: In case of network delays, adjust **MQTT QoS settings** or optimize the payload size to reduce transmission time. 
+ + ⚙️ **Action 2**: Verify **gateway performance** and check for potential bottlenecks in the data ingestion stage of the oneM2M server. + """) + + # --------------------------------------------------------- + # [CASE 5] 요약 대시보드 모드 (Main Summary) + # --------------------------------------------------------- + else: + main_avg_score = float(cfg.get('main_avg_score', 0.0) or 0.0) + main_comp_chart_rows = cfg.get('main_comp_chart_rows') or [] + main_comp_columns_total = int(cfg.get('main_comp_columns_total', 0) or 0) + main_comp_worst_n = int(cfg.get('main_comp_worst_n', 0) or 0) + + main_rel_status = cfg.get('main_rel_status', 'ok') + main_rel_score = cfg.get('main_rel_score', None) + main_rel_label = cfg.get('main_rel_label', 'N/A') + + main_uniq_score = cfg.get('main_uniq_score', None) + main_uniq_label = cfg.get('main_uniq_label', 'N/A') + main_uniq_basis = cfg.get('main_uniq_basis', 'all_columns') + main_uniq_duplicate_rows = int(cfg.get('main_uniq_duplicate_rows', 0) or 0) + main_uniq_total_rows = int(cfg.get('main_uniq_total_rows', 0) or 0) + + main_acc_score = cfg.get('main_acc_score', None) + main_acc_label = cfg.get('main_acc_label', 'N/A') + main_acc_preview_rows = cfg.get('main_acc_preview_rows') or [] + main_acc_mode = cfg.get('main_acc_mode', 'Direct Compare') + main_acc_tol_type = cfg.get('main_acc_tol_type', 'absolute') + main_acc_tol_value = cfg.get('main_acc_tol_value', '') + main_acc_na_title = cfg.get('main_acc_na_title', ACC_NA_TITLE) + main_acc_na_body = cfg.get('main_acc_na_body', ACC_NA_BODY) + main_acc_na_cta = cfg.get('main_acc_na_cta', ACC_NA_CTA) + + main_cons_score = cfg.get('main_cons_score', None) + main_cons_violation_count = int(cfg.get('main_cons_violation_count', 0) or 0) + main_cons_error = cfg.get('main_cons_error', None) + + main_time_score = cfg.get('main_time_score', None) + main_time_label = cfg.get('main_time_label', 'N/A') + main_time_hist_values = cfg.get('main_time_hist_values') or [] + main_time_na_title = 
cfg.get('main_time_na_title', TIME_NA_TITLE) + main_time_na_body = cfg.get('main_time_na_body', TIME_NA_BODY) + main_time_na_cta = cfg.get('main_time_na_cta', TIME_NA_CTA) + + main_profile_dtype_rows = cfg.get('main_profile_dtype_rows') or [] + main_profile_heatmap_rows = cfg.get('main_profile_heatmap_rows') or [] + main_profile_heatmap_columns = cfg.get('main_profile_heatmap_columns') or [] + main_profile_heatmap_has_missing = bool(cfg.get('main_profile_heatmap_has_missing', False)) + + h_col1, h_col2 = st.columns([4, 1]) + with h_col1: + st.markdown(f"## {st.session_state['filename']}") + st.caption(f"Assessment Date: {datetime.now().strftime('%Y-%m-%d')}") + with h_col2: + st.markdown(f'
{main_avg_score:.1f}%
', unsafe_allow_html=True) + + st.markdown("---") + m_c1, m_c2 = st.columns([1.2, 1], gap="medium") + + with m_c1: + st.subheader("Metric Details") + + # 1. Completeness 카드 + if 'Completeness' in res: + score = res['Completeness'] + with st.expander(f"🏳️ Completeness ({score:.1f}%)", expanded=True): + comp_view = pd.DataFrame(main_comp_chart_rows) + worst_n = main_comp_worst_n + if comp_view.empty: + st.info("No completeness preview is available.") + else: + chart_h = max(260, 24 * max(worst_n, 1)) + fig_comp = px.bar( + comp_view, + x='Score', + y='Column', + orientation='h', + height=chart_h, + color_discrete_sequence=['#2563eb'], + text='Score Label' if 'Score Label' in comp_view.columns else None, + ) + fig_comp.update_layout( + margin=dict(l=0, r=0, t=0, b=0), + xaxis_title=None, + yaxis_title=None, + xaxis_range=[0, 100] + ) + fig_comp.update_traces(textposition='outside', cliponaxis=False) + st.plotly_chart(fig_comp, use_container_width=True) + st.caption(f"Showing the {worst_n} columns with the lowest completeness (out of {main_comp_columns_total} columns).") + + if st.button("🔍 View Detailed Completeness", key="btn_comp", use_container_width=True): + st.session_state['view_mode'] = 'detail_comp'; st.rerun() + + # 1.5 Reliability 카드 + if 'Reliability' in res: + with st.expander(f"🧪 Reliability (CV) ({main_rel_label})", expanded=True): + if main_rel_score is None or main_rel_status == 'n/a': + st.info("Reliability = N/A: Select a numeric column. 
Reliability is computed as CV = (std/mean)*100.") + else: + st.metric("Coefficient of Variation (CV%)", f"{float(main_rel_score):.4f}%") + st.caption("Lower CV indicates more stable (less variable) measurements.") + if st.button("🔍 View Detailed Reliability", key="btn_rel", use_container_width=True): + st.session_state['view_mode'] = 'detail_rel'; st.rerun() + + # 1.6 Uniqueness 카드 + if 'Uniqueness' in res: + with st.expander(f"🧬 Uniqueness ({main_uniq_label})", expanded=True): + if main_uniq_score is None: + st.info("Uniqueness = N/A: Could not compute duplicates for this dataset.") + else: + st.metric("Uniqueness Score", f"{float(main_uniq_score):.2f}%") + if main_uniq_basis == 'selected_columns': + st.caption(f"Duplicates checked using selected columns. Duplicate rows: {main_uniq_duplicate_rows:,} / {main_uniq_total_rows:,}.") + else: + st.caption("Duplicates checked using ALL columns (full-row duplicates).") + if st.button("🔍 View Detailed Uniqueness", key="btn_uniq", use_container_width=True): + st.session_state['view_mode'] = 'detail_uniq'; st.rerun() + + # 2. 
Accuracy 카드 + if 'Accuracy' in res: + with st.expander(f"🎯 Accuracy ({main_acc_label})", expanded=True): + if main_acc_score is not None: + preview_df = pd.DataFrame(main_acc_preview_rows) + if preview_df.empty: + st.info("No accuracy preview is available.") + else: + fig_acc = go.Figure() + fig_acc.add_trace(go.Scatter( + x=preview_df['index'], + y=preview_df['Measured'], + name='Measured (calibrated)' if main_acc_mode == 'Calibrated Compare (Linear)' else 'Measured', + mode='lines' + )) + fig_acc.add_trace(go.Scatter( + x=preview_df['index'], + y=preview_df['Reference'], + name='Reference', + mode='lines', + line=dict(dash='dash') + )) + fig_acc.update_layout( + height=180, + margin=dict(l=0, r=0, t=0, b=0), + showlegend=False, + yaxis=dict(title=None, showticklabels=False), + xaxis=dict(title=None, showticklabels=False) + ) + st.plotly_chart(fig_acc, use_container_width=True) + st.caption("Preview chart is min-max normalized (0..1) for readability. Detailed view shows real units and error distribution.") + st.caption(f"Mode: {main_acc_mode} | Tolerance: {main_acc_tol_type} {main_acc_tol_value}") + else: + st.info(f"**{main_acc_na_title}**\n\n{main_acc_na_body}\n\n{main_acc_na_cta}") + + if st.button("🔍 View Detailed Accuracy", key="btn_acc", use_container_width=True): + st.session_state['view_mode'] = 'detail_acc'; st.rerun() + + # 3. Consistency 카드 + if 'Consistency' in res: + score = res['Consistency'] + with st.expander(f"📑 Consistency ({score:.1f}%)", expanded=True): + if main_cons_error: + st.error(f"Consistency rule evaluation failed: {main_cons_error}") + st.metric("Total Violations", f"{main_cons_violation_count} rows", delta_color="inverse") + + if st.button("🔍 View Detailed Consistency", key="btn_cons", use_container_width=True): + st.session_state['view_mode'] = 'detail_cons'; st.rerun() + + # 4. 
Timeliness 카드 + if 'Timeliness' in res: + with st.expander(f"⏱️ Timeliness ({main_time_label})", expanded=True): + if main_time_score is None: + st.info(f"**{main_time_na_title}**\n\n{main_time_na_body}\n\n{main_time_na_cta}") + else: + if not main_time_hist_values: + st.info("No timeliness preview is available.") + else: + fig_time = px.histogram(x=main_time_hist_values, nbins=20, color_discrete_sequence=['#27ae60']) + fig_time.update_layout(height=180, margin=dict(l=20, r=0, t=0, b=0), xaxis_title=None, yaxis_title=None) + st.plotly_chart(fig_time, use_container_width=True) + + if st.button("🔍 View Detailed Timeliness", key="btn_time", use_container_width=True): + st.session_state['view_mode'] = 'detail_time'; st.rerun() + + with m_c2: + st.subheader("Data Profile") + with st.container(border=True): + st.markdown("**Data Types**") + dtype_counts = pd.DataFrame(main_profile_dtype_rows) + if dtype_counts.empty: + st.info("No datatype profile is available.") + else: + st.plotly_chart( + px.pie(dtype_counts, values='Count', names='Type', hole=0.6, color_discrete_sequence=px.colors.qualitative.Set2), + use_container_width=True, + ) + + with st.container(border=True): + st.markdown("**Missing Values Heatmap**") + if not main_profile_heatmap_has_missing: + st.success("✨ Perfect! 
No missing values.") + else: + sample = pd.DataFrame(main_profile_heatmap_rows, columns=main_profile_heatmap_columns) + fig_hm = px.imshow(sample, color_continuous_scale=['#dcfce7', '#fee2e2'], aspect='auto') + fig_hm.update_layout(height=250, margin=dict(t=30, b=0, l=0, r=0), coloraxis_showscale=False) + st.plotly_chart(fig_hm, use_container_width=True) + +# ========================================================= +# TAB 3: Reports Management +# ========================================================= +with tab3: + report_avg_score = 0.0 + results_for_report = st.session_state.get('results', {}) or {} + valid_scores_for_report = [v for v in results_for_report.values() if isinstance(v, (int, float)) and v is not None] + if valid_scores_for_report: + report_avg_score = sum(valid_scores_for_report) / len(valid_scores_for_report) + + render_reports_tab( + df=st.session_state.get('dataset'), + avg_score=report_avg_score, + ) diff --git a/data_quality_assessment.py b/data_quality_assessment.py deleted file mode 100644 index 9a6e4219df92b17ab5bf180bdce822c92cbc2c93..0000000000000000000000000000000000000000 --- a/data_quality_assessment.py +++ /dev/null @@ -1,383 +0,0 @@ -# data_quality_assessment.py -import pandas as pd -import great_expectations as gx -from great_expectations.core.batch import RuntimeBatchRequest -import warnings - -# Suppress version-related warning messages from Great Expectations. -warnings.filterwarnings("ignore", category=UserWarning) - -def etsi_completeness(validator, df, required_columns): - """ - ETSI 5.1 Completeness - Accurate implementation of Dataset Completeness - Formula: C_D = (Non-null cells / Total cells) * 100 - """ - print("\n" + "="*60) - print("=== Completeness ===") - print("="*60) - - # 1. 
Validate each column with GX - print("\n[GX Validation]") - gx_results = [] - for column in required_columns: - result = validator.expect_column_values_to_not_be_null(column=column) - gx_results.append(result) - status = '✓ Pass' if result.success else '✗ Fail' - print(f" {column:20s}: {status}") - - # 2. Recalculate precisely using the ETSI formula - N = len(df) # Total number of rows - M_req = len(required_columns) # Number of required columns - total_cells = N * M_req - - non_null_cells = df[required_columns].notna().sum().sum() - missing_cells = total_cells - non_null_cells - - # Handle ZeroDivisionError if total_cells is 0 - etsi_score = (non_null_cells / total_cells) * 100 if total_cells > 0 else 0.0 - - print(f"\n[ETSI Calculation]") - print(f" Total cells: {total_cells} ({N} rows × {M_req} cols)") - print(f" Non-null cells: {non_null_cells}") - print(f" Missing cells: {missing_cells}") - print(f" Completeness Score: {etsi_score:.2f}%") - print(f" Formula: C_D = (Non-null / Total) * 100") - - return { - "etsi_completeness": round(etsi_score, 2), - "total_cells": total_cells, - "non_null_cells": non_null_cells, - "missing_cells": missing_cells, - "formula": "C_D = (Non-null / Total) * 100", - "gx_validation": all(r.success for r in gx_results) - } - - -def etsi_accuracy(validator, df, column, min_value=None, max_value=None, regex=None): - """ - ETSI 5.4 Accuracy - Accurate implementation - Formula: Accuracy Rate = (Correct values / Total values) * 100 - """ - print("\n" + "="*60) - print(f"=== Accuracy: {column} ===") - print("="*60) - - # 1. 
Validate with GX - if min_value is not None and max_value is not None: - result = validator.expect_column_values_to_be_between( - column=column, min_value=min_value, max_value=max_value - ) - print(f"\n[GX Validation (Range check: {min_value} ~ {max_value})]") - elif regex: - result = validator.expect_column_values_to_match_regex( - column=column, regex=regex - ) - print(f"\n[GX Validation (Format check)]") - else: - return {"error": "min/max or regex is required"} - - status = 'Pass' if result.success else 'Fail' - print(f" Result: {status}") - - # 2. Recalculate precisely using the ETSI formula - total_values = len(df[column]) # Total including nulls - - if min_value is not None and max_value is not None: - valid_mask = df[column].between(min_value, max_value, inclusive='both') - correct_values = valid_mask.sum() - else: - valid_mask = df[column].astype(str).str.match(regex, na=False) - correct_values = valid_mask.sum() - - incorrect_values = total_values - correct_values - etsi_score = (correct_values / total_values) * 100 if total_values > 0 else 0.0 - - print(f"\n[ETSI Calculation]") - print(f" Total values: {total_values}") - print(f" Correct values: {correct_values}") - print(f" Incorrect values: {incorrect_values}") - print(f" Accuracy Score: {etsi_score:.2f}%") - print(f" Formula: Accuracy = (Correct / Total) * 100") - - return { - "etsi_accuracy": round(etsi_score, 2), - "total_values": total_values, - "correct_values": correct_values, - "incorrect_values": incorrect_values, - "formula": "Accuracy = (Correct / Total) * 100", - "gx_validation": result.success - } - -def etsi_accuracy_weight_comparison(validator, df, listed_col, true_col, tolerance=0.0): - """ - ETSI 5.4 Accuracy - Weight comparison (Matches the document example) - Formula: Accuracy Rate = (Correct values / Total values) * 100 - """ - print("\n" + "="*60) - print(f"=== Accuracy: {listed_col} vs {true_col} ===") - print("="*60) - - # Calculate using ETSI formula - total_values = len(df) - - 
# Check if ListedWeight == TrueWeight (within tolerance) - weight_diff = abs(df[listed_col] - df[true_col]) - correct_mask = weight_diff <= tolerance - correct_values = correct_mask.sum() - incorrect_values = total_values - correct_values - - etsi_score = (correct_values / total_values) * 100 if total_values > 0 else 0.0 - - print(f"\n[ETSI Calculation]") - print(f" Total products: {total_values}") - print(f" Correct weights: {correct_values}") - print(f" Incorrect weights: {incorrect_values}") - print(f" Tolerance: ±{tolerance} kg") - print(f" Accuracy Score: {etsi_score:.2f}%") - print(f" Formula: Accuracy = (Correct / Total) * 100") - - # Print details of incorrect products - if incorrect_values > 0: - print(f"\n [Details of Incorrect Products]") - incorrect_df = df[~correct_mask] - for idx, row in incorrect_df.iterrows(): - diff = abs(row[listed_col] - row[true_col]) - print(f" {row['ProductID']}: Listed={row[listed_col]}kg, " - f"True={row[true_col]}kg, Diff={diff:.2f}kg") - - return { - "etsi_accuracy": round(etsi_score, 2), - "total_values": total_values, - "correct_values": correct_values, - "incorrect_values": incorrect_values, - "tolerance": tolerance, - "formula": "Accuracy = (Correct / Total) * 100" - } - - -def etsi_consistency(validator, df, column, allowed_values): - """ - ETSI 5.6 Consistency - Accurate implementation - Formula: Consistency Score = (Consistent values / Total values) * 100 - """ - print("\n" + "="*60) - print(f"=== Consistency: {column} ===") - print("="*60) - - # 1. Validate with GX - result = validator.expect_column_values_to_be_in_set( - column=column, value_set=allowed_values - ) - - status = '✓ Pass' if result.success else '✗ Fail' - print(f"\n[GX Validation]") - print(f" Result: {status}") - print(f" Allowed values: {allowed_values}") - - # 2. 
Recalculate precisely using the ETSI formula - total_values = len(df[column]) # Total including nulls - - consistent_mask = df[column].isin(allowed_values) - consistent_values = consistent_mask.sum() - inconsistent_values = total_values - consistent_values - - etsi_score = (consistent_values / total_values) * 100 if total_values > 0 else 0.0 - - print(f"\n[ETSI Calculation]") - print(f" Total values: {total_values}") - print(f" Consistent values: {consistent_values}") - print(f" Inconsistent values: {inconsistent_values}") - print(f" Consistency Score: {etsi_score:.2f}%") - print(f" Formula: Consistency = (Consistent / Total) * 100") - - return { - "etsi_consistency": round(etsi_score, 2), - "total_values": total_values, - "consistent_values": consistent_values, - "inconsistent_values": inconsistent_values, - "allowed_values": allowed_values, - "formula": "Consistency = (Consistent / Total) * 100", - "gx_validation": result.success - } - - -def run_etsi_validation(csv_file, dataset_name, config): - """ - Run ETSI data quality metrics (Improved version) - """ - try: - df = pd.read_csv(csv_file) - except FileNotFoundError: - print(f"\nError: Could not find the file '{csv_file}'. 
Skipping this test.") - return None - - print("\n" + "#"*60) - print(f"# ETSI Data Quality Assessment: {dataset_name}") - print("#"*60) - print(f"Dataset: {csv_file}") - print(f"Size: {len(df)} rows × {len(df.columns)} columns") - - context = gx.get_context(project_root_dir='./gx') # Specify GX project directory - - datasource_name = "etsi_datasource" - try: - context.get_datasource(datasource_name) - except ValueError: - context.add_datasource( - name=datasource_name, - class_name="Datasource", - execution_engine={"class_name": "PandasExecutionEngine"}, - data_connectors={ - "default_runtime_data_connector": { - "class_name": "RuntimeDataConnector", - "batch_identifiers": ["default_identifier"] - } - } - ) - - batch_request = RuntimeBatchRequest( - datasource_name=datasource_name, - data_connector_name="default_runtime_data_connector", - data_asset_name=dataset_name, - runtime_parameters={"batch_data": df}, - batch_identifiers={"default_identifier": dataset_name} - ) - - suite_name = f"{dataset_name}_suite" - try: - context.get_expectation_suite(suite_name) - except gx.exceptions.DataContextError: - context.add_expectation_suite(suite_name) - - validator = context.get_validator( - batch_request=batch_request, - expectation_suite_name=suite_name - ) - - results = {} - - # 1. Completeness - if 'required_columns' in config: - results['completeness'] = etsi_completeness( - validator, df, config['required_columns'] - ) - - # 2. 
Accuracy (multiple types) - results['accuracy'] = {} - if 'accuracy_checks' in config: - for column, checks in config['accuracy_checks'].items(): - results['accuracy'][column] = etsi_accuracy( - validator, df, column, - min_value=checks.get('min'), - max_value=checks.get('max'), - regex=checks.get('regex') - ) - - if 'accuracy_weight_comparison_check' in config: - check_params = config['accuracy_weight_comparison_check'] - results['accuracy']['weight_comparison'] = etsi_accuracy_weight_comparison( - validator, df, - listed_col=check_params['listed_col'], - true_col=check_params['true_col'], - tolerance=check_params.get('tolerance', 0.0) - ) - - # 3. Consistency - if 'consistency_checks' in config: - results['consistency'] = {} - for column, allowed_values in config['consistency_checks'].items(): - results['consistency'][column] = etsi_consistency( - validator, df, column, allowed_values - ) - - # Final Summary - print("\n" + "="*60) - print("Final ETSI Scores") - print("="*60) - - if 'completeness' in results: - print(f"Completeness: {results['completeness']['etsi_completeness']}%") - - if 'accuracy' in results and results['accuracy']: - for check_name, result in results['accuracy'].items(): - print(f"Accuracy ({check_name}): {result['etsi_accuracy']}%") - - if 'consistency' in results: - for col, result in results['consistency'].items(): - print(f"Consistency ({col}): {result['etsi_consistency']}%") - - print("="*60) - - return results - - -# ============================================================================== -# Main Execution Block -# ============================================================================== -if __name__ == "__main__": - - # This part is not needed in a real environment. - # It just creates dummy data files for testing. 
- try: - # Pump data (contains nulls) - pump_data = { - 'Timestamp': ['2023-10-01 10:00', '2023-10-01 10:01', '2023-10-01 10:02', '2023-10-01 10:03'], - 'PumpID': ['P001', 'P001', 'P002', 'P002'], - 'Temperature_C': [75.5, 76.1, None, 89.2], - 'Vibration_mm_s': [0.5, 0.52, 0.48, None], - 'Pressure_kPa': [3012, 3025, 2988, 3001] - } - pd.DataFrame(pump_data).to_csv('pump_data.csv', index=False) - - # Product weight data - products_data = { - 'ProductID': ['PROD-A', 'PROD-B', 'PROD-C', 'PROD-D'], - 'ListedWeight_kg': [5.0, 10.2, 15.0, 20.8], - 'TrueWeight_kg': [5.01, 10.5, 15.0, 20.82] - } - pd.DataFrame(products_data).to_csv('products.csv', index=False) - - # Product category data - category_data = { - 'ProductID': ['E-01', 'F-01', 'C-01', 'X-01'], - 'ProductName': ['Laptop', 'Sofa', 'T-Shirt', 'Unknown'], - 'Category': ['Electronics', 'Furniture', 'Clothing', 'Miscellaneous'], - 'Price': [1200, 800, 25, 10], - 'DiscountPrice': [1100, 750, 25, 10], - 'StockQuantity': [50, 20, 200, 500] - } - pd.DataFrame(category_data).to_csv('product_categories.csv', index=False) - except Exception as e: - print(f"Error while creating test data: {e}") - - # 1. Completeness Test (Pump Data) - print("\n" + "Running Test 1: Completeness (Pump Performance Data)") - pump_config = { - 'required_columns': ['Timestamp', 'PumpID', 'Temperature_C', - 'Vibration_mm_s', 'Pressure_kPa'] - } - pump_results = run_etsi_validation('pump_data.csv', 'pump_performance', pump_config) - - # 2. Accuracy Test (Product Weight Data) - print("\n" + "Running Test 2: Accuracy (Product Weight Data)") - products_config = { - 'accuracy_weight_comparison_check': { - 'listed_col': 'ListedWeight_kg', - 'true_col': 'TrueWeight_kg', - 'tolerance': 0.1 # Tolerance 0.1kg - } - } - products_results = run_etsi_validation('products.csv', 'products', products_config) - - # 3. 
Consistency Test (Product Category Data) - print("\n" + "Running Test 3: Consistency (Product Category Data)") - category_config = { - 'required_columns': ['ProductID', 'ProductName', 'Category', - 'Price', 'DiscountPrice', 'StockQuantity'], - 'consistency_checks': { - 'Category': ['Electronics', 'Furniture', 'Clothing'] # Allowed category list - } - } - category_results = run_etsi_validation('product_categories.csv', 'categories', category_config) - - print("\nAll tests completed!") \ No newline at end of file diff --git a/etsi_dq/__init__.py b/etsi_dq/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/etsi_dq/helpers/__init__.py b/etsi_dq/helpers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/etsi_dq/helpers/dashboard_helpers.py b/etsi_dq/helpers/dashboard_helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..4c7e94a84384aa7e357ea384c9e6630d51fd28df --- /dev/null +++ b/etsi_dq/helpers/dashboard_helpers.py @@ -0,0 +1,282 @@ +from __future__ import annotations + +from typing import Any, Dict, List + +import numpy as np +import pandas as pd + +from etsi_dq.metrics.timeliness import compute_timeliness_latency +from etsi_dq.utils import SENTINEL_MISSING_VALUES + + +ACC_NA_TITLE = "Accuracy = N/A" +ACC_NA_BODY = ( + "Accuracy requires a reference value or reference dataset. " + "This dataset does not provide one under the current configuration." +) +ACC_NA_CTA = ( + "Provide a reference column in the same dataset or use External Reference File mode." +) + +TIME_NA_TITLE = "Timeliness = N/A" +TIME_NA_BODY = "Timeliness requires valid time inputs for the selected mode." +TIME_NA_CTA = "Provide a valid Event Time and System/Reference Time configuration." 
+ + +def _safe_minmax_series(values: pd.Series | Any) -> pd.Series: + if not isinstance(values, pd.Series): + values = pd.Series([values]) + s = pd.to_numeric(values, errors="coerce").astype(float) + arr = s.to_numpy() + finite = np.isfinite(arr) + if not finite.any(): + return s + mn = np.nanmin(arr) + mx = np.nanmax(arr) + if pd.isna(mn) or pd.isna(mx) or mx == mn: + return s + return (s - mn) / (mx - mn) + + + +def _safe_consistency_violation_count(df: pd.DataFrame, rule: str | None) -> tuple[int, str | None]: + rule = str(rule or "").strip() + if not rule: + return 0, None + try: + mask = df.eval(rule) + if isinstance(mask, pd.Series): + mask = mask.fillna(False).astype(bool) + return int((~mask).sum()), None + return 0, "Consistency rule did not evaluate to a row-wise boolean result." + except Exception as exc: + return 0, str(exc) + + + +def build_main_dashboard_completeness_payload( + df: pd.DataFrame, + details: Dict[str, Any] | None, +) -> Dict[str, Any]: + details = details or {} + used_cols = details.get("comp_used_cols") or df.columns.tolist() + used_cols = [c for c in used_cols if c in df.columns] + + column_scores = details.get("comp_detail_column_scores") or [] + if column_scores: + comp_df = pd.DataFrame(column_scores) + if "Score" in comp_df.columns: + comp_df["Score"] = pd.to_numeric(comp_df["Score"], errors="coerce") + else: + comp_series = (1 - df[used_cols].isnull().mean()) * 100 if used_cols else pd.Series(dtype=float) + comp_df = comp_series.reset_index() + comp_df.columns = ["Column", "Score"] + + comp_df = comp_df[[c for c in ["Column", "Score"] if c in comp_df.columns]].copy() + comp_df = comp_df.dropna(subset=["Score"]) if "Score" in comp_df.columns else comp_df + comp_df = comp_df.sort_values("Score", ascending=True) if not comp_df.empty else comp_df + + worst_n = min(12, len(comp_df)) if not comp_df.empty else 0 + comp_view = comp_df.head(worst_n).copy() if worst_n > 0 else pd.DataFrame(columns=["Column", "Score"]) + if not 
comp_view.empty: + comp_view["Score Label"] = comp_view["Score"].map(lambda v: f"{float(v):.1f}%") + + return { + "main_comp_score": details.get("comp_dataset_score", details.get("comp_row_score", details.get("comp_col_score", None))), + "main_comp_columns_total": int(len(comp_df)), + "main_comp_worst_n": int(worst_n), + "main_comp_chart_rows": comp_view.to_dict(orient="records"), + } + + + +def build_main_dashboard_reliability_payload(details: Dict[str, Any] | None) -> Dict[str, Any]: + details = details or {} + score = details.get("rel_overall_cv") + rel_status = details.get("rel_status", "ok") + selected_columns = details.get("rel_selected_columns") or [] + valid_count = int(details.get("rel_valid_column_count", 0) or 0) + return { + "main_rel_status": rel_status, + "main_rel_score": score, + "main_rel_selected_columns": selected_columns, + "main_rel_valid_column_count": valid_count, + "main_rel_label": f"{float(score):.2f}%" if score is not None else "N/A", + } + + + +def build_main_dashboard_uniqueness_payload(details: Dict[str, Any] | None) -> Dict[str, Any]: + details = details or {} + score = details.get("uniq_score") + basis = details.get("uniq_basis", "all_columns") + duplicate_rows = int(details.get("uniq_duplicate_rows", details.get("uniq_dup_rows", 0)) or 0) + total_rows = int(details.get("uniq_total_rows", 0) or 0) + return { + "main_uniq_score": score, + "main_uniq_label": f"{float(score):.2f}%" if score is not None else "N/A", + "main_uniq_basis": basis, + "main_uniq_duplicate_rows": duplicate_rows, + "main_uniq_total_rows": total_rows, + } + + + +def build_main_dashboard_accuracy_payload( + df: pd.DataFrame, + details: Dict[str, Any] | None, + scores: Dict[str, Any] | None = None, +) -> Dict[str, Any]: + details = details or {} + scores = scores or {} + + score = scores.get("Accuracy", details.get("acc_score")) + true_col = details.get("true", "") + meas_col = details.get("target", "") + acc_mode = details.get("acc_mode", "Direct Compare") + 
tol_type = details.get("acc_tol_type", "absolute") + tol_value = details.get("acc_tol_value", "") + + n_preview = min(60, len(df)) + sub_df = df.head(n_preview).reset_index(drop=True) + + y_true_src = sub_df[true_col] if true_col in sub_df.columns else pd.Series(np.nan, index=sub_df.index) + y_meas_src = sub_df[meas_col] if meas_col in sub_df.columns else pd.Series(np.nan, index=sub_df.index) + + y_true = pd.to_numeric(y_true_src, errors="coerce") + y_meas = pd.to_numeric(y_meas_src, errors="coerce") + if isinstance(y_true, pd.Series): + y_true = y_true.replace(list(SENTINEL_MISSING_VALUES), np.nan) + if isinstance(y_meas, pd.Series): + y_meas = y_meas.replace(list(SENTINEL_MISSING_VALUES), np.nan) + + y_pred = y_meas.copy() + if acc_mode == "Calibrated Compare (Linear)": + params = details.get("acc_calib_params") + if params and isinstance(params, (tuple, list)) and len(params) == 2: + a, b = float(params[0]), float(params[1]) + y_pred = a + b * y_meas + + y_true_n = _safe_minmax_series(y_true) + y_pred_n = _safe_minmax_series(y_pred) + + preview_df = pd.DataFrame( + { + "index": list(range(n_preview)), + "Measured": [None if pd.isna(v) else float(v) for v in y_pred_n.tolist()], + "Reference": [None if pd.isna(v) else float(v) for v in y_true_n.tolist()], + } + ) + + return { + "main_acc_score": score, + "main_acc_label": f"{float(score):.1f}%" if score is not None else "N/A", + "main_acc_preview_rows": preview_df.to_dict(orient="records"), + "main_acc_mode": acc_mode, + "main_acc_tol_type": tol_type, + "main_acc_tol_value": tol_value, + "main_acc_na_title": ACC_NA_TITLE, + "main_acc_na_body": ACC_NA_BODY, + "main_acc_na_cta": ACC_NA_CTA, + } + + + +def build_main_dashboard_consistency_payload( + df: pd.DataFrame, + details: Dict[str, Any] | None, + scores: Dict[str, Any] | None = None, +) -> Dict[str, Any]: + details = details or {} + scores = scores or {} + score = scores.get("Consistency") + rule = details.get("rule", details.get("cons_rule_used", "")) + 
violation_count, error = _safe_consistency_violation_count(df, rule) + return { + "main_cons_score": score, + "main_cons_label": f"{float(score):.1f}%" if score is not None else "N/A", + "main_cons_violation_count": int(violation_count), + "main_cons_error": error, + } + + + +def build_main_dashboard_timeliness_payload( + df: pd.DataFrame, + details: Dict[str, Any] | None, + scores: Dict[str, Any] | None = None, +) -> Dict[str, Any]: + details = details or {} + scores = scores or {} + score = scores.get("Timeliness") + + event_col = details.get("t_event") + system_col = details.get("t_system") + + lat, t_event, t_system, meta = compute_timeliness_latency( + df, + event_col=event_col, + system_col=system_col, + ) + if bool(details.get("t_auto_swapped", False)) and not bool(meta.get("auto_swapped", False)): + lat = (t_event - t_system).dt.total_seconds() + + lat_clean = lat.dropna() + hist_values = [float(v) for v in lat_clean.tolist()] + return { + "main_time_score": score, + "main_time_label": f"{float(score):.2f}%" if score is not None else "N/A", + "main_time_hist_values": hist_values, + "main_time_na_title": TIME_NA_TITLE, + "main_time_na_body": TIME_NA_BODY, + "main_time_na_cta": TIME_NA_CTA, + } + + + +def build_main_dashboard_profile_payload(df: pd.DataFrame) -> Dict[str, Any]: + dtype_counts = df.dtypes.astype(str).value_counts().reset_index() + dtype_counts.columns = ["Type", "Count"] + + sample = df.head(50).isnull().astype(int) + heatmap_rows = sample.values.tolist() + heatmap_columns = sample.columns.tolist() + heatmap_has_missing = int(sample.sum().sum()) > 0 + + return { + "main_profile_dtype_rows": dtype_counts.to_dict(orient="records"), + "main_profile_heatmap_rows": heatmap_rows, + "main_profile_heatmap_columns": heatmap_columns, + "main_profile_heatmap_has_missing": bool(heatmap_has_missing), + } + + + +def build_main_dashboard_payload( + df: pd.DataFrame, + scores: Dict[str, Any], + details: Dict[str, Any] | None, +) -> Dict[str, Any]: + details = 
details or {} + valid_scores = [v for v in scores.values() if v is not None] + avg_score = float(sum(valid_scores) / len(valid_scores)) if valid_scores else 0.0 + + payload: Dict[str, Any] = { + "main_avg_score": avg_score, + **build_main_dashboard_profile_payload(df), + } + + if "Completeness" in scores: + payload.update(build_main_dashboard_completeness_payload(df, details)) + if "Reliability" in scores: + payload.update(build_main_dashboard_reliability_payload(details)) + if "Uniqueness" in scores: + payload.update(build_main_dashboard_uniqueness_payload(details)) + if "Accuracy" in scores: + payload.update(build_main_dashboard_accuracy_payload(df, details, scores)) + if "Consistency" in scores: + payload.update(build_main_dashboard_consistency_payload(df, details, scores)) + if "Timeliness" in scores: + payload.update(build_main_dashboard_timeliness_payload(df, details, scores)) + + return payload \ No newline at end of file diff --git a/etsi_dq/helpers/detail_helpers.py b/etsi_dq/helpers/detail_helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..b7e2ce77c995f542bb179306cd39a8cffdfbe6e1 --- /dev/null +++ b/etsi_dq/helpers/detail_helpers.py @@ -0,0 +1,993 @@ +from __future__ import annotations + +from typing import Any, Dict, List + +import pandas as pd +import numpy as np + +from etsi_dq.utils import _build_missing_mask + + +def build_completeness_detail_payload( + df: pd.DataFrame, + used_cols: List[str] | None, + missing_tokens: List[str] | None, +) -> Dict[str, Any]: + """ + Build a detail-friendly payload for Completeness. + + This helper is intended for backend use so that dashboard/frontend code does + not need to recompute completeness KPIs, heatmap input data, per-column + completeness, missing counts, or issue-report entries. 
+ """ + used_cols = used_cols or df.columns.tolist() + used_cols = [c for c in used_cols if c in df.columns] + missing_tokens = missing_tokens or [] + + mm = _build_missing_mask(df, used_cols, missing_tokens) + + total_records = int(len(df)) + missing_cells = int(mm.sum().sum()) if not mm.empty else 0 + incomplete_records = int(mm.any(axis=1).sum()) if not mm.empty else 0 + complete_records = int(total_records - incomplete_records) + + complete_pct = (complete_records / total_records * 100.0) if total_records else 0.0 + incomplete_pct = (incomplete_records / total_records * 100.0) if total_records else 0.0 + + heatmap_df = mm.head(100).astype(int) if not mm.empty else pd.DataFrame() + heatmap_columns = [str(c) for c in heatmap_df.columns.tolist()] if not heatmap_df.empty else [] + heatmap_data = heatmap_df.values.tolist() if not heatmap_df.empty else [] + + if not mm.empty: + comp_series = (1 - mm.mean(axis=0)) * 100 + miss_count_series = mm.sum(axis=0) + miss_rate_series = mm.mean(axis=0) * 100 + else: + comp_series = pd.Series(dtype=float) + miss_count_series = pd.Series(dtype=float) + miss_rate_series = pd.Series(dtype=float) + + comp_df = comp_series.reset_index() + if not comp_df.empty: + comp_df.columns = ["Column", "Score"] + comp_df = comp_df.sort_values("Score", ascending=True) + comp_detail_column_scores = [ + { + "Column": str(row["Column"]), + "Score": float(row["Score"]), + "Score Label": f"{float(row['Score']):.1f}%", + } + for _, row in comp_df.iterrows() + ] + else: + comp_detail_column_scores = [] + + miss_count_df = miss_count_series.reset_index() + if not miss_count_df.empty: + miss_count_df.columns = ["Column", "Missing Count"] + miss_count_df = miss_count_df.sort_values("Missing Count", ascending=False) + comp_missing_count_table = [ + { + "Column": str(row["Column"]), + "Missing Count": int(row["Missing Count"]), + } + for _, row in miss_count_df.iterrows() + ] + else: + comp_missing_count_table = [] + + issue_df = 
miss_rate_series.reset_index() + if not issue_df.empty: + issue_df.columns = ["Column", "Missing Rate"] + issue_df = issue_df.sort_values("Missing Rate", ascending=False) + issue_df = issue_df[issue_df["Missing Rate"] > 0] + else: + issue_df = pd.DataFrame(columns=["Column", "Missing Rate"]) + + comp_issue_reports: List[Dict[str, Any]] = [] + if not issue_df.empty: + top_n = min(5, len(issue_df)) + for _, row in issue_df.head(top_n).iterrows(): + col_name = str(row["Column"]) + miss_pct = float(row["Missing Rate"]) + + if miss_pct >= 20: + severity = "High" + action = "Review mandatory-field rules, source-system mapping, or ingestion failures for this column." + elif miss_pct >= 5: + severity = "Medium" + action = "Inspect upstream data entry, parsing, or ingestion processes for this column." + else: + severity = "Low" + action = "Monitor this column and validate whether the missingness is expected or structural." + + comp_issue_reports.append( + { + "column": col_name, + "missing_rate": miss_pct, + "severity": severity, + "action": action, + "summary": f"{col_name} currently shows a {miss_pct:.1f}% missing-value rate in the selected completeness scope.", + } + ) + + return { + "comp_scope_count": len(used_cols), + "comp_total_records": total_records, + "comp_missing_cells": missing_cells, + "comp_complete_records": complete_records, + "comp_incomplete_records": incomplete_records, + "comp_complete_pct": complete_pct, + "comp_incomplete_pct": incomplete_pct, + "comp_heatmap_columns": heatmap_columns, + "comp_heatmap_data": heatmap_data, + "comp_detail_column_scores": comp_detail_column_scores, + "comp_missing_count_table": comp_missing_count_table, + "comp_issue_reports": comp_issue_reports, + } + + +def build_reliability_detail_payload( + df: pd.DataFrame, + rel_results: List[Dict[str, Any]] | None, + rel_selected_columns: List[str] | None, +) -> Dict[str, Any]: + """ + Build a detail-friendly payload for Reliability. 
+ + This helper is intended for backend use so that dashboard/frontend code does + not need to recompute formatted tables, selector labels, clean numeric + series, histograms, rolling stability inputs, or interpretation/profile + values on the frontend. + """ + rel_results = rel_results or [] + rel_selected_columns = rel_selected_columns or [] + + valid_rel_results = [ + r for r in rel_results + if r.get("status") == "ok" and r.get("cv") is not None + ] + + detail_selector_options: List[str] = [] + detail_selector_map: List[Dict[str, Any]] = [] + configured_results_table: List[Dict[str, Any]] = [] + + for r in rel_results: + configured_results_table.append( + { + "Column": r.get("column"), + "Status": r.get("status"), + "N/A Reason": r.get("na_reason"), + "Valid Values": int(r.get("valid_count", 0) or 0), + "Mean": f"{float(r.get('mean')):.4f}" if r.get("mean") is not None else "N/A", + "Std Dev": f"{float(r.get('std')):.4f}" if r.get("std") is not None else "N/A", + "CV": f"{float(r.get('cv')):.2f}%" if r.get("cv") is not None else "N/A", + "Reliability Level": r.get("level", "N/A"), + "Warning": r.get("warning"), + "High Threshold": r.get("high_max"), + "Moderate Threshold": r.get("moderate_max"), + } + ) + + detail_results: List[Dict[str, Any]] = [] + for r in valid_rel_results: + rel_col = r.get("column") + if rel_col not in df.columns: + continue + + s = pd.to_numeric(df[rel_col], errors="coerce") + try: + s = s.replace(list([-200]), pd.NA) + except Exception: + s = s.replace(-200, pd.NA) + s_clean = s.dropna() + + histogram_values = [float(v) for v in s_clean.tolist()] + + rolling_window = None + rolling_cv_points: List[Dict[str, Any]] = [] + if len(s_clean) >= 30: + tmp = s_clean.reset_index(drop=True) + win = min(200, max(30, int(len(tmp) * 0.05))) + roll_mean = tmp.rolling(win).mean() + roll_std = tmp.rolling(win).std() + roll_cv = (roll_std / roll_mean) * 100 + roll_df = pd.DataFrame({"index": range(len(tmp)), "CV%": roll_cv}) + max_points = 5000 + if 
len(roll_df) > max_points: + step = max(1, int(len(roll_df) / max_points)) + roll_df = roll_df.iloc[::step, :] + rolling_window = int(win) + rolling_cv_points = [ + { + "index": int(row["index"]), + "CV%": (float(row["CV%"]) if pd.notna(row["CV%"]) else None), + } + for _, row in roll_df.iterrows() + ] + + label = f"{rel_col} | CV {float(r.get('cv', 0.0)):.2f}% | {r.get('level', 'N/A')}" + detail_selector_options.append(label) + detail_selector_map.append( + { + "label": label, + "column": rel_col, + } + ) + + detail_results.append( + { + "label": label, + "column": rel_col, + "cv": float(r.get("cv", 0.0) or 0.0), + "valid_count": int(r.get("valid_count", 0) or 0), + "mean": (float(r.get("mean")) if r.get("mean") is not None else None), + "std": (float(r.get("std")) if r.get("std") is not None else None), + "level": r.get("level", "N/A"), + "warning": r.get("warning"), + "high_max": float(r.get("high_max", 5.0) or 5.0), + "moderate_max": float(r.get("moderate_max", 15.0) or 15.0), + "histogram_values": histogram_values, + "rolling_window": rolling_window, + "rolling_cv_points": rolling_cv_points, + } + ) + + return { + "rel_configured_results_table": configured_results_table, + "rel_detail_selector_options": detail_selector_options, + "rel_detail_selector_map": detail_selector_map, + "rel_detail_results": detail_results, + "rel_detail_valid_result_count": len(detail_results), + "rel_selected_columns": rel_selected_columns, + } + + +def build_uniqueness_detail_payload( + df: pd.DataFrame, + key_cols: List[str] | None, +) -> Dict[str, Any]: + """ + Build a detail-friendly payload for Uniqueness. + + This helper is intended for backend use so that dashboard/frontend code does + not need to recompute duplicate-row subsets, basis labels, or duplicate + preview data on the frontend. 
+ """ + key_cols = key_cols or [] + + if key_cols: + tmp = df[key_cols].copy() + dup_mask = tmp.duplicated(keep=False) + dup_rows_df = df.loc[dup_mask].copy() + mode_label = "Current Dataset Only" + basis_label = "1 key column" if len(key_cols) == 1 else f"{len(key_cols)} key columns" + basis_notes = "Duplicates are evaluated based on the selected key columns only." + key_caption = ", ".join([f"`{c}`" for c in key_cols]) + else: + tmp = df.copy() + dup_mask = tmp.duplicated(keep=False) + dup_rows_df = df.loc[dup_mask].copy() + mode_label = "Current Dataset Only" + basis_label = "All columns" + basis_notes = "Duplicates are evaluated using full-row duplicates across ALL columns." + key_caption = "" + + duplicate_rows_preview = dup_rows_df.head(100).to_dict(orient="records") if not dup_rows_df.empty else [] + + return { + "uniq_mode_label": mode_label, + "uniq_basis_label": basis_label, + "uniq_basis_notes": basis_notes, + "uniq_key_caption": key_caption, + "uniq_duplicate_preview_rows": duplicate_rows_preview, + "uniq_duplicate_preview_count": int(len(dup_rows_df)), + } + + +# Helper for Accuracy detail payloads +def build_accuracy_detail_payload( + df: pd.DataFrame, + acc_rule_results: List[Dict[str, Any]] | None, +) -> Dict[str, Any]: + """ + Build a detail-friendly payload for Accuracy. + + This helper is intended for backend use so that dashboard/frontend code does + not need to recompute configured-rule tables or selector labels on the + frontend. 
+ """ + acc_rule_results = acc_rule_results or [] + + configured_rules_table: List[Dict[str, Any]] = [] + detail_rule_options: List[str] = [] + detail_rule_map: List[Dict[str, Any]] = [] + detail_rule_payloads: List[Dict[str, Any]] = [] + + for r in acc_rule_results: + rule_index = r.get("rule_index", "?") + column = r.get("column", "N/A") + method = r.get("method", "N/A") + score = r.get("score") + message = r.get("message") + + if score is not None: + label = f"Rule {rule_index} | {column} | {method} | {float(score):.2f}%" + elif message: + label = f"Rule {rule_index} | {column} | {method} | {message}" + else: + label = f"Rule {rule_index} | {column} | {method}" + + detail_rule_options.append(label) + detail_rule_map.append( + { + "label": label, + "rule_index": rule_index, + "column": column, + "method": method, + } + ) + meta = r.get("meta") or {} + selected_method = method + selected_column = r.get("column") + selected_score = r.get("score") + selected_checked = int(r.get("checked", 0) or 0) + selected_accurate = int(r.get("accurate", 0) or 0) + selected_inaccurate = int(r.get("inaccurate", 0) or 0) + selected_coverage = float(r.get("coverage", 0.0) or 0.0) + selected_mae = r.get("mae") + selected_rmse = r.get("rmse") + + if selected_method == "Threshold (Min/Max)": + selected_task_type = "threshold" + elif selected_method == "Statistical Bounds (k-sigma)": + selected_task_type = "bounds" + elif isinstance(meta, dict): + selected_task_type = meta.get("task", "unknown") + else: + selected_task_type = "unknown" + + detail_rule_payloads.append( + { + "label": label, + "rule_index": rule_index, + "selected_method": selected_method, + "selected_meta": meta if isinstance(meta, dict) else {}, + "selected_column": selected_column, + "selected_score": (float(selected_score) if selected_score is not None else None), + "selected_checked": selected_checked, + "selected_accurate": selected_accurate, + "selected_inaccurate": selected_inaccurate, + "selected_coverage": 
selected_coverage, + "selected_mae": selected_mae, + "selected_rmse": selected_rmse, + "acc_task_type": selected_task_type, + } + ) + + configured_rules_table.append( + { + "Rule #": rule_index, + "Measured Column": column, + "Method": method, + "Score": f"{float(score):.2f}%" if score is not None else "N/A", + "Checked": int(r.get("checked", 0) or 0), + "Accurate / In Rule": int(r.get("accurate", 0) or 0), + "Inaccurate / Out of Rule": int(r.get("inaccurate", 0) or 0), + "Coverage": f"{float(r.get('coverage')) * 100:.2f}%" if r.get("coverage") is not None else "N/A", + "MAE": f"{float(r.get('mae')):.4f}" if r.get("mae") is not None else "N/A", + "RMSE": f"{float(r.get('rmse')):.4f}" if r.get("rmse") is not None else "N/A", + "Status": message, + } + ) + + return { + "acc_configured_rules_table": configured_rules_table, + "acc_detail_rule_options": detail_rule_options, + "acc_detail_rule_map": detail_rule_map, + "acc_detail_rule_payloads": detail_rule_payloads, + "acc_detail_rule_count": len(acc_rule_results), + } + + +# Helper for Accuracy pair analysis payloads (classification & regression) +def _build_accuracy_classification_payload( + pairs_raw: pd.DataFrame, + key_cols_for_view: List[str] | None, + coverage: float | None, + total_rows: int, + acc_meta: Dict[str, Any] | None, +) -> Dict[str, Any]: + key_cols_for_view = key_cols_for_view or [] + acc_meta = acc_meta or {} + + y_true_raw = pairs_raw['__acc_true'].replace(list([-200]), np.nan) + y_meas_raw = pairs_raw['__acc_measured'].replace(list([-200]), np.nan) + y_true_s = y_true_raw.astype('string') + y_meas_s = y_meas_raw.astype('string') + valid = y_true_s.notna() & y_meas_s.notna() + + if int(valid.sum()) == 0: + return { + "acc_pair_payload_status": "n/a", + "acc_pair_payload_reason": "No valid matched pairs found between the measured value and the reference value.", + } + + comp_df = pairs_raw.loc[valid, key_cols_for_view].copy() if key_cols_for_view else pd.DataFrame(index=pairs_raw.loc[valid].index) + 
comp_df['Observed'] = y_meas_s[valid].astype(str) + comp_df['Reference'] = y_true_s[valid].astype(str) + comp_df['Match'] = comp_df['Observed'] == comp_df['Reference'] + comp_df['Status'] = np.where(comp_df['Match'], 'Accurate', 'Inaccurate') + + total_checked = int(len(comp_df)) + acc_count = int(comp_df['Match'].sum()) + inacc_count = int(total_checked - acc_count) + if coverage is None: + coverage = float(total_checked / total_rows) if total_rows else 0.0 + + ref_counts = comp_df['Reference'].value_counts(dropna=False).reset_index() + ref_counts.columns = ['Reference', 'Count'] + ref_distribution = ref_counts.head(15).to_dict(orient='records') + + mismatch_df = comp_df.loc[~comp_df['Match']].copy() + display_cols = key_cols_for_view + ['Observed', 'Reference', 'Status'] + mismatch_records = mismatch_df[display_cols].head(100).to_dict(orient='records') if not mismatch_df.empty else [] + + confusion_matrix = None + if all(k in acc_meta for k in ['TP', 'TN', 'FP', 'FN']): + confusion_matrix = { + 'rows': ['Reference 0', 'Reference 1'], + 'columns': ['Predicted 0', 'Predicted 1'], + 'data': [ + [int(acc_meta['TN']), int(acc_meta['FP'])], + [int(acc_meta['FN']), int(acc_meta['TP'])], + ], + } + + return { + "acc_pair_payload_status": "ok", + "acc_pair_payload_type": "classification", + "acc_checked_pairs": total_checked, + "acc_accurate_pairs": acc_count, + "acc_inaccurate_pairs": inacc_count, + "acc_pair_coverage": float(coverage), + "acc_reference_distribution": ref_distribution, + "acc_mismatch_records": mismatch_records, + "acc_confusion_matrix": confusion_matrix, + } + + + +def _build_accuracy_regression_payload( + df: pd.DataFrame, + pairs_raw: pd.DataFrame, + key_cols_for_view: List[str] | None, + coverage: float | None, + total_rows: int, + acc_meta: Dict[str, Any] | None, + acc_mode: str, + tol_type: str, + tol_value: float, + mae: float | None, + rmse: float | None, + ref_mode_used: str, + calibration_params: Any, +) -> Dict[str, Any]: + key_cols_for_view = 
key_cols_for_view or [] + acc_meta = acc_meta or {} + + y_true = pd.to_numeric(pairs_raw['__acc_true'], errors='coerce').replace(list([-200]), np.nan) + y_meas = pd.to_numeric(pairs_raw['__acc_measured'], errors='coerce').replace(list([-200]), np.nan) + + valid = y_true.notna() & y_meas.notna() + if int(valid.sum()) == 0: + return { + "acc_pair_payload_status": "n/a", + "acc_pair_payload_reason": "No valid matched pairs found between the measured value and the reference value.", + } + + if acc_mode == 'Calibrated Compare (Linear)' and int(valid.sum()) > 0: + params = calibration_params + if params and isinstance(params, (tuple, list)) and len(params) == 2: + a, b = float(params[0]), float(params[1]) + else: + x = y_meas[valid].to_numpy() + y = y_true[valid].to_numpy() + A = np.c_[np.ones(len(x)), x] + coef, *_ = np.linalg.lstsq(A, y, rcond=None) + a, b = float(coef[0]), float(coef[1]) + y_pred = a + b * y_meas + calibration_caption = f"Calibration (linear): reference ≈ a + b×measured | a={a:.4f}, b={b:.4f}" + else: + y_pred = y_meas + calibration_caption = None + + comp_df = pairs_raw.loc[valid, key_cols_for_view].copy() if key_cols_for_view else pd.DataFrame(index=pairs_raw.loc[valid].index) + comp_df['Observed'] = y_meas[valid].astype(float) + comp_df['Reference'] = y_true[valid].astype(float) + comp_df['Measured_or_Calibrated'] = y_pred[valid].astype(float) + comp_df['Residual'] = comp_df['Measured_or_Calibrated'] - comp_df['Reference'] + comp_df['Absolute Error'] = comp_df['Residual'].abs() + + if tol_value > 0: + if tol_type == 'relative_percent': + comp_df['Tolerance'] = comp_df['Reference'].abs() * (tol_value / 100.0) + else: + comp_df['Tolerance'] = tol_value + comp_df['Within Tolerance'] = comp_df['Absolute Error'] <= comp_df['Tolerance'] + else: + comp_df['Tolerance'] = np.nan + comp_df['Within Tolerance'] = np.nan + + total_checked = int(len(comp_df)) + acc_count = int(comp_df['Within Tolerance'].sum()) if tol_value > 0 else 0 + inacc_count = 
int(total_checked - acc_count) if tol_value > 0 else 0 + if coverage is None: + coverage = float(total_checked / total_rows) if total_rows else 0.0 + if mae is None: + mae = float(comp_df['Absolute Error'].mean()) + if rmse is None: + rmse = float(np.sqrt(np.mean(comp_df['Residual'] ** 2))) + + x_axis = None + if ref_mode_used == 'Column in Dataset' and 'Date' in df.columns and 'Time' in df.columns: + try: + date_s = df['Date'].astype(str).str.strip() + time_s = df['Time'].astype(str).str.strip().str.replace('.', ':', regex=False) + x_axis = pd.to_datetime(date_s + " " + time_s, errors='coerce', dayfirst=True) + if x_axis.notna().sum() == 0: + x_axis = None + except Exception: + x_axis = None + + if x_axis is None: + x_axis = pd.Series(range(len(comp_df)), index=comp_df.index) + else: + x_axis = pd.Series(x_axis, index=df.index).loc[comp_df.index] + + comp_df['x'] = x_axis + comp_df = comp_df.dropna(subset=['x']) + if comp_df.empty: + return { + "acc_pair_payload_status": "n/a", + "acc_pair_payload_reason": "No valid records to plot after cleaning timestamps and missing values.", + } + + comp_df = comp_df.sort_values('x') + trace_df = comp_df.head(150).copy() + + trace_records = trace_df[['x', 'Measured_or_Calibrated', 'Reference']].copy() + trace_records['x'] = trace_records['x'].astype(str) + trace_records = trace_records.to_dict(orient='records') + + residual_values = comp_df['Residual'].dropna().astype(float).tolist() + + agree_df = comp_df[['Reference', 'Measured_or_Calibrated']].dropna().copy() + if len(agree_df) > 5000: + agree_df = agree_df.sample(5000, random_state=42) + agreement_points = agree_df.to_dict(orient='records') if not agree_df.empty else [] + + error_df = comp_df.copy() + if tol_value > 0: + error_df = error_df.loc[~error_df['Within Tolerance']].copy() + else: + error_df = error_df.sort_values('Absolute Error', ascending=False) + + display_cols = key_cols_for_view + ['Observed', 'Reference', 'Measured_or_Calibrated', 'Residual', 'Absolute 
Error'] + if tol_value > 0: + display_cols += ['Tolerance', 'Within Tolerance'] + error_records = error_df[display_cols].head(100).to_dict(orient='records') if not error_df.empty else [] + + return { + "acc_pair_payload_status": "ok", + "acc_pair_payload_type": "regression", + "acc_checked_pairs": total_checked, + "acc_accurate_pairs": acc_count, + "acc_inaccurate_pairs": inacc_count, + "acc_pair_coverage": float(coverage), + "acc_pair_mae": float(mae) if mae is not None else None, + "acc_pair_rmse": float(rmse) if rmse is not None else None, + "acc_pair_tol_type": tol_type, + "acc_pair_tol_value": float(tol_value), + "acc_trace_records": trace_records, + "acc_residual_values": residual_values, + "acc_agreement_points": agreement_points, + "acc_error_records": error_records, + "acc_calibration_caption": calibration_caption, + } + + + +def build_accuracy_pair_analysis_payload( + df: pd.DataFrame, + pairs_raw: pd.DataFrame, + key_cols_for_view: List[str] | None, + acc_task: str, + coverage: float | None, + total_rows: int, + acc_meta: Dict[str, Any] | None, + acc_mode: str = 'Direct Compare', + tol_type: str = 'absolute', + tol_value: float = 0.0, + mae: float | None = None, + rmse: float | None = None, + ref_mode_used: str = 'Column in Dataset', + calibration_params: Any = None, +) -> Dict[str, Any]: + """ + Build a detail-friendly payload for Accuracy pair analysis. + + This helper is intended for backend use so that dashboard/frontend code does + not need to recompute classification/regression comparison tables, traces, + distributions, mismatch/error tables, or pair-level KPIs. 
+ """ + if acc_task == 'classification': + return _build_accuracy_classification_payload( + pairs_raw=pairs_raw, + key_cols_for_view=key_cols_for_view, + coverage=coverage, + total_rows=total_rows, + acc_meta=acc_meta, + ) + + return _build_accuracy_regression_payload( + df=df, + pairs_raw=pairs_raw, + key_cols_for_view=key_cols_for_view, + coverage=coverage, + total_rows=total_rows, + acc_meta=acc_meta, + acc_mode=acc_mode, + tol_type=tol_type, + tol_value=tol_value, + mae=mae, + rmse=rmse, + ref_mode_used=ref_mode_used, + calibration_params=calibration_params, + ) + + +# Wrapper for building rule-specific Accuracy pair-analysis payloads +def build_accuracy_rule_pair_payload( + df: pd.DataFrame, + cfg: Dict[str, Any], + selected_rule_payload: Dict[str, Any] | None, + ref_df: pd.DataFrame | None = None, +) -> Dict[str, Any]: + """ + Build a rule-specific Accuracy pair-analysis payload from the selected rule. + + This wrapper resolves the measured/reference pairing context for the selected + rule, builds `pairs_raw`, and then delegates to + `build_accuracy_pair_analysis_payload(...)` so dashboard code does not need + to rebuild pair-generation logic on the frontend. 
+ """ + selected_rule_payload = selected_rule_payload or {} + + selected_column = selected_rule_payload.get("selected_column", cfg.get("target")) + selected_meta = selected_rule_payload.get("selected_meta") or {} + acc_task_type = selected_rule_payload.get( + "acc_task_type", + selected_meta.get("task", cfg.get("acc_task_type", "regression")), + ) + selected_coverage = selected_rule_payload.get("selected_coverage", cfg.get("acc_coverage")) + selected_mae = selected_rule_payload.get("selected_mae", cfg.get("acc_mae")) + selected_rmse = selected_rule_payload.get("selected_rmse", cfg.get("acc_rmse")) + + ref_mode_used = cfg.get("acc_ref_mode", "Column in Dataset") + total_rows = int(len(df)) + acc_mode = cfg.get("acc_mode", "Direct Compare") + tol_type = cfg.get("acc_tol_type", "absolute") + tol_value = float(cfg.get("acc_tol_value", 0.0) or 0.0) + calibration_params = cfg.get("acc_calib_params") + + if ref_mode_used == "External Reference File": + ref_value_col = cfg.get("acc_ref_value_col", "(None)") + meas_keys = cfg.get("acc_meas_keys") or [] + ref_keys = cfg.get("acc_ref_keys") or [] + + if ref_df is None or selected_column in (None, "(None)") or ref_value_col in (None, "(None)"): + return { + "acc_pair_payload_status": "n/a", + "acc_pair_payload_reason": "Accuracy details are unavailable because the external reference configuration is incomplete.", + } + if not meas_keys or not ref_keys or len(meas_keys) != len(ref_keys): + return { + "acc_pair_payload_status": "n/a", + "acc_pair_payload_reason": "Accuracy details are unavailable because the matching key columns are incomplete.", + } + + missing_meas = [k for k in meas_keys if k not in df.columns] + missing_ref = [k for k in ref_keys if k not in ref_df.columns] + if missing_meas or missing_ref: + return { + "acc_pair_payload_status": "n/a", + "acc_pair_payload_reason": "Accuracy details are unavailable because one or more matching key columns are missing.", + } + + meas_join = df[meas_keys + 
[selected_column]].copy() + ref_join = ref_df[ref_keys + [ref_value_col]].copy() + key_map = {rk: mk for rk, mk in zip(ref_keys, meas_keys)} + ref_join = ref_join.rename(columns=key_map) + pairs_raw = meas_join.merge(ref_join, on=meas_keys, how="inner") + if pairs_raw.empty: + return { + "acc_pair_payload_status": "n/a", + "acc_pair_payload_reason": "No valid matched pairs found between the measured value and the reference value.", + } + pairs_raw = pairs_raw.rename(columns={selected_column: "__acc_measured", ref_value_col: "__acc_true"}) + key_cols_for_view = list(meas_keys) + else: + true_col = cfg.get("true") + if selected_column in (None, "(None)") or true_col in (None, "(None)"): + return { + "acc_pair_payload_status": "n/a", + "acc_pair_payload_reason": "Accuracy details are unavailable because the measured value column or reference column is missing.", + } + if selected_column not in df.columns or true_col not in df.columns: + return { + "acc_pair_payload_status": "n/a", + "acc_pair_payload_reason": "Accuracy details are unavailable because the measured value column or reference column is missing from the dataset.", + } + pairs_raw = df[[selected_column, true_col]].copy().rename( + columns={selected_column: "__acc_measured", true_col: "__acc_true"} + ) + key_cols_for_view = [] + + payload = build_accuracy_pair_analysis_payload( + df=df, + pairs_raw=pairs_raw, + key_cols_for_view=key_cols_for_view, + acc_task=acc_task_type, + coverage=selected_coverage, + total_rows=total_rows, + acc_meta=selected_meta, + acc_mode=acc_mode, + tol_type=tol_type, + tol_value=tol_value, + mae=selected_mae, + rmse=selected_rmse, + ref_mode_used=ref_mode_used, + calibration_params=calibration_params, + ) + + payload.update( + { + "acc_pair_ref_mode": ref_mode_used, + "acc_pair_measured_column": selected_column, + "acc_pair_reference_column": cfg.get("true") if ref_mode_used == "Column in Dataset" else cfg.get("acc_ref_value_col"), + "acc_pair_meas_keys": 
list(cfg.get("acc_meas_keys") or []), + "acc_pair_ref_keys": list(cfg.get("acc_ref_keys") or []), + "acc_pair_key_cols_for_view": key_cols_for_view, + } + ) + return payload + + +# Helper for Consistency detail payloads +def build_consistency_detail_payload( + cons_rule_results: List[Dict[str, Any]] | None, +) -> Dict[str, Any]: + """ + Build a detail-friendly payload for Consistency. + + This helper is intended for backend use so that dashboard/frontend code does + not need to recompute configured-rule tables, selector labels, selected-rule + context, or overall aggregate counts on the frontend. + """ + cons_rule_results = cons_rule_results or [] + + overall_cons_passed = sum( + int(r.get("match_count", 0) or 0) + for r in cons_rule_results + if r.get("score") is not None + ) + overall_cons_violations = sum( + int(r.get("violation_count", 0) or 0) + for r in cons_rule_results + if r.get("score") is not None + ) + overall_cons_evaluated = int(overall_cons_passed + overall_cons_violations) + + configured_rules_table: List[Dict[str, Any]] = [] + detail_rule_options: List[str] = [] + detail_rule_payloads: List[Dict[str, Any]] = [] + + for r in cons_rule_results: + rule_index = r.get("rule_index", "?") + mode = r.get("mode", "N/A") + column = r.get("column", "N/A") + score = r.get("score") + message = r.get("message") + + if score is not None: + label = f"Rule {rule_index} | {column} | {mode} | {float(score):.2f}%" + elif message: + label = f"Rule {rule_index} | {column} | {mode} | {message}" + else: + label = f"Rule {rule_index} | {column} | {mode}" + + detail_rule_options.append(label) + + configured_rules_table.append( + { + "Rule #": rule_index, + "Mode": mode, + "Column": column, + "Score": f"{float(score):.2f}%" if score is not None else "N/A", + "Passed": int(r.get("match_count", 0) or 0), + "Violations": int(r.get("violation_count", 0) or 0), + "Rule / Preset": r.get("rule_used"), + "Status": message, + } + ) + + selected_mode = r.get("mode", "Expression") + 
selected_column = r.get("column") + selected_score = float(score) if score is not None else None + selected_violation_count = int(r.get("violation_count", 0) or 0) + selected_match_count = int(r.get("match_count", 0) or 0) + selected_rule_used = r.get("rule_used") + selected_message = message + selected_violation_sample = r.get("violation_sample", []) or [] + selected_match_sample = r.get("match_sample", []) or [] + selected_regex_pattern = r.get("regex_pattern_used") + selected_allow_empty = bool(r.get("allow_empty", True)) + + if selected_mode == "Rule Builder": + cons_mode_used = "rule_builder" + cons_format_used = None + elif selected_mode == "Format preset": + cons_mode_used = "format" + cons_format_used = selected_rule_used + elif selected_mode == "Referential Integrity": + cons_mode_used = "referential_integrity" + cons_format_used = "referential_integrity" + else: + cons_mode_used = "expression" + cons_format_used = None + + detail_rule_payloads.append( + { + "label": label, + "rule_index": rule_index, + "selected_mode": selected_mode, + "selected_column": selected_column, + "selected_score": selected_score, + "selected_violation_count": selected_violation_count, + "selected_match_count": selected_match_count, + "selected_rule_used": selected_rule_used, + "selected_message": selected_message, + "selected_violation_sample": selected_violation_sample, + "selected_match_sample": selected_match_sample, + "selected_regex_pattern": selected_regex_pattern, + "selected_allow_empty": selected_allow_empty, + "cons_mode_used": cons_mode_used, + "cons_format_used": cons_format_used, + } + ) + + return { + "cons_checked_total": overall_cons_evaluated, + "cons_passed_total": int(overall_cons_passed), + "cons_violations_total": int(overall_cons_violations), + "cons_configured_rules_table": configured_rules_table, + "cons_detail_rule_options": detail_rule_options, + "cons_detail_rule_payloads": detail_rule_payloads, + "cons_detail_rule_count": len(cons_rule_results), + } + 
+ +# Helper for Timeliness detail payloads +def build_timeliness_detail_payload(df: pd.DataFrame, details: Dict[str, Any] | None) -> Dict[str, Any]: + """ + Build a detail-friendly payload for Timeliness. + + This helper is intended for backend use so that dashboard/frontend code does + not need to recompute latency series, descriptive stats, chart inputs, or + diagnostics on the frontend. + """ + details = details or {} + + reference_mode = details.get("t_reference_mode", "column") + reference_time = details.get("t_reference_time", "") + event_col = details.get("t_event") + system_col = details.get("t_system") + + if reference_mode == "fixed_time": + t_event = pd.to_datetime(df[event_col], errors="coerce") if event_col in df.columns else pd.Series(pd.NaT, index=df.index) + ref_ts = pd.to_datetime(str(reference_time).strip(), errors="coerce") if str(reference_time or "").strip() else pd.NaT + + if pd.isna(ref_ts): + return { + "time_detail_status": "n/a", + "time_detail_reason": "Reference Time could not be parsed as a valid datetime.", + } + + if hasattr(t_event.dt, 'tz') and t_event.dt.tz is not None: + t_event = t_event.dt.tz_localize(None) + if getattr(ref_ts, 'tzinfo', None) is not None: + ref_ts = ref_ts.tz_localize(None) + + t_system = pd.Series(ref_ts, index=df.index) + lat = (t_system - t_event).dt.total_seconds() + meta = { + 'auto_swapped': False, + 'event_time_nat': int(t_event.isna().sum()), + 'system_time_nat': 0, + 'valid_pairs_after_parse': int(lat.dropna().shape[0]), + 'negative_latency_pairs': int((lat.dropna() < 0).sum()), + } + else: + from etsi_dq.metrics.timeliness import compute_timeliness_latency + + lat, t_event, t_system, meta = compute_timeliness_latency( + df, + event_col=event_col, + system_col=system_col, + ) + + if bool(details.get('t_auto_swapped', False)) and not bool(meta.get('auto_swapped', False)): + lat = (t_event - t_system).dt.total_seconds() + + lat_clean = lat.dropna() + lat_clean_nonneg = lat_clean[lat_clean >= 0] + + 
avg_lat = float(details.get('t_avg_latency')) if details.get('t_avg_latency') is not None else (float(lat_clean_nonneg.mean()) if not lat_clean_nonneg.empty else 0.0) + p95_lat = float(details.get('t_p95_latency')) if details.get('t_p95_latency') is not None else (float(lat_clean_nonneg.quantile(0.95)) if not lat_clean_nonneg.empty else 0.0) + max_lat = float(details.get('t_max_latency')) if details.get('t_max_latency') is not None else (float(lat_clean_nonneg.max()) if not lat_clean_nonneg.empty else 0.0) + min_lat = float(details.get('t_min_latency')) if details.get('t_min_latency') is not None else (float(lat_clean_nonneg.min()) if not lat_clean_nonneg.empty else 0.0) + sla = float(details.get('t_sla_seconds', 0.0) or 0.0) + valid_pairs = int(details.get('t_pairs_valid', int(lat.dropna().shape[0]))) + negative_pairs = int(details.get('t_negative_latency', int((lat.dropna() < 0).sum()))) + + latency_hist_values = [float(v) for v in lat_clean_nonneg.tolist()] + + lat_df = pd.DataFrame({ + 'Event_Time': t_event, + 'Latency': lat, + }).dropna(subset=['Event_Time']).sort_values('Event_Time') + latency_trend_records = [] + if not lat_df.empty: + tmp = lat_df.copy() + tmp['Event_Time'] = tmp['Event_Time'].astype(str) + latency_trend_records = tmp.to_dict(orient='records') + + diagnostics = { + "rows": int(details.get('t_total_rows', len(df))), + "reference_mode": reference_mode, + "event_time_col": event_col, + "system_time_col": system_col, + "reference_time": reference_time, + "event_time_NaT": int(details.get('t_event_nat', int(t_event.isna().sum()))), + "system_time_NaT": int(details.get('t_system_nat', int(getattr(t_system, 'isna', lambda: pd.Series([], dtype=bool))().sum()) if hasattr(t_system, 'isna') else 0)), + "valid_pairs_after_parse": int(details.get('t_pairs_valid', int(lat.dropna().shape[0]))), + "negative_latency_pairs": int(details.get('t_negative_latency', int((lat.dropna() < 0).sum()))), + } + + describe_dict = {} + if not lat_clean_nonneg.empty: + 
desc = lat_clean_nonneg.describe(percentiles=[.5, .75, .9, .95, .99]) + describe_dict = {str(k): float(v) for k, v in desc.to_dict().items()} + + mode_label = "System Time Column" if reference_mode == 'column' else "User-defined Reference Time" + basis_label = system_col if reference_mode == 'column' else (reference_time or 'N/A') + basis_desc = "System Time Column" if reference_mode == 'column' else "Reference Time" + + return { + "time_detail_status": "ok", + "time_reference_mode_label": mode_label, + "time_basis_label": basis_label if basis_label not in (None, '') else 'N/A', + "time_basis_desc": basis_desc, + "time_avg_latency": avg_lat, + "time_p95_latency": p95_lat, + "time_max_latency": max_lat, + "time_min_latency": min_lat, + "time_sla_seconds": sla, + "time_valid_pairs": valid_pairs, + "time_negative_pairs": negative_pairs, + "time_latency_hist_values": latency_hist_values, + "time_latency_trend_records": latency_trend_records, + "time_diagnostics": diagnostics, + "time_latency_describe": describe_dict, + } \ No newline at end of file diff --git a/etsi_dq/helpers/summary_helpers.py b/etsi_dq/helpers/summary_helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..653129eb9d1beee53046af0f38ec12dd1daa9296 --- /dev/null +++ b/etsi_dq/helpers/summary_helpers.py @@ -0,0 +1,357 @@ +from __future__ import annotations + +from typing import Any, Dict, List + +import pandas as pd + +from etsi_dq.utils import _build_missing_mask + + +def build_consistency_summary_payload(cons_rule_results: List[Dict[str, Any]] | None) -> Dict[str, Any]: + """ + Build a summary-friendly payload for Consistency from raw rule results. + + This helper is intended for pipeline/backend use so that dashboard summary + rendering does not need to recompute totals, configured columns/modes, or + table/profile data on the frontend. 
+ """ + cons_rule_results = cons_rule_results or [] + + evaluated_rules = [r for r in cons_rule_results if r.get("score") is not None] + + passed_total = sum(int(r.get("match_count", 0) or 0) for r in evaluated_rules) + violations_total = sum(int(r.get("violation_count", 0) or 0) for r in evaluated_rules) + evaluated_total = passed_total + violations_total + + columns_used: List[str] = [] + modes_used: List[str] = [] + rules_applied: List[str] = [] + summary_table: List[Dict[str, Any]] = [] + + for r in cons_rule_results: + rule_index = r.get("rule_index") + mode = str(r.get("mode", "N/A") or "N/A") + column = r.get("column") + rule_used = r.get("rule_used") + message = r.get("message") + score = r.get("score") + match_count = int(r.get("match_count", 0) or 0) + violation_count = int(r.get("violation_count", 0) or 0) + + if column not in (None, "(None)", "N/A"): + col_str = str(column) + if col_str not in columns_used: + columns_used.append(col_str) + else: + col_str = "N/A" + + if mode not in modes_used: + modes_used.append(mode) + + rule_desc = f"{col_str} → {mode}" + if rule_used: + rule_desc += f": {rule_used}" + elif message: + rule_desc += f" ({message})" + rules_applied.append(rule_desc) + + summary_table.append( + { + "Rule #": rule_index, + "Mode": mode, + "Column": col_str, + "Score": f"{float(score):.2f}%" if score is not None else "N/A", + "Passed": match_count, + "Violations": violation_count, + "Rule / Preset": rule_used, + "Status": message, + } + ) + + return { + "cons_rule_count": len(cons_rule_results), + "cons_passed_total": passed_total, + "cons_violations_total": violations_total, + "cons_evaluated_total": evaluated_total, + "cons_columns_used": columns_used, + "cons_modes_used": modes_used, + "cons_rules_applied": rules_applied, + "cons_summary_table": summary_table, + } + + +def build_reliability_summary_payload(rel_results: List[Dict[str, Any]] | None, rel_selected_columns: List[str] | None) -> Dict[str, Any]: + """ + Build a 
summary-friendly payload for Reliability from per-column reliability results. + + This helper is intended for pipeline/backend use so that dashboard summary + rendering does not need to recompute valid-column filtering, representative + profile data, or summary table data on the frontend. + """ + rel_results = rel_results or [] + rel_selected_columns = rel_selected_columns or [] + + valid_rel_results = [ + r for r in rel_results + if r.get("status") == "ok" and r.get("cv") is not None + ] + + valid_column_count = len(valid_rel_results) + + cv_values = [float(r.get("cv")) for r in valid_rel_results if r.get("cv") is not None] + overall_cv = sum(cv_values) / len(cv_values) if cv_values else None + + representative = valid_rel_results[0] if valid_rel_results else None + representative_column = representative.get("column") if representative else None + representative_cv = float(representative.get("cv")) if representative and representative.get("cv") is not None else None + representative_level = representative.get("level") if representative else None + representative_warning = representative.get("warning") if representative else None + + summary_table: List[Dict[str, Any]] = [] + for r in valid_rel_results: + summary_table.append( + { + "Column": r.get("column"), + "Valid Values": int(r.get("valid_count", 0) or 0), + "Mean": f"{float(r.get('mean')):.4f}" if r.get("mean") is not None else "N/A", + "Std Dev": f"{float(r.get('std')):.4f}" if r.get("std") is not None else "N/A", + "CV": f"{float(r.get('cv')):.2f}%" if r.get("cv") is not None else "N/A", + "Reliability Level": r.get("level", "N/A"), + "Warning": r.get("warning"), + "High Threshold": f"≤ {float(r.get('high_max')):.2f}%" if r.get("high_max") is not None else "N/A", + "Moderate Threshold": f"≤ {float(r.get('moderate_max')):.2f}%" if r.get("moderate_max") is not None else "N/A", + } + ) + + return { + "rel_selected_columns": rel_selected_columns, + "rel_valid_column_count": valid_column_count, + 
"rel_overall_cv": overall_cv, + "rel_representative_column": representative_column, + "rel_representative_cv": representative_cv, + "rel_representative_level": representative_level, + "rel_representative_warning": representative_warning, + "rel_summary_table": summary_table, + } + + +def build_uniqueness_summary_payload( + total_rows: int, + duplicate_rows: int, + key_cols: List[str] | None, +) -> Dict[str, Any]: + """ + Build a summary-friendly payload for Uniqueness. + + This helper is intended for pipeline/backend use so that dashboard summary + rendering does not need to recompute unique-row counts, duplicate rates, or + profile/basis labels on the frontend. + """ + total_rows = int(total_rows or 0) + duplicate_rows = int(duplicate_rows or 0) + key_cols = key_cols or [] + + unique_rows = max(total_rows - duplicate_rows, 0) + duplicate_rate = (duplicate_rows / total_rows) * 100.0 if total_rows > 0 else 0.0 + + if key_cols: + basis = "selected_key_columns" + profile_mode = "Current Dataset Only" + profile_basis = "Selected key columns" + else: + basis = "all_columns" + profile_mode = "Current Dataset Only" + profile_basis = "Full-row duplicate check across all columns" + + return { + "uniq_total_rows": total_rows, + "uniq_duplicate_rows": duplicate_rows, + "uniq_unique_rows": unique_rows, + "uniq_duplicate_rate": duplicate_rate, + "uniq_key_cols": key_cols, + "uniq_basis": basis, + "uniq_profile_mode": profile_mode, + "uniq_profile_basis": profile_basis, + } + + +def build_accuracy_summary_payload(acc_rule_results: List[Dict[str, Any]] | None) -> Dict[str, Any]: + """ + Build a summary-friendly payload for Accuracy from raw rule results. + + This helper is intended for pipeline/backend use so that dashboard summary + rendering does not need to recompute totals, configured columns/methods, + profile descriptions, or summary table data on the frontend. 
+ """ + acc_rule_results = acc_rule_results or [] + + evaluated_rules = [r for r in acc_rule_results if r.get("score") is not None] + + checked_total = sum(int(r.get("checked", 0) or 0) for r in evaluated_rules) + accurate_total = sum(int(r.get("accurate", 0) or 0) for r in evaluated_rules) + inaccurate_total = sum(int(r.get("inaccurate", 0) or 0) for r in evaluated_rules) + coverage = 1.0 if checked_total > 0 else 0.0 + + methods_used: List[str] = [] + columns_used: List[str] = [] + rules_applied: List[str] = [] + summary_table: List[Dict[str, Any]] = [] + + for r in acc_rule_results: + rule_index = r.get("rule_index") + column = r.get("column") + method = str(r.get("method", "N/A") or "N/A") + score = r.get("score") + checked = int(r.get("checked", 0) or 0) + accurate = int(r.get("accurate", 0) or 0) + inaccurate = int(r.get("inaccurate", 0) or 0) + cov = r.get("coverage") + mae = r.get("mae") + rmse = r.get("rmse") + message = r.get("message") + meta = r.get("meta") or {} + + if column not in (None, "(None)", "N/A"): + col_str = str(column) + if col_str not in columns_used: + columns_used.append(col_str) + else: + col_str = "N/A" + + if method not in methods_used: + methods_used.append(method) + + if method == 'Threshold (Min/Max)': + if isinstance(meta, dict): + desc = ( + f"{col_str} → Threshold " + f"(min={meta.get('min', 'N/A')}, max={meta.get('max', 'N/A')}, inclusive={meta.get('inclusive', 'N/A')})" + ) + else: + desc = f"{col_str} → Threshold" + elif method == 'Statistical Bounds (k-sigma)': + if isinstance(meta, dict): + desc = f"{col_str} → Bounds (k={meta.get('k', 'N/A')}, center={meta.get('center_mode', 'N/A')})" + else: + desc = f"{col_str} → Statistical Bounds" + else: + if message: + desc = f"{col_str} → Reference Mapping ({message})" + else: + desc = f"{col_str} → Reference Mapping" + rules_applied.append(desc) + + summary_table.append( + { + "Rule #": rule_index, + "Measured Column": col_str, + "Method": method, + "Score": f"{float(score):.2f}%" 
if score is not None else "N/A", + "Checked": checked, + "Accurate / In Rule": accurate, + "Inaccurate / Out of Rule": inaccurate, + "Coverage": f"{float(cov) * 100:.2f}%" if cov is not None else "N/A", + "MAE": f"{float(mae):.4f}" if mae is not None else "N/A", + "RMSE": f"{float(rmse):.4f}" if rmse is not None else "N/A", + "Status": message, + } + ) + + if len(methods_used) == 1: + task_label = methods_used[0] + elif methods_used: + task_label = 'Mixed Accuracy Rules' + else: + task_label = 'N/A' + + return { + "acc_rule_count": len(acc_rule_results), + "acc_checked_total": checked_total, + "acc_accurate_total": accurate_total, + "acc_inaccurate_total": inaccurate_total, + "acc_coverage": coverage, + "acc_methods_used": methods_used, + "acc_columns_used": columns_used, + "acc_rules_applied": rules_applied, + "acc_summary_table": summary_table, + "acc_task_label": task_label, + } + + +def build_timeliness_summary_payload(details: Dict[str, Any] | None) -> Dict[str, Any]: + """ + Build a summary-friendly payload for Timeliness from raw timeliness detail values. + + This helper is intended for pipeline/backend use so that dashboard summary + rendering does not need to reinterpret reference mode/profile labels on the + frontend. 
+ """ + details = details or {} + + reference_mode = details.get("t_reference_mode", "column") + reference_time = details.get("t_reference_time", "") + event_col = details.get("t_event", "N/A") + system_col = details.get("t_system", "N/A") + p95_latency = details.get("t_p95_latency") + + if reference_mode == "fixed_time": + profile_mode = "User-defined Reference Time" + profile_reference_label = "Reference Time" + profile_reference_value = reference_time if str(reference_time).strip() else "N/A" + else: + profile_mode = "System Time Column" + profile_reference_label = "System Time" + profile_reference_value = system_col + + return { + "t_profile_mode": profile_mode, + "t_profile_reference_label": profile_reference_label, + "t_profile_reference_value": profile_reference_value, + "t_profile_event_col": event_col, + "t_profile_p95_latency": p95_latency, + } + + +def build_completeness_summary_payload( + df: pd.DataFrame, + used_cols: List[str] | None, + missing_tokens: List[str] | None, +) -> Dict[str, Any]: + """ + Build a summary-friendly payload for Completeness. + + This helper is intended for pipeline/backend use so that dashboard summary + rendering does not need to recompute completeness-by-column on the frontend. 
def _read_table(source, name, sheet_name=None):
    """Read one CSV/XLSX/XLS source into a DataFrame, dispatching on the extension of *name*."""
    lowered = name.lower()
    if lowered.endswith('.csv'):
        return pd.read_csv(source)
    # pandas returns a dict of ALL sheets for sheet_name=None; callers expect a
    # single DataFrame, so "not specified" means the first sheet.
    sheet = 0 if sheet_name is None else sheet_name
    # openpyxl only reads .xlsx; for legacy .xls let pandas choose its engine.
    engine = 'openpyxl' if lowered.endswith('.xlsx') else None
    return pd.read_excel(source, sheet_name=sheet, engine=engine)


def _load_from_zip(zip_bytes, sheet_name=None):
    """
    Load the first supported file (CSV/XLSX/XLS) found inside a ZIP archive.

    Directory entries and macOS archive metadata (``__MACOSX/`` folders and
    ``._*`` resource-fork files) are skipped, since they carry a data-file
    extension without being data.

    Args:
        zip_bytes: File-like object (or path) accepted by ``zipfile.ZipFile``.
        sheet_name: Excel sheet to read; ``None`` means the first sheet.

    Returns:
        The loaded DataFrame, or ``None`` when nothing could be loaded (the
        reason is printed).
    """
    try:
        with zipfile.ZipFile(zip_bytes) as zf:
            supported = [
                name for name in zf.namelist()
                if not name.endswith('/')
                and name.lower().endswith(('.csv', '.xlsx', '.xls'))
                and not name.startswith('__MACOSX/')
                and not name.rsplit('/', 1)[-1].startswith('._')
            ]

            if not supported:
                print("[에러] ZIP 내부에 지원 가능한 데이터 파일(CSV/XLSX/XLS)이 없습니다.")
                return None

            target_name = supported[0]
            file_bytes = zf.read(target_name)

        df = _read_table(io.BytesIO(file_bytes), target_name, sheet_name)

        print(f"[성공] ZIP 내부 파일 '{target_name}'을 불러왔습니다. {len(df)}행, {len(df.columns)}열")
        return df

    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"[에러] ZIP 파일을 읽을 수 없습니다: {e}")
        return None


def load_dataset(source, filename=None, sheet_name=None):
    """
    Load a dataset from a file path or an in-memory buffer (CSV/XLSX/XLS/ZIP).

    Args:
        source: File path (``str``) or a binary file-like object such as
            ``io.BytesIO``.
        filename: Required for file-like sources; only its extension is used to
            pick the reader (e.g. ``"data.csv"``).
        sheet_name: Excel sheet to read; ``None`` means the first sheet.

    Returns:
        The loaded DataFrame, or ``None`` when the file is missing, the format
        is unsupported, or reading fails (the reason is printed).
    """
    try:
        # Work out which name carries the extension.
        if isinstance(source, str):
            if not os.path.exists(source):
                print(f"[에러] 파일이 없습니다: {source}")
                return None
            ext = source.lower()
        else:
            # A buffer has no name of its own, so the caller must supply one.
            if filename is None:
                print("[에러] BytesIO 입력 시 filename이 필요합니다.")
                return None
            ext = filename.lower()

        if ext.endswith('.zip'):
            if isinstance(source, str):
                with open(source, 'rb') as f:
                    zip_bytes = io.BytesIO(f.read())
            else:
                # The buffer may already have been read once (e.g. an upload).
                if hasattr(source, 'seek'):
                    source.seek(0)
                zip_bytes = io.BytesIO(source.read())
            return _load_from_zip(zip_bytes, sheet_name=sheet_name)

        if not ext.endswith(('.csv', '.xlsx', '.xls')):
            print("[에러] 지원하지 않는 파일 형식입니다.")
            return None

        df = _read_table(source, ext, sheet_name)

        print(f"[성공] 데이터를 불러왔습니다. {len(df)}행, {len(df.columns)}열")
        return df

    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"[에러] 파일을 읽을 수 없습니다: {e}")
        return None
def _to_rule_dict(result: AccuracyRuleResult) -> dict[str, Any]:
    """Convert a per-rule result into a plain dict with one key per dataclass field."""
    return {
        "rule_index": result.rule_index,
        "column": result.column,
        "method": result.method,
        "score": result.score,
        "checked": result.checked,
        "accurate": result.accurate,
        "inaccurate": result.inaccurate,
        "coverage": result.coverage,
        "mae": result.mae,
        "rmse": result.rmse,
        "meta": result.meta,
        "message": result.message,
        "matched_rows": result.matched_rows,
    }


# Values that mean "no column selected" in a rule/config dict.
_UNSET_COLUMN = (None, "(None)")


def _float_or_default(value: Any, default: float) -> float:
    """Return ``float(value)``, using *default* only when the value is absent or blank.

    Unlike ``float(value or default)`` this keeps a legitimate ``0`` instead of
    replacing it with a non-zero default.
    """
    if value is None:
        return default
    if isinstance(value, str) and not value.strip():
        return default
    return float(value)


def _numeric_without_sentinels(series: pd.Series) -> pd.Series:
    """Coerce *series* to numeric and blank out the configured sentinel values."""
    numeric = pd.to_numeric(series, errors="coerce")
    try:
        return numeric.replace(list(SENTINEL_MISSING_VALUES), np.nan)
    except (TypeError, ValueError):
        # Best effort: if the sentinel list cannot be applied to this dtype,
        # evaluate the plain numeric values instead of failing the rule.
        return numeric


def _threshold_pass_count(
    values: pd.Series,
    rule: dict[str, Any],
    config: dict[str, Any],
) -> tuple[int, dict[str, Any]]:
    """Count values inside the configured [min, max] range and describe the check."""
    vmin = _float_or_default(rule.get("thr_min", config.get("acc_thr_min", 0.0)), 0.0)
    # An explicit upper bound of 0 must be honoured, not replaced by 1.0.
    vmax = _float_or_default(rule.get("thr_max", config.get("acc_thr_max", 1.0)), 1.0)
    inclusive = bool(rule.get("thr_inclusive", config.get("acc_thr_inclusive", True)))

    if inclusive:
        in_range = (values >= vmin) & (values <= vmax)
    else:
        in_range = (values > vmin) & (values < vmax)

    meta = {"task": "threshold", "min": vmin, "max": vmax, "inclusive": inclusive}
    return int(in_range.sum()), meta


def _sigma_pass_count(
    values: pd.Series,
    rule: dict[str, Any],
    config: dict[str, Any],
) -> tuple[int, dict[str, Any]]:
    """Count values within ``center ± k·std`` and describe the bounds used."""
    k = float(rule.get("sigma_k", config.get("acc_sigma_k", 3.0)) or 3.0)
    center_mode = str(rule.get("sigma_center", config.get("acc_sigma_center", "mean")))
    sample = values.astype(float)

    center = float(sample.median()) if center_mode == "median" else float(sample.mean())
    std = float(sample.std(ddof=0))

    if std == 0.0:
        # Constant column: every value sits on the centre, so all pass.
        low = high = center
        ok_cnt = int(len(sample))
    else:
        low = center - k * std
        high = center + k * std
        ok_cnt = int(((sample >= low) & (sample <= high)).sum())

    meta = {
        "task": "bounds",
        "k": k,
        "center_mode": center_mode,
        "center": center,
        "std": std,
        "low": low,
        "high": high,
    }
    return ok_cnt, meta


def _record_reference_scores(
    result: AccuracyRuleResult,
    frame: pd.DataFrame,
    measured_col: str,
    true_col: str,
    mode: str,
    tol_type: str,
    tol_value: float,
    total_rows: int,
) -> None:
    """Run ``compute_accuracy`` on *frame* and copy its outputs onto *result*."""
    score, mae, rmse, checked, acc_cnt, inacc_cnt, calib_params = compute_accuracy(
        frame,
        measured_col=measured_col,
        true_col=true_col,
        mode=mode,
        tolerance_type=tol_type,
        tolerance_value=tol_value,
    )
    result.score = score
    result.checked = checked
    result.accurate = acc_cnt
    result.inaccurate = inacc_cnt
    result.coverage = (checked / total_rows) if total_rows else 0.0
    result.mae = mae
    result.rmse = rmse
    result.meta = (
        calib_params
        if isinstance(calib_params, dict)
        else {"task": "regression", "calib_params": calib_params}
    )


def evaluate_accuracy_rule(
    df: pd.DataFrame,
    rule_index: int,
    rule: dict[str, Any],
    config: Optional[dict[str, Any]] = None,
    ref_df: Optional[pd.DataFrame] = None,
) -> AccuracyRuleResult:
    """Evaluate a single Accuracy rule.

    Three methods are supported, selected by ``rule["method"]``:

    * ``"Threshold (Min/Max)"`` - share of numeric values inside a fixed range.
    * ``"Statistical Bounds (k-sigma)"`` - share of values within ``center ± k·std``.
    * anything else ("Reference Mapping") - comparison against a reference,
      either another column of *df* or a value column of *ref_df* joined on keys.

    Args:
        df: Measured dataset.
        rule_index: Position of the rule, echoed into the result.
        rule: Rule settings; keys missing here fall back to *config* defaults.
        config: Optional dashboard-level defaults (``acc_*`` keys).
        ref_df: External reference dataset for key-based reference mapping.

    Returns:
        An ``AccuracyRuleResult``. When the rule cannot be evaluated, ``score``
        stays ``None`` and ``message`` explains why.
    """
    config = config or {}

    acc_method = rule.get("method", "Reference Mapping")
    meas_col = rule.get("column")
    ref_mode = rule.get("ref_mode", "Column in Dataset")
    reference_col = rule.get("reference_column", "(None)")
    ref_value_col = rule.get("reference_value_col", "(None)")
    meas_keys = list(rule.get("meas_keys") or [])
    ref_keys = list(rule.get("ref_keys") or [])
    acc_mode = rule.get("mode", config.get("acc_mode", "Direct Compare"))
    tol_type = rule.get("tol_type", config.get("acc_tol_type", "absolute"))
    tol_value = float(rule.get("tol_value", config.get("acc_tol_value", 0.0)) or 0.0)
    total_rows = len(df)

    result = AccuracyRuleResult(
        rule_index=rule_index,
        column=meas_col,
        method=acc_method,
        score=None,
        checked=0,
        accurate=0,
        inaccurate=0,
        coverage=0.0,
        mae=None,
        rmse=None,
        meta=None,
        message=None,
        matched_rows=0,
    )

    # --- Single-column range checks -------------------------------------
    if acc_method in ("Threshold (Min/Max)", "Statistical Bounds (k-sigma)"):
        if meas_col in _UNSET_COLUMN:
            result.message = "Measured column is required."
            return result
        if meas_col not in df.columns:
            result.message = f"Measured column '{meas_col}' was not found in the dataset."
            return result

        numeric = _numeric_without_sentinels(df[meas_col])
        values = numeric[numeric.notna()]
        checked = int(len(values))
        if checked == 0:
            result.message = "No valid numeric values found in the selected column."
            return result

        if acc_method == "Threshold (Min/Max)":
            ok_cnt, meta = _threshold_pass_count(values, rule, config)
        else:
            ok_cnt, meta = _sigma_pass_count(values, rule, config)

        result.score = float((ok_cnt / checked) * 100.0)
        result.checked = checked
        result.accurate = ok_cnt
        result.inaccurate = int(checked - ok_cnt)
        result.coverage = (checked / total_rows) if total_rows else 0.0
        result.meta = meta
        return result

    # --- Reference mapping against another column of the same dataset ----
    if ref_mode == "Column in Dataset":
        if meas_col in _UNSET_COLUMN or reference_col in _UNSET_COLUMN:
            result.message = "Measured and reference columns are required."
            return result
        if meas_col == reference_col:
            result.message = "Measured and reference columns must be different."
            return result
        absent = [c for c in (meas_col, reference_col) if c not in df.columns]
        if absent:
            result.message = f"Columns not found in the dataset: {absent}"
            return result
        if acc_mode == "Direct Compare" and _likely_unit_mismatch(meas_col, reference_col):
            result.message = "Measured and reference columns appear to use different units/scales."
            return result

        _record_reference_scores(
            result, df, meas_col, reference_col, acc_mode, tol_type, tol_value, total_rows
        )
        return result

    # --- Reference mapping against an external dataset (key-based join) --
    if ref_df is None:
        result.message = "External reference file is required."
        return result
    if meas_col in _UNSET_COLUMN:
        result.message = "Measured column is required."
        return result
    if ref_value_col in _UNSET_COLUMN:
        result.message = "Reference value column is required."
        return result
    if not meas_keys or not ref_keys or len(meas_keys) != len(ref_keys):
        result.message = "Matching key columns are required in both datasets."
        return result

    missing_meas = [k for k in meas_keys if k not in df.columns]
    missing_ref = [k for k in ref_keys if k not in ref_df.columns]
    if missing_meas or missing_ref:
        result.message = f"Missing key columns. measured missing={missing_meas}, reference missing={missing_ref}"
        return result
    if meas_col not in df.columns:
        result.message = f"Measured column '{meas_col}' was not found in the dataset."
        return result
    if ref_value_col not in ref_df.columns:
        result.message = f"Reference value column '{ref_value_col}' was not found in the reference dataset."
        return result

    # Give both value columns their final names BEFORE merging. Renaming after
    # the merge fails when the two datasets use the same value-column name,
    # because pandas then suffixes them (``_x`` / ``_y``).
    meas_join = df[meas_keys].copy()
    meas_join["__acc_measured"] = df[meas_col].to_numpy()
    ref_join = ref_df[ref_keys].rename(columns=dict(zip(ref_keys, meas_keys))).copy()
    ref_join["__acc_true"] = ref_df[ref_value_col].to_numpy()

    try:
        merged = meas_join.merge(ref_join, on=meas_keys, how="inner")
    except ValueError as exc:
        # pandas refuses to join keys of incompatible dtypes (e.g. int vs str).
        result.message = f"Key columns could not be joined: {exc}"
        return result

    if merged.empty:
        result.message = "No matched pairs found after key-based join."
        return result

    _record_reference_scores(
        result, merged, "__acc_measured", "__acc_true", acc_mode, tol_type, tol_value, total_rows
    )
    result.matched_rows = int(len(merged))
    return result
total_accurate = 0 + last_meta = None + last_mae = None + last_rmse = None + last_checked = 0 + last_acc_cnt = 0 + last_inacc_cnt = 0 + last_coverage = 0.0 + last_calib_params = None + last_matched_rows = 0 + last_target = config.get("target") + last_true = config.get("true") + last_ref_mode = config.get("acc_ref_mode", "Column in Dataset") + last_ref_value_col = config.get("acc_ref_value_col", "(None)") + last_meas_keys = config.get("acc_meas_keys") or [] + last_ref_keys = config.get("acc_ref_keys") or [] + + for idx, rule in enumerate(rules, start=1): + rr = evaluate_accuracy_rule(df, idx, rule, config=config, ref_df=ref_df) + rule_results.append(rr) + + if rr.score is not None: + total_checked += int(rr.checked or 0) + total_accurate += int(rr.accurate or 0) + last_meta = rr.meta + last_mae = rr.mae + last_rmse = rr.rmse + last_checked = int(rr.checked or 0) + last_acc_cnt = int(rr.accurate or 0) + last_inacc_cnt = int(rr.inaccurate or 0) + last_coverage = float(rr.coverage or 0.0) + last_target = rule.get("column") + last_ref_mode = rule.get("ref_mode", "Column in Dataset") + last_calib_params = rr.meta + if last_ref_mode == "Column in Dataset": + last_true = rule.get("reference_column", last_true) + else: + last_matched_rows = int(rr.matched_rows or 0) + last_ref_value_col = rule.get("reference_value_col", last_ref_value_col) + last_meas_keys = rule.get("meas_keys") or [] + last_ref_keys = rule.get("ref_keys") or [] + + overall_score = float((total_accurate / total_checked) * 100.0) if total_checked > 0 else None + first_reason = next((r.message for r in rule_results if r.message), None) + + return AccuracyAggregateResult( + score_percent=overall_score, + rule_results=[_to_rule_dict(r) for r in rule_results], + total_checked=int(total_checked), + total_accurate=int(total_accurate), + total_inaccurate=int(total_checked - total_accurate), + last_mae=last_mae, + last_rmse=last_rmse, + last_coverage=last_coverage, + last_meta=last_meta if isinstance(last_meta, 
@dataclass
class CompletenessResult:
    """Completeness scores at dataset, row and column granularity."""

    score_percent: float  # main score, chosen by ``granularity_used``
    dataset_score: float  # non-missing cells / all cells, in percent
    row_score: float  # rows without any missing cell / all rows, in percent
    column_avg_score: float  # mean of the per-column scores
    missing_cells: int
    complete_rows: int
    incomplete_rows: int
    total_rows: int
    used_cols: list[str]  # columns that were actually evaluated
    missing_tokens: list[str]  # user-defined tokens treated as missing
    granularity_used: str
    col_scores: dict  # column name -> completeness percent
    na_reason: Optional[str] = None  # set when nothing could be evaluated


def check_completeness(
    df: pd.DataFrame,
    required_cols: Optional[list[str]] = None,
    missing_text: str = "",
    granularity: str = "Dataset",
) -> CompletenessResult:
    """Compute Completeness for *df*.

    Supports:
    - required column subset selection (columns absent from *df* are ignored)
    - user-defined missing-value tokens
    - granularity-based main score selection (Dataset / Row / Column),
      matched case-insensitively

    Args:
        df: Dataset to evaluate.
        required_cols: Columns to check; all columns when empty or ``None``.
        missing_text: Raw text handed to ``_parse_missing_tokens``.
        granularity: Which score becomes ``score_percent``; anything other
            than "Row" or "Column" selects the dataset score.

    Returns:
        A ``CompletenessResult``. When no cell could be evaluated the numeric
        scores keep their zero defaults and ``na_reason`` explains why.

    Raises:
        ValueError: If *df* is ``None``.
    """
    if df is None:
        raise ValueError("Dataset is None.")

    requested = list(required_cols or [])
    miss_tokens = _parse_missing_tokens(missing_text)
    gran = str(granularity or "Dataset")

    candidates = requested or df.columns.tolist()
    used_cols = [c for c in candidates if c in df.columns]

    mm = _build_missing_mask(df, used_cols, miss_tokens)
    has_cells = not mm.empty

    total_cells = int(mm.size) if has_cells else 0
    missing_cells = int(mm.sum().sum()) if has_cells else 0
    dataset_score = float((1 - (missing_cells / total_cells)) * 100) if total_cells else 0.0

    total_rows = int(len(df))
    incomplete_rows = int(mm.any(axis=1).sum()) if has_cells else 0
    complete_rows = int(total_rows - incomplete_rows)
    row_score = float((complete_rows / total_rows) * 100) if total_rows else 0.0

    col_scores = (1 - mm.mean(axis=0)) * 100 if has_cells else pd.Series(dtype=float)
    column_avg_score = float(col_scores.mean()) if len(col_scores) else 0.0

    # Accept "row", " Row ", "COLUMN", ... while still reporting the caller's text.
    gran_key = gran.strip().lower()
    if gran_key == "row":
        main_score = row_score
    elif gran_key == "column":
        main_score = column_avg_score
    else:
        main_score = dataset_score

    # Explain a score that is zero only because nothing was evaluated.
    na_reason: Optional[str] = None
    if total_cells == 0:
        if total_rows == 0:
            na_reason = "Dataset is empty."
        elif requested and not used_cols:
            na_reason = "None of the required columns exist in the dataset."
        else:
            na_reason = "No columns available to evaluate."

    return CompletenessResult(
        score_percent=float(main_score),
        dataset_score=float(dataset_score),
        row_score=float(row_score),
        column_avg_score=float(column_avg_score),
        missing_cells=int(missing_cells),
        complete_rows=int(complete_rows),
        incomplete_rows=int(incomplete_rows),
        total_rows=int(total_rows),
        used_cols=list(used_cols),
        missing_tokens=list(miss_tokens),
        granularity_used=gran,
        col_scores=col_scores.to_dict() if hasattr(col_scores, "to_dict") else {},
        na_reason=na_reason,
    )
b/etsi_dq/metrics/consistency.py @@ -0,0 +1,317 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any, Optional + +import pandas as pd + +from etsi_dq.utils import safe_consistency_violations + + +@dataclass +class ConsistencyRuleResult: + rule_index: int + mode: str + score: Optional[float] + column: Optional[str] + violation_count: int + match_count: int + rule_used: Optional[str] + message: Optional[str] = None + regex_pattern_used: Optional[str] = None + allow_empty: bool = True + allow_empty_note: Optional[str] = None + violation_sample: Optional[list[dict[str, Any]]] = None + match_sample: Optional[list[dict[str, Any]]] = None + + +@dataclass +class ConsistencyAggregateResult: + score_percent: Optional[float] + rule_results: list[dict[str, Any]] + total_evaluated: int + total_passed: int + total_violations: int + last_mode_used: Optional[str] + last_rule_used: Optional[str] + last_total_rows: int + last_violation_count: int + last_match_count: int + last_violation_sample: list[dict[str, Any]] + last_match_sample: list[dict[str, Any]] + last_regex_pattern: Optional[str] + last_format_used: Optional[str] + last_format_col: Optional[str] + last_allow_empty: bool + na_reason: Optional[str] = None + + +def _to_rule_dict(result: ConsistencyRuleResult) -> dict[str, Any]: + return { + "rule_index": result.rule_index, + "mode": result.mode, + "score": result.score, + "column": result.column, + "violation_count": result.violation_count, + "match_count": result.match_count, + "rule_used": result.rule_used, + "message": result.message, + "regex_pattern_used": result.regex_pattern_used, + "allow_empty": result.allow_empty, + "allow_empty_note": result.allow_empty_note, + "violation_sample": result.violation_sample or [], + "match_sample": result.match_sample or [], + } + + +def evaluate_consistency_rule( + df: pd.DataFrame, + rule_index: int, + rule_cfg: dict[str, Any], + ref_df: Optional[pd.DataFrame] = None, +) -> 
ConsistencyRuleResult: + """Evaluate a single Consistency rule using the current dashboard logic.""" + cons_mode = rule_cfg.get("mode", "Expression") + result = ConsistencyRuleResult( + rule_index=rule_index, + mode=cons_mode, + score=None, + column=rule_cfg.get("column"), + violation_count=0, + match_count=0, + rule_used=None, + message=None, + regex_pattern_used=None, + allow_empty=True, + allow_empty_note=None, + violation_sample=[], + match_sample=[], + ) + + if cons_mode == "Format preset": + col = rule_cfg.get("column", "(None)") + preset = rule_cfg.get("preset", "email") + allow_empty = bool(rule_cfg.get("allow_empty", True)) + result.allow_empty = allow_empty + result.allow_empty_note = ( + "enabled (empty/null → PASS)" if allow_empty else "disabled (empty/null → violation)" + ) + + if col in (None, "(None)"): + result.message = "Column for format validation is required." + return result + + s = df[col] + s_str = s.astype("string") + valid_mask = (s_str.notna() & (s_str.str.strip() != "")) if allow_empty else s_str.notna() + + if preset == "email": + pattern = r"^[^@\s]+@[^@\s]+\.[^@\s]+$" + elif preset == "date": + pattern = r"^\d{4}[-/]\d{2}[-/]\d{2}(?:[ T]\d{2}:\d{2}:\d{2})?$" + elif preset == "numeric_only": + pattern = r"^[+-]?\d+(\.\d+)?$" + elif preset == "letters_only": + pattern = r"^[A-Za-z]+$" + elif preset == "alphanumeric": + pattern = r"^[A-Za-z0-9]+$" + elif preset == "slash_format": + pattern = r"^[A-Za-z0-9/._-]+$" + elif preset == "custom_regex": + pattern = str(rule_cfg.get("custom_regex", "") or "") + else: + pattern = r".*" + + result.regex_pattern_used = pattern + + if preset == "custom_regex" and pattern.strip() == "": + result.message = "Custom regex pattern is required." 
+ return result + + pass_mask = pd.Series(False, index=df.index) + pass_mask.loc[valid_mask] = s_str.loc[valid_mask].str.match(pattern, na=False) + if allow_empty: + empty_mask = s_str.isna() | (s_str.str.strip() == "") + pass_mask = pass_mask | empty_mask + + violations = df.loc[~pass_mask].copy() + match_rows_df = df.loc[pass_mask].copy() + viol_count = int(len(violations)) + total_rows = int(len(df)) + score = float(((total_rows - viol_count) / total_rows) * 100) if total_rows else None + + result.score = score + result.column = col + result.violation_count = viol_count + result.match_count = int(pass_mask.sum()) + result.rule_used = ( + "email" if preset == "email" else + "date (date or datetime)" if preset == "date" else + "numeric_only (signed integers/decimals)" if preset == "numeric_only" else + "letters_only (A-Z, a-z only)" if preset == "letters_only" else + "alphanumeric (A-Z, a-z, 0-9)" if preset == "alphanumeric" else + "slash_format (A-Z, a-z, 0-9, / . _ -)" if preset == "slash_format" else + "custom_regex" + ) + result.violation_sample = violations.head(50).to_dict(orient="records") + result.match_sample = match_rows_df.head(50).to_dict(orient="records") + return result + + if cons_mode == "Referential Integrity": + src_col = rule_cfg.get("source_column", "(None)") + ref_col = rule_cfg.get("reference_column", "(None)") + allow_empty = bool(rule_cfg.get("ref_allow_empty", True)) + result.allow_empty = allow_empty + result.allow_empty_note = ( + "enabled (empty/null → PASS)" if allow_empty else "disabled (empty/null → violation)" + ) + + if src_col in (None, "(None)"): + result.message = "Source key column is required." + return result + if ref_df is None: + result.message = "Reference dataset is required." + return result + if ref_col in (None, "(None)") or ref_col not in ref_df.columns: + result.message = "Valid reference key column is required." 
+ return result + + src = df[src_col].astype("string") + ref_keys = ref_df[ref_col].astype("string") + ref_key_set = set(ref_keys.dropna().astype(str).str.strip().tolist()) + src_clean = src.astype("string").str.strip() + pass_mask = src_clean.isin(ref_key_set) + if allow_empty: + empty_mask = src.isna() | (src_clean == "") + pass_mask = pass_mask | empty_mask + + violations = df.loc[~pass_mask].copy() + match_rows_df = df.loc[pass_mask].copy() + viol_count = int(len(violations)) + total_rows = int(len(df)) + score = float(((total_rows - viol_count) / total_rows) * 100) if total_rows else None + + result.score = score + result.column = src_col + result.violation_count = viol_count + result.match_count = int(pass_mask.sum()) + result.rule_used = f"`{src_col}` exists in reference key `{ref_col}`" + result.violation_sample = violations.head(50).to_dict(orient="records") + result.match_sample = match_rows_df.head(50).to_dict(orient="records") + return result + + # Expression / Rule Builder both use rule text evaluation + rule = rule_cfg.get("rule", "") + v_df, err = safe_consistency_violations(df, rule) + if err: + result.message = str(err) + return result + + v_count = int(len(v_df)) + total_rows = int(len(df)) + score = float(((total_rows - v_count) / total_rows) * 100) if total_rows else None + try: + match_df = df.drop(v_df.index, errors="ignore") + except Exception: + match_df = df.copy() + + result.score = score + result.column = rule_cfg.get("column") + result.violation_count = v_count + result.match_count = int(total_rows - v_count) + result.rule_used = str(rule) + result.violation_sample = v_df.head(50).to_dict(orient="records") + result.match_sample = match_df.head(50).to_dict(orient="records") + return result + + +def check_consistency( + df: pd.DataFrame, + cons_rules: Optional[list[dict[str, Any]]] = None, + config: Optional[dict[str, Any]] = None, + ref_df: Optional[pd.DataFrame] = None, +) -> ConsistencyAggregateResult: + """Evaluate multi-rule 
Consistency and aggregate by micro-average.""" + config = config or {} + rules = cons_rules or [] + + if not rules: + rules = [{ + "mode": config.get("cons_mode", "Expression"), + "column": config.get("cons_format_col", "(None)"), + "preset": config.get("cons_format_preset", "email"), + "custom_regex": config.get("cons_custom_regex", ""), + "allow_empty": bool(config.get("cons_allow_empty", True)), + "source_column": config.get("cons_ref_src_col", "(None)"), + "reference_column": config.get("cons_ref_ref_col", "(None)"), + "ref_allow_empty": bool(config.get("cons_ref_allow_empty", True)), + "rule": config.get("rule", ""), + }] + + rule_results: list[ConsistencyRuleResult] = [] + total_evaluated = 0 + total_passed = 0 + last_mode_used = None + last_rule_used = None + last_total_rows = int(len(df)) + last_violation_count = 0 + last_match_count = 0 + last_violation_sample: list[dict[str, Any]] = [] + last_match_sample: list[dict[str, Any]] = [] + last_regex_pattern = None + last_format_used = None + last_format_col = None + last_allow_empty = True + + for idx, rule_cfg in enumerate(rules, start=1): + rr = evaluate_consistency_rule(df, idx, rule_cfg, ref_df=ref_df) + rule_results.append(rr) + + if rr.score is not None: + rule_passed = int(rr.match_count or 0) + rule_violations = int(rr.violation_count or 0) + rule_evaluated = rule_passed + rule_violations + total_passed += rule_passed + total_evaluated += rule_evaluated + + cons_mode = rule_cfg.get("mode", "Expression") + last_mode_used = ( + "rule_builder" if cons_mode == "Rule Builder" else + ("format" if cons_mode == "Format preset" else + ("referential_integrity" if cons_mode == "Referential Integrity" else "expression")) + ) + last_rule_used = rr.rule_used + last_total_rows = int(len(df)) + last_violation_count = rule_violations + last_match_count = rule_passed + last_violation_sample = rr.violation_sample or [] + last_match_sample = rr.match_sample or [] + last_regex_pattern = rr.regex_pattern_used + 
last_format_used = rule_cfg.get("preset") if cons_mode == "Format preset" else ( + "referential_integrity" if cons_mode == "Referential Integrity" else None + ) + last_format_col = rr.column + last_allow_empty = bool(rr.allow_empty) + + overall_score = float((total_passed / total_evaluated) * 100.0) if total_evaluated > 0 else None + first_reason = next((r.message for r in rule_results if r.message), None) + + return ConsistencyAggregateResult( + score_percent=overall_score, + rule_results=[_to_rule_dict(r) for r in rule_results], + total_evaluated=int(total_evaluated), + total_passed=int(total_passed), + total_violations=int(total_evaluated - total_passed), + last_mode_used=last_mode_used, + last_rule_used=last_rule_used, + last_total_rows=int(last_total_rows), + last_violation_count=int(last_violation_count), + last_match_count=int(last_match_count), + last_violation_sample=list(last_violation_sample), + last_match_sample=list(last_match_sample), + last_regex_pattern=last_regex_pattern, + last_format_used=last_format_used, + last_format_col=last_format_col, + last_allow_empty=bool(last_allow_empty), + na_reason=(first_reason or "No valid consistency rule could be evaluated.") if overall_score is None else None, + ) diff --git a/etsi_dq/metrics/reliability.py b/etsi_dq/metrics/reliability.py new file mode 100644 index 0000000000000000000000000000000000000000..01ec2a3daf03316a7d429f7fce18786b657f5769 --- /dev/null +++ b/etsi_dq/metrics/reliability.py @@ -0,0 +1,64 @@ +from __future__ import annotations +import numpy as np +import pandas as pd +from dataclasses import dataclass + +@dataclass +class ReliabilityResult: + cv_percent: float | None # CV(%). 
@dataclass
class ReliabilityResult:
    """Coefficient-of-variation summary of one numeric series."""

    cv_percent: float | None  # CV (%); lower means more stable (more reliable)
    mean: float | None
    std: float | None
    n_valid: int  # number of finite numeric values that were used


def check_reliability_cv(series: pd.Series) -> ReliabilityResult:
    """[Metric 2] Reliability (Temporal Stability) - CV-based.

    ETSI TR 104 180 v0.0.7 Clause 5.2 example:
        CV = (std / mean) * 100

    The magnitude of the mean is used as the denominator, so a series centred
    on a negative value gets the same (non-negative) CV as its mirror image
    instead of a negative one.

    Notes:
    - Coerces to numeric and drops NaN and ±inf.
    - Uses the sample standard deviation (ddof=1); with a single value std is 0.
    - If mean == 0, CV is undefined -> returns cv_percent=None.
    """
    values = pd.to_numeric(series, errors="coerce").dropna().astype(float)
    # Infinite readings would turn mean/std into inf/NaN; treat them as invalid.
    values = values[np.isfinite(values.to_numpy())]
    n = int(len(values))
    if n == 0:
        return ReliabilityResult(None, None, None, 0)

    mean = float(values.mean())
    std = float(values.std(ddof=1)) if n >= 2 else 0.0

    if mean == 0 or not np.isfinite(mean):
        return ReliabilityResult(None, mean, std, n)

    cv = float((std / abs(mean)) * 100.0)
    return ReliabilityResult(cv, mean, std, n)


def cv_to_score(cv_percent: float | None, thresholds=(1.0, 5.0, 10.0)) -> float | None:
    """Optional helper: convert CV(%) to a 0-100 score by thresholds.

    thresholds = (excellent, good, acceptable)
    Lower CV is better: 100 / 80 / 60 for the three bands, 40 beyond them.
    Returns None when the CV is None or NaN (nothing to grade).
    """
    if cv_percent is None or np.isnan(cv_percent):
        return None
    ex, good, acc = thresholds
    if cv_percent <= ex:
        return 100.0
    if cv_percent <= good:
        return 80.0
    if cv_percent <= acc:
        return 60.0
    return 40.0


def check_reliability(series: pd.Series) -> float | None:
    """Compatibility wrapper.

    Returns the coefficient of variation (CV%) as a float.
    Returns None if CV cannot be computed (no valid numeric values or mean == 0).
    """
    return check_reliability_cv(series).cv_percent
@dataclass
class TimelinessResult:
    """Timeliness score, latency statistics and parsing counters."""

    score_percent: Optional[float]  # share of usable latencies within the SLA
    avg_latency_seconds: Optional[float]
    min_latency_seconds: Optional[float]
    max_latency_seconds: Optional[float]
    p95_latency_seconds: Optional[float]
    total_rows: int
    valid_pairs: int  # rows where a latency could be computed
    negative_latency_pairs: int  # computed latencies below zero (excluded from the score)
    event_time_nat: int  # event timestamps that failed to parse
    system_time_nat: int  # system/reference timestamps that failed to parse
    auto_swapped: bool
    sla_seconds: float
    event_col: Optional[str]
    system_col: Optional[str]
    reference_mode: str = "column"
    reference_time: Optional[str] = None
    na_reason: Optional[str] = None  # why no score is available


_UNSET_COLUMN = (None, "(None)")


def _timeliness_na(reason: str, context: dict, **counters) -> TimelinessResult:
    """Build a result without score/latency statistics, carrying only *reason* and counters."""
    fields = {
        "total_rows": 0,
        "valid_pairs": 0,
        "negative_latency_pairs": 0,
        "event_time_nat": 0,
        "system_time_nat": 0,
        "auto_swapped": False,
    }
    fields.update(counters)
    return TimelinessResult(
        score_percent=None,
        avg_latency_seconds=None,
        min_latency_seconds=None,
        max_latency_seconds=None,
        p95_latency_seconds=None,
        na_reason=reason,
        **fields,
        **context,
    )


def _timeliness_from_latency(lat: pd.Series, context: dict, **counters) -> TimelinessResult:
    """Score a latency series (seconds) against the SLA stored in *context*."""
    usable = lat.dropna()
    usable = usable[usable >= 0]
    if usable.empty:
        return _timeliness_na(
            "No valid non-negative latencies after parsing timestamps.", context, **counters
        )

    sla = context["sla_seconds"]
    if sla <= 0:
        # Report the real cause instead of blaming the timestamps.
        return _timeliness_na("SLA threshold must be greater than 0 seconds.", context, **counters)

    return TimelinessResult(
        score_percent=float((usable <= sla).mean() * 100.0),
        avg_latency_seconds=float(usable.mean()),
        min_latency_seconds=float(usable.min()),
        max_latency_seconds=float(usable.max()),
        p95_latency_seconds=float(usable.quantile(0.95)),
        na_reason=None,
        **counters,
        **context,
    )


def check_timeliness(
    df: pd.DataFrame,
    event_col: str,
    system_col: Optional[str],
    sla_seconds: float,
    reference_mode: str = "column",
    reference_time: Optional[str] = None,
) -> TimelinessResult:
    """Compute Timeliness as the share of latencies within the SLA.

    Latency is ``reference - event`` in seconds, where the reference is either
    another column (``reference_mode="column"``, computed by
    ``compute_timeliness_latency``) or one fixed timestamp
    (``reference_mode="fixed_time"``). Only non-negative latencies are scored.

    Args:
        df: Dataset holding the timestamps.
        event_col: Column with the event time.
        system_col: Column with the system time (column mode only).
        sla_seconds: Maximum acceptable latency; must be greater than 0.
        reference_mode: ``"column"`` or ``"fixed_time"``.
        reference_time: Timestamp text used in fixed-time mode.

    Returns:
        A ``TimelinessResult``. Whenever no score can be produced,
        ``score_percent`` is ``None`` and ``na_reason`` states the cause.
    """
    context = {
        "sla_seconds": float(sla_seconds or 0.0),
        "event_col": event_col,
        "system_col": system_col,
        "reference_mode": reference_mode,
        "reference_time": reference_time,
    }

    if df is None or df.empty:
        return _timeliness_na("Dataset is empty.", context)

    total_rows = int(len(df))

    if event_col in _UNSET_COLUMN:
        return _timeliness_na("Event Time column is not selected.", context, total_rows=total_rows)
    if reference_mode == "column" and system_col in _UNSET_COLUMN:
        return _timeliness_na("System Time column is not selected.", context, total_rows=total_rows)
    if reference_mode == "column" and event_col == system_col:
        return _timeliness_na(
            "Event Time and System Time columns are the same.", context, total_rows=total_rows
        )
    if reference_mode not in ("column", "fixed_time"):
        return _timeliness_na("Invalid Timeliness reference mode.", context, total_rows=total_rows)
    if event_col not in df.columns:
        return _timeliness_na(
            f"Event Time column '{event_col}' was not found in the dataset.",
            context,
            total_rows=total_rows,
        )

    if reference_mode == "fixed_time":
        ref_text = str(reference_time).strip() if reference_time is not None else ""
        if not ref_text:
            return _timeliness_na("Reference Time is not provided.", context, total_rows=total_rows)

        t_event = pd.to_datetime(df[event_col], errors="coerce")
        ref_ts = pd.to_datetime(ref_text, errors="coerce")
        event_nat = int(t_event.isna().sum())

        if pd.isna(ref_ts):
            return _timeliness_na(
                "Reference Time could not be parsed as a valid datetime.",
                context,
                total_rows=total_rows,
                event_time_nat=event_nat,
                system_time_nat=total_rows,
            )

        # Align timezone awareness so the subtraction below is defined.
        if is_datetime64_any_dtype(t_event.dtype):
            event_tz = getattr(t_event.dt, "tz", None)
            if event_tz is not None and ref_ts.tzinfo is None:
                ref_ts = ref_ts.tz_localize(event_tz)
            elif event_tz is None and ref_ts.tzinfo is not None:
                ref_ts = ref_ts.tz_localize(None)

        lat = (pd.Series(ref_ts, index=df.index) - t_event).dt.total_seconds()
        computed = lat.dropna()
        return _timeliness_from_latency(
            lat,
            context,
            total_rows=total_rows,
            valid_pairs=int(computed.shape[0]),
            negative_latency_pairs=int((computed < 0).sum()),
            event_time_nat=event_nat,
            system_time_nat=0,
            auto_swapped=False,
        )

    if system_col not in df.columns:
        return _timeliness_na(
            f"System Time column '{system_col}' was not found in the dataset.",
            context,
            total_rows=total_rows,
        )

    lat, t_event, t_system, meta = compute_timeliness_latency(
        df,
        event_col=event_col,
        system_col=system_col,
    )
    computed = lat.dropna()

    # Prefer the helper's own counters; fall back to values derived here.
    return _timeliness_from_latency(
        lat,
        context,
        total_rows=total_rows,
        valid_pairs=int(meta.get("valid_pairs_after_parse", int(computed.shape[0]))),
        negative_latency_pairs=int(meta.get("negative_latency_pairs", int((computed < 0).sum()))),
        event_time_nat=int(meta.get("event_time_NaT", int(t_event.isna().sum()))),
        system_time_nat=int(meta.get("system_time_NaT", int(t_system.isna().sum()))),
        auto_swapped=bool(meta.get("auto_swapped", False)),
    )
@dataclass
class UniquenessResult:
    # Share of non-duplicate rows on a 0-100 scale (higher means fewer duplicates).
    uniqueness_percent: float
    # Number of rows flagged as repeats of an earlier row.
    duplicate_rows: int
    # Number of rows in the evaluated DataFrame.
    total_rows: int
    # Columns used as the duplicate key; empty when whole rows were compared.
    key_cols: list[str]


def check_uniqueness(df: pd.DataFrame, key_cols: list[str] | None = None) -> UniquenessResult:
    """Score how free of duplicate records *df* is.

    Follows ETSI TR 104 180 v0.0.7 Clause 5.8:
        Duplicate Record Rate = (N - N_unique) / N
        Uniqueness Score      = (1 - duplicate_rate) * 100

    When *key_cols* is empty or ``None`` whole rows are compared; otherwise
    only the combination of the listed columns is compared. The first
    occurrence of each record is kept and later repeats count as duplicates.
    A DataFrame without rows yields a score of 0.0 with zero counts.
    """
    used_keys = key_cols or []
    row_count = int(len(df))
    if row_count == 0:
        return UniquenessResult(0.0, 0, 0, used_keys)

    # subset=None makes pandas compare every column, i.e. full-row duplicates.
    subset = key_cols if key_cols else None
    repeated = int(df.duplicated(subset=subset, keep="first").sum())
    duplicate_rate = repeated / row_count
    return UniquenessResult(float((1 - duplicate_rate) * 100.0), repeated, row_count, used_keys)
def run_completeness(df, config: Dict[str, Any]) -> Dict[str, Any]:
    """Run the Completeness metric and package the outcome for the pipeline.

    Reads ``comp_required_cols``, ``comp_missing_text`` and
    ``comp_granularity`` from *config*, delegates the measurement to
    ``check_completeness`` and merges the summary/detail helper payloads
    into the returned details.

    Returns a dict with ``score``, ``details`` and ``message`` keys. Any
    exception raised along the way is turned into an N/A result (``score``
    is ``None``) whose reason and message carry the exception text.
    """
    try:
        outcome = check_completeness(
            df,
            required_cols=config.get('comp_required_cols') or [],
            missing_text=config.get('comp_missing_text', ''),
            granularity=str(config.get('comp_granularity', 'Dataset')),
        )
        # Every helper call gets its own list copies of the columns/tokens.
        summary_fields = build_completeness_summary_payload(
            df,
            list(outcome.used_cols),
            list(outcome.missing_tokens),
        )
        detail_fields = build_completeness_detail_payload(
            df,
            list(outcome.used_cols),
            list(outcome.missing_tokens),
        )
        score = float(outcome.score_percent)
        details = {
            "comp_status": "ok",
            "comp_na_reason": outcome.na_reason,
            "comp_used_cols": list(outcome.used_cols),
            "comp_missing_tokens": list(outcome.missing_tokens),
            **summary_fields,
            **detail_fields,
            "comp_granularity_used": outcome.granularity_used,
            "comp_dataset_score": float(outcome.dataset_score),
            "comp_row_score": float(outcome.row_score),
            "comp_missing_cells": int(outcome.missing_cells),
            "comp_complete_rows": int(outcome.complete_rows),
            "comp_incomplete_rows": int(outcome.incomplete_rows),
            "comp_col_scores": dict(outcome.col_scores or {}),
        }
    except Exception as exc:
        return {
            "score": None,
            "details": {
                "comp_status": "n/a",
                "comp_na_reason": str(exc),
            },
            "message": f"Completeness = N/A: {exc}",
        }

    return {"score": score, "details": details, "message": None}
Any]: + """Accuracy 실행.""" + try: + result = check_accuracy( + df, + acc_rules=config.get('acc_rules') or [], + config=config, + ref_df=config.get('acc_ref_df'), + ) + acc_rule_results = list(result.rule_results) + acc_summary_payload = build_accuracy_summary_payload(acc_rule_results) + acc_detail_payload = build_accuracy_detail_payload(df, acc_rule_results) + + if result.score_percent is None: + return { + "score": None, + "details": { + "acc_status": "n/a", + "acc_na_reason": result.na_reason or 'No valid accuracy rule could be evaluated.', + "acc_rule_results": acc_rule_results, + "acc_mae": result.last_mae, + "acc_rmse": result.last_rmse, + "acc_checked": int(result.total_checked), + "acc_count": int(result.total_accurate), + "inacc_count": int(result.total_inaccurate), + "acc_calib_params": result.last_calib_params, + "acc_coverage": float(result.last_coverage or 0.0), + "acc_meta": result.last_meta if isinstance(result.last_meta, dict) else {'task': 'unknown'}, + "acc_task_type": (result.last_meta.get('task', 'unknown') if isinstance(result.last_meta, dict) else 'unknown'), + "target": result.last_target, + "true": result.last_true, + "acc_ref_mode": result.last_ref_mode, + "acc_ref_value_col": result.last_ref_value_col, + "acc_meas_keys": list(result.last_meas_keys), + "acc_ref_keys": list(result.last_ref_keys), + "acc_matched_rows": int(result.last_matched_rows), + **acc_summary_payload, + **acc_detail_payload, + }, + "message": f"Accuracy = N/A: {result.na_reason or 'No valid accuracy rule could be evaluated.'}", + } + + return { + "score": float(result.score_percent), + "details": { + "acc_status": "ok", + "acc_na_reason": None, + "acc_rule_results": acc_rule_results, + "acc_mae": result.last_mae, + "acc_rmse": result.last_rmse, + "acc_checked": int(result.total_checked), + "acc_count": int(result.total_accurate), + "inacc_count": int(result.total_inaccurate), + "acc_calib_params": result.last_calib_params, + "acc_coverage": float(result.last_coverage or 
def run_consistency(df, config: Dict[str, Any]) -> Dict[str, Any]:
    """Run the Consistency metric and package the outcome for the pipeline.

    Passes ``cons_rules`` and ``cons_ref_df`` from *config* (plus *config*
    itself) to ``check_consistency`` and merges the summary/detail helper
    payloads into the returned details.

    Returns a dict with ``score``, ``details`` and ``message`` keys. When the
    check yields no score, the result is N/A and carries the reported reason
    (or a generic one). Any exception is likewise turned into an N/A result
    whose reason is the exception text.
    """
    try:
        outcome = check_consistency(
            df,
            cons_rules=config.get('cons_rules') or [],
            config=config,
            ref_df=config.get('cons_ref_df'),
        )
        rule_results = list(outcome.rule_results)
        summary_fields = build_consistency_summary_payload(rule_results)
        detail_fields = build_consistency_detail_payload(rule_results)

        if outcome.score_percent is None:
            score = None
            status = "n/a"
            reason = outcome.na_reason or 'No valid consistency rule could be evaluated.'
            message = f"Consistency = N/A: {reason}"
        else:
            score = float(outcome.score_percent)
            status = "ok"
            reason = None
            message = None

        # Both outcomes expose the same detail fields; only status/reason differ.
        details = {
            "cons_status": status,
            "cons_na_reason": reason,
            "cons_rule_results": rule_results,
            "cons_mode_used": outcome.last_mode_used,
            "cons_rule_used": outcome.last_rule_used,
            "cons_total_rows": int(outcome.last_total_rows),
            "cons_violation_count": int(outcome.last_violation_count),
            "cons_violation_sample": list(outcome.last_violation_sample),
            "cons_regex_pattern_used": outcome.last_regex_pattern,
            "cons_match_count": int(outcome.last_match_count),
            "cons_match_sample": list(outcome.last_match_sample),
            "cons_format_used": outcome.last_format_used,
            "cons_format_col_used": outcome.last_format_col,
            "cons_allow_empty_used": bool(outcome.last_allow_empty),
            **summary_fields,
            **detail_fields,
        }
    except Exception as exc:
        return {
            "score": None,
            "details": {
                "cons_status": "n/a",
                "cons_na_reason": str(exc),
            },
            "message": f"Consistency rule evaluation failed: {exc}",
        }

    return {"score": score, "details": details, "message": message}
'__ingested_at' (auto-added at upload) was used as System Time." + ) + + result = check_timeliness( + df, + event_col=local_config.get('t_event'), + system_col=local_config.get('t_system'), + sla_seconds=float(local_config.get('t_sla_seconds', 0.0) or 0.0), + reference_mode=reference_mode, + reference_time=reference_time, + ) + + if result.auto_swapped: + messages.append( + "Timeliness notice: the selected Event/System columns appear reversed (all latencies were negative). Direction was automatically swapped for latency calculation." + ) + + if result.score_percent is None: + time_details = { + "t_reference_mode": result.reference_mode, + "t_reference_time": result.reference_time, + "t_event": result.event_col, + "t_system": result.system_col, + "t_event_nat": int(result.event_time_nat), + "t_system_nat": int(result.system_time_nat), + "t_total_rows": int(result.total_rows), + "t_pairs_valid": int(result.valid_pairs), + "t_negative_latency": int(result.negative_latency_pairs), + "t_auto_swapped": bool(result.auto_swapped), + "t_sla_seconds": float(result.sla_seconds), + } + time_summary_payload = build_timeliness_summary_payload( + { + "t_reference_mode": result.reference_mode, + "t_reference_time": result.reference_time, + "t_event": result.event_col, + "t_system": result.system_col, + "t_p95_latency": None, + } + ) + time_detail_payload = build_timeliness_detail_payload(df, time_details) + return { + "score": None, + "details": { + **time_summary_payload, + **time_detail_payload, + "time_status": "n/a", + "time_na_reason": result.na_reason, + **time_details, + }, + "message": " ".join(messages + [f"Timeliness = N/A: {result.na_reason}"]).strip(), + } + + time_details = { + "t_event": result.event_col, + "t_system": result.system_col, + "t_reference_mode": result.reference_mode, + "t_reference_time": result.reference_time, + "t_event_nat": int(result.event_time_nat), + "t_system_nat": int(result.system_time_nat), + "t_total_rows": int(result.total_rows), + 
"t_pairs_valid": int(result.valid_pairs), + "t_negative_latency": int(result.negative_latency_pairs), + "t_auto_swapped": bool(result.auto_swapped), + "t_avg_latency": float(result.avg_latency_seconds), + "t_min_latency": float(result.min_latency_seconds), + "t_max_latency": float(result.max_latency_seconds), + "t_p95_latency": float(result.p95_latency_seconds), + "t_sla_seconds": float(result.sla_seconds), + } + time_summary_payload = build_timeliness_summary_payload( + { + "t_reference_mode": result.reference_mode, + "t_reference_time": result.reference_time, + "t_event": result.event_col, + "t_system": result.system_col, + "t_p95_latency": float(result.p95_latency_seconds), + } + ) + time_detail_payload = build_timeliness_detail_payload(df, time_details) + return { + "score": float(result.score_percent), + "details": { + **time_summary_payload, + **time_detail_payload, + "time_status": "ok", + "time_na_reason": None, + **time_details, + }, + "message": " ".join(messages).strip() or None, + } + except Exception as e: + return { + "score": None, + "details": { + "time_status": "n/a", + "time_na_reason": str(e), + }, + "message": f"Timeliness = N/A: {e}", + } + + +def run_reliability(df, config: Dict[str, Any]) -> Dict[str, Any]: + """Reliability 실행.""" + try: + rel_cols = config.get('reliability_cols') or [] + thresholds_map = config.get('reliability_thresholds') or {} + + # backward compatibility + if not rel_cols: + legacy_col = config.get('reliability_col', '(None)') + if legacy_col not in (None, '(None)'): + rel_cols = [legacy_col] + + if not rel_cols: + return { + "score": None, + "details": { + "rel_status": "n/a", + "rel_na_reason": "Numeric columns are not selected.", + "rel_results": [], + }, + "message": "Reliability = N/A: Please select at least one numeric column.", + } + + rel_results = [] + valid_cvs = [] + + for rel_col in rel_cols: + series = pd.to_numeric(df[rel_col], errors='coerce').dropna() + valid_count = int(series.shape[0]) + + mean_val = 
float(series.mean()) if valid_count else None + std_val = float(series.std(ddof=1)) if valid_count > 1 else 0.0 if valid_count == 1 else None + + cv = check_reliability(df[rel_col]) + cv_value = None if cv is None else float(cv) + + col_thresholds = thresholds_map.get(rel_col, {}) + high_max = float(col_thresholds.get("high_max", 5.0)) + moderate_max = float(col_thresholds.get("moderate_max", 15.0)) + + rel_level = None + rel_warning = None + rel_status = "ok" + rel_na_reason = None + + if cv_value is None: + rel_status = "n/a" + rel_na_reason = "Selected column is not numeric or mean is zero." + else: + valid_cvs.append(cv_value) + + if cv_value <= high_max: + rel_level = "High Reliability" + elif cv_value <= moderate_max: + rel_level = "Moderate Reliability" + else: + rel_level = "Low Reliability" + + if mean_val is not None and abs(mean_val) < 1e-6: + rel_warning = "CV may be unstable because the mean is close to zero." + + rel_results.append( + { + "column": rel_col, + "status": rel_status, + "na_reason": rel_na_reason, + "valid_count": valid_count, + "mean": mean_val, + "std": std_val, + "cv": cv_value, + "level": rel_level, + "warning": rel_warning, + "high_max": high_max, + "moderate_max": moderate_max, + } + ) + + if not valid_cvs: + rel_detail_payload = build_reliability_detail_payload(df, rel_results, rel_cols) + return { + "score": None, + "details": { + "rel_status": "n/a", + "rel_na_reason": "No valid numeric columns could be evaluated for Reliability.", + "rel_results": rel_results, + **rel_detail_payload, + }, + "message": "Reliability = N/A: No valid numeric columns could be evaluated.", + } + rel_summary_payload = build_reliability_summary_payload(rel_results, rel_cols) + rel_detail_payload = build_reliability_detail_payload(df, rel_results, rel_cols) + + overall_score = float(sum(valid_cvs) / len(valid_cvs)) + + warning_messages = [ + f"{item['column']}: {item['warning']}" + for item in rel_results + if item.get('warning') + ] + + return { + 
"score": overall_score, + "details": { + "rel_status": "ok", + "rel_na_reason": None, + "rel_results": rel_results, + **rel_summary_payload, + **rel_detail_payload, + }, + "message": " | ".join(warning_messages) if warning_messages else None, + } + except Exception as e: + return { + "score": None, + "details": { + "rel_status": "n/a", + "rel_na_reason": str(e), + "rel_results": [], + }, + "message": f"Reliability = N/A: {e}", + } + + +def run_uniqueness(df, config: Dict[str, Any]) -> Dict[str, Any]: + """Uniqueness 실행.""" + try: + uniq_cols = config.get('uniq_cols') or [] + result = check_uniqueness(df, key_cols=uniq_cols if uniq_cols else None) + + uniq_summary_payload = build_uniqueness_summary_payload( + total_rows=int(result.total_rows), + duplicate_rows=int(result.duplicate_rows), + key_cols=result.key_cols, + ) + + uniq_detail_payload = build_uniqueness_detail_payload( + df, + result.key_cols, + ) + + return { + "score": float(result.uniqueness_percent), + "details": { + "uniq_status": "ok", + "uniq_na_reason": None, + "uniq_mode": "current_dataset", + "uniq_score": float(result.uniqueness_percent), + **uniq_summary_payload, + **uniq_detail_payload, + }, + "message": None, + } + + except Exception as e: + return { + "score": None, + "details": { + "uniq_status": "n/a", + "uniq_na_reason": str(e), + }, + "message": f"Uniqueness = N/A: {e}", + } + + +def run_analysis(request: AnalysisRequest) -> AnalysisResponse: + """ + 전체 분석 진입점. + dashboard2.py에서는 나중에 이 함수만 호출하게 만들 예정. 
def get_metadata(df):
    """Profile the columns of *df* and return the result as a dictionary.

    Each column name maps to a dict with two keys:

    - ``"type"``: ``"numeric"`` when pandas reports a numeric dtype for the
      column, otherwise ``"string"``.
    - ``"sample"``: the first value of the column, or ``None`` when the
      DataFrame has no rows (indexing the first row would otherwise raise
      ``IndexError``).
    """
    has_rows = len(df) > 0
    metadata = {}

    for col in df.columns:
        column = df[col]
        metadata[col] = {
            "type": "numeric" if is_numeric_dtype(column) else "string",
            # An empty frame has no first value to sample.
            "sample": column.iloc[0] if has_rows else None,
        }

    return metadata
def _format_score(value, digits: int = 2, suffix: str = "") -> str:
    """Format a metric score for display, or return "N/A" when it is missing."""
    if value is None:
        return "N/A"
    try:
        return f"{float(value):.{digits}f}{suffix}"
    except (TypeError, ValueError):
        return "N/A"


def render_reports_tab(df, avg_score: float):
    """Render the Reports tab UI.

    The left panel collects the report options and, on "Generate Report",
    builds a report DataFrame from ``st.session_state["results"]`` and
    prepends it to ``st.session_state["report_history"]``. The right panel
    lists the stored reports and offers download, preview and delete.

    Scores that are ``None`` are rendered as "N/A" instead of being passed
    to a float format spec, which would raise ``TypeError``.

    Args:
        df: Dataset whose ``describe()`` statistics are appended to a
            "Detailed Analysis" report.
        avg_score: Overall score shown in the "Executive Summary" report.
    """
    st.markdown("### Data Quality Reports")
    st.caption("Generate, customize, and manage your quality assessment reports.")

    rep_c1, rep_c2 = st.columns([1, 2.2], gap="large")

    # --- [Left: Create New Report] ---
    with rep_c1:
        with st.container(border=True):
            st.subheader("Create New Report")

            rep_type = st.radio(
                "Report Type",
                ["Executive Summary", "Detailed Analysis", "Compliance Report", "Trend Analysis"],
                index=0,
            )

            st.markdown("---")
            st.markdown("**Report Configuration**")

            with st.expander("Select Metrics to Include", expanded=True):
                m_cols = st.columns(2)
                # Only the Accuracy toggle is read below; the others are display-only.
                m_cols[0].checkbox("Completeness", True, key="r_inc_c")
                inc_acc = m_cols[0].checkbox("Accuracy", True, key="r_inc_a")
                m_cols[1].checkbox("Consistency", True, key="r_inc_con")
                m_cols[1].checkbox("Timeliness", False, key="r_inc_t")

            st.markdown("**Date Range**")
            dr_c1, dr_c2 = st.columns(2)
            date_from = dr_c1.date_input("From", datetime.now() - timedelta(days=30), key="d_from")
            date_to = dr_c2.date_input("To", datetime.now(), key="d_to")

            st.checkbox("Send email notification upon completion", value=True)

            # NOTE(review): the spacer markup was lost in the extracted source;
            # "<br>" is assumed here - confirm against the original file.
            st.markdown("<br>", unsafe_allow_html=True)
            if st.button("Generate Report", type="primary", use_container_width=True):
                if not st.session_state.get("results"):
                    st.error("Please run analysis first.")
                else:
                    with st.spinner("Compiling data and generating report..."):
                        time.sleep(1.2)
                        res = st.session_state["results"]

                        if rep_type == "Detailed Analysis":
                            m_df = pd.DataFrame(
                                [{"Metric": k, "Value": _format_score(v)} for k, v in res.items()]
                            )
                            stats_df = df.describe().reset_index().rename(columns={"index": "Stat_Metric"})
                            report_df = pd.concat(
                                [
                                    m_df,
                                    pd.DataFrame([
                                        {"Metric": "--- STATS ---", "Value": "---"}
                                    ]),
                                    stats_df,
                                ],
                                ignore_index=True,
                            )
                        elif rep_type == "Executive Summary":
                            report_df = pd.DataFrame(
                                [
                                    {
                                        "Category": "Overview",
                                        "Metric": "Dataset",
                                        "Value": st.session_state.get("filename", "N/A"),
                                    },
                                    {"Category": "Overview", "Metric": "Period", "Value": f"{date_from} ~ {date_to}"},
                                    {
                                        "Category": "Score",
                                        "Metric": "Overall",
                                        "Value": _format_score(avg_score, digits=1, suffix="%"),
                                    },
                                    {
                                        "Category": "Score",
                                        "Metric": "Accuracy",
                                        "Value": (
                                            _format_score(res.get('Accuracy', 0), suffix="%")
                                            if inc_acc
                                            else "N/A"
                                        ),
                                    },
                                ]
                            )
                        else:
                            report_df = pd.DataFrame([
                                {"Metric": "Status", "Value": "Verified"}
                            ])

                        new_id = f"RPT-{10046 + len(st.session_state['report_history'])}"
                        new_row = {
                            "Report ID": new_id,
                            "Report Name": f"{rep_type}_Q4_2025",
                            "Type": rep_type,
                            "Date Generated": datetime.now().strftime("%Y-%m-%d"),
                            "Status": "Completed",
                            "SavedData": report_df,
                        }
                        # Newest report goes first in the history table.
                        st.session_state["report_history"] = pd.concat(
                            [pd.DataFrame([new_row]), st.session_state["report_history"]],
                            ignore_index=True,
                        )
                        st.success(f"Report {new_id} generated!")
                        st.rerun()

    # --- [Right: Recent Reports & Management] ---
    with rep_c2:
        with st.container(border=True):
            st.subheader("Recent Reports & Management")
            st.text_input(
                "Search",
                placeholder="Search reports by name or ID...",
                label_visibility="collapsed",
            )

            history_df = st.session_state["report_history"]

            if not history_df.empty:
                selected_rpt_id = st.selectbox("Select Report to Manage", history_df["Report ID"])
                sel_idx = history_df[history_df["Report ID"] == selected_rpt_id].index[0]
                target_rpt = history_df.loc[sel_idx]

                act_col1, act_col2, act_col3 = st.columns([1, 1, 1])

                csv_data = target_rpt["SavedData"].to_csv(index=False).encode("utf-8-sig")
                act_col1.download_button(
                    "📥 Download CSV",
                    csv_data,
                    f"{target_rpt['Report ID']}.csv",
                    "text/csv",
                    use_container_width=True,
                )

                if act_col2.button("👁️ Preview Data", use_container_width=True):
                    st.write(f"### Preview: {selected_rpt_id}")
                    st.dataframe(target_rpt["SavedData"], use_container_width=True)

                if act_col3.button("🗑️ Delete Report", use_container_width=True):
                    st.session_state["report_history"] = history_df.drop(sel_idx).reset_index(drop=True)
                    st.rerun()

                st.markdown("---")

                # The stored report DataFrames are not shown in the listing.
                display_df = history_df.drop(columns=["SavedData"]).copy()
                st.dataframe(
                    display_df,
                    column_config={
                        "Status": st.column_config.TextColumn("Status", help="Completed / Processing / Failed"),
                        "Report ID": st.column_config.TextColumn("ID", width="small"),
                    },
                    use_container_width=True,
                    hide_index=True,
                )
            else:
                st.info("No reports found. Generate a new report on the left.")
+ """ + cols = [str(c) for c in (df.columns.tolist() if df is not None else [])] + lower_cols = [c.lower() for c in cols] + + def has_numeric() -> bool: + try: + return len(df.select_dtypes(include=[np.number]).columns) > 0 + except Exception: + return False + + def likely_gt_cols() -> list[str]: + keywords = ["gt", "ground", "truth", "label", "target", "actual", "y_true", "reference", "ref"] + hits = [] + for c, lc in zip(cols, lower_cols): + if any(k in lc for k in keywords): + hits.append(c) + return hits + + def likely_time_cols() -> list[str]: + keywords = ["time", "date", "timestamp", "ts", "event", "created", "approved", "received", "ingest", "ingested"] + hits = [] + for c, lc in zip(cols, lower_cols): + if any(k in lc for k in keywords): + hits.append(c) + return hits + + gt_hits = likely_gt_cols() + time_hits = likely_time_cols() + + summary = {} + + summary["Completeness"] = { + "status": "satisfied", + "note": "Required input is satisfied. Completeness can be evaluated for the uploaded dataset." + } + summary["Uniqueness"] = { + "status": "satisfied", + "note": "Required input is satisfied. Key columns can be selected for a more meaningful evaluation." + } + + summary["Consistency"] = { + "status": "config_required", + "note": "A validation rule, format rule, or referential integrity rule must be configured." + } + + if has_numeric(): + summary["Reliability"] = { + "status": "satisfied", + "note": "A numeric column is available for reliability evaluation." + } + else: + summary["Reliability"] = { + "status": "missing_required_input", + "note": "No numeric column is available. Reliability cannot be evaluated." + } + + if len(gt_hits) > 0: + summary["Accuracy"] = { + "status": "config_required", + "note": "Measured value, method, and reference settings must be configured before evaluation." + } + else: + summary["Accuracy"] = { + "status": "missing_required_input", + "note": "No reference or ground-truth column is available in the current dataset." 
def _badge(status: str) -> str:
    """Return the badge label shown for a pre-check status.

    "satisfied" and "config_required" have their own labels; every other
    value, including "missing_required_input", gets the missing-input label.
    """
    labelled = (
        ("satisfied", "✅ Satisfied"),
        ("config_required", "⚠️ Configuration Required"),
    )
    for known_status, label in labelled:
        if status == known_status:
            return label
    return "❗ Missing Required Input"
def render_metric_selector() -> list[str]:
    """Draw one checkbox per metric in a three-column grid.

    Returns the names of the ticked metrics in ``METRIC_OPTIONS`` order and
    shows an info hint when none is ticked.
    """
    st.markdown("**Select Metrics**")

    grid = st.columns(3)
    chosen: list[str] = []

    for position, name in enumerate(METRIC_OPTIONS):
        # Metrics fill the grid left to right, wrapping after the third column.
        slot = grid[position % 3]
        ticked = slot.checkbox(name, value=False, key=f"check_{name}")
        if ticked:
            chosen.append(name)

    if len(chosen) == 0:
        st.info("👆 Please check at least one metric above to configure settings.")

    return chosen
+ ) + ) + + default_missing = "?, N/A, NA, null, None, -200, -999, 9999" + config['comp_missing_text'] = st.text_input( + "Missing value representation (comma-separated)", + value=default_missing, + help=( + "Values treated as missing in addition to NaN/empty. " + "Examples: ?, N/A, -200, 9999" + ) + ) + + config['comp_granularity'] = st.selectbox( + "Granularity (main completeness score)", + ["Dataset", "Row", "Column"], + index=0, + help=( + "Dataset: overall missing-cell rate. " + "Row: % of complete rows. " + "Column: average completeness across columns." + ) + ) + st.caption("Completeness uses your Required Columns + Missing Value definitions. Empty strings are treated as missing for text columns.") + st.divider() + + if 'Reliability' in selected_metrics: + st.markdown("#### Reliability Settings") + + num_cols = df.select_dtypes(include=[np.number]).columns.tolist() + + config['reliability_cols'] = st.multiselect( + "Numeric Columns for Reliability (CV%)", + options=num_cols, + default=[], + help=( + "Select one or more numeric columns to evaluate Reliability. " + "Reliability is computed using the coefficient of variation (std/mean)*100." + ) + ) + + config['reliability_thresholds'] = {} + + if config['reliability_cols']: + st.caption( + "For each selected column, you can optionally adjust the interpretation thresholds. " + "If you keep the default values, the system uses the default general-purpose criteria." + ) + + for i, col in enumerate(config['reliability_cols']): + with st.expander(f"Reliability Rule: {col}", expanded=(i == 0)): + r1, r2 = st.columns(2) + + high_max = r1.number_input( + f"{col} - High Reliability Threshold (CV ≤ %)", + min_value=0.0, + value=5.0, + step=0.1, + key=f"rel_high_max_{col}", + help="If CV is less than or equal to this value, the column is interpreted as High Reliability." 
+ ) + + moderate_max = r2.number_input( + f"{col} - Moderate Reliability Threshold (CV ≤ %)", + min_value=0.0, + value=15.0, + step=0.1, + key=f"rel_moderate_max_{col}", + help="If CV is greater than the High threshold and less than or equal to this value, the column is interpreted as Moderate Reliability." + ) + + config['reliability_thresholds'][col] = { + "high_max": float(high_max), + "moderate_max": float(moderate_max), + } + + if moderate_max < high_max: + st.warning("Moderate threshold should be greater than or equal to the High threshold.") + + else: + st.info("Select at least one numeric column to evaluate Reliability.") + + st.caption( + "Reliability requires numeric columns. The system interprets CV using " + "High Reliability / Moderate Reliability / Low Reliability bands." + ) + st.divider() + + if 'Uniqueness' in selected_metrics: + st.markdown("#### Uniqueness Settings") + + config['uniq_cols'] = st.multiselect( + "Key Columns for Uniqueness", + options=cols, + default=[], + help=( + "Select one or more columns to use as the basis for duplicate detection. " + "If no columns are selected, the dashboard uses full-row duplication across all columns." + ) + ) + + if config['uniq_cols']: + st.caption( + "The selected columns will be used as uniqueness keys. " + "Rows with the same values in these columns will be treated as duplicates." + ) + else: + st.info( + "No key columns selected. The dashboard will evaluate Uniqueness using all columns." + ) + + st.divider() + + return config + + +TIME_NA_TITLE = "Timeliness: Not Available (N/A)" +TIME_NA_BODY = ( + "Timeliness requires an Event Time column and either a System/Ingestion Time column or a user-defined Reference Time. " + "If the required input for the selected mode is missing, Timeliness must be reported as N/A." +) +TIME_NA_CTA = "To enable Timeliness, select an Event Time column and configure either System Time mode or User-defined Reference Time mode." 
+ + + +def render_timeliness_settings(df: pd.DataFrame, selected_metrics: list[str]) -> dict: + """Render timeliness settings and return timeliness-related config.""" + config: dict = {} + cols = df.columns.tolist() + + if 'Timeliness' in selected_metrics: + st.markdown("#### Timeliness Settings") + + cols_with_none = ["(None)"] + cols + def_event = next( + ( + c for c in cols + if 'event' in c.lower() or 'gen' in c.lower() or 'timestamp' in c.lower() or 'time' in c.lower() + ), + "(None)", + ) + + config['t_reference_mode'] = st.radio( + "Time Reference Mode", + ["System Time Column", "User-defined Reference Time"], + horizontal=True, + help="Choose whether Timeliness is calculated against a System/Ingestion Time column in the dataset or against a user-defined reference time." + ) + + mode_map = { + "System Time Column": "column", + "User-defined Reference Time": "fixed_time", + } + config['t_reference_mode'] = mode_map[config['t_reference_mode']] + + event_index = cols_with_none.index(def_event) if def_event in cols_with_none else 0 + config['t_event'] = st.selectbox( + "Event Time Column (Generation)", + cols_with_none, + index=event_index, + ) + + if config['t_reference_mode'] == 'column': + def_sys = "(None)" + sys_index = cols_with_none.index(def_sys) if def_sys in cols_with_none else 0 + config['t_system'] = st.selectbox( + "System Time Column (Ingestion)", + cols_with_none, + index=sys_index, + ) + config['t_reference_time'] = "" + else: + config['t_reference_time'] = st.text_input( + "Reference Time", + value="", + placeholder="e.g., 2026-04-14 12:00:00", + help="Enter the reference time to compare against the selected Event Time column." 
def _float_or_default(value, default: float) -> float:
    """Return ``float(value)``; use *default* only when *value* is ``None``.

    A plain ``value or default`` would also replace a legitimate ``0.0``.
    """
    return float(default) if value is None else float(value)


def render_accuracy_settings(df: pd.DataFrame, selected_metrics: list[str], config: dict) -> dict:
    """Render the Accuracy rule editor and return the updated config.

    Does nothing (beyond returning *config*) unless ``'Accuracy'`` is in
    *selected_metrics*. Otherwise it renders:

    * a rule-count input (stored in ``config['acc_rule_count']``),
    * an optional shared reference-file uploader whose parsed frame is kept
      in ``st.session_state['acc_ref_df']`` (``None`` when absent or unreadable),
    * one expander per rule, collected into ``config['acc_rules']``.

    The first rule is also mirrored into the flat single-rule keys
    (``target``, ``acc_method``, ``acc_thr_min`` ...).

    Args:
        df: Dataset whose columns populate the selectors.
        selected_metrics: Metric names currently ticked in the UI.
        config: Configuration dict, updated in place.

    Returns:
        The same *config* object.
    """
    if 'Accuracy' not in selected_metrics:
        return config

    cols = df.columns.tolist()

    st.markdown("#### Accuracy Settings")
    st.caption("Define one or more column-specific accuracy rules. Each rule can use a different method.")

    cols_with_none = ["(None)"] + cols

    acc_rule_count = st.number_input(
        "Number of Accuracy Rules",
        min_value=1,
        value=int(config.get('acc_rule_count', 1) or 1),
        step=1,
        help="Add one rule per measured column or per evaluation need."
    )
    config['acc_rule_count'] = int(acc_rule_count)

    with st.expander("Accuracy External Reference Dataset (shared across accuracy rules)", expanded=False):
        ref_file = st.file_uploader(
            "Upload Reference (Ground Truth) file (CSV/XLSX/ZIP)",
            type=['csv', 'xlsx', 'xls', 'zip'],
            accept_multiple_files=False,
            key='acc_ref_file_uploader_rules'
        )
        st.session_state['acc_ref_file'] = ref_file

        ref_df = None
        ref_load_err = None
        if ref_file is not None:
            # Loading is best-effort: any failure is shown to the user and the
            # cached reference frame is cleared rather than aborting the page.
            try:
                ref_df = load_reference_dataset(ref_file)
                st.session_state['acc_ref_df'] = ref_df
            except Exception as e:
                ref_load_err = str(e)
                st.session_state['acc_ref_df'] = None
        else:
            st.session_state['acc_ref_df'] = None

        if ref_load_err:
            st.error(f"Reference file load error: {ref_load_err}")
        elif ref_df is not None:
            st.dataframe(ref_df.head())
            st.caption(f"Reference Rows: {len(ref_df):,} | Reference Columns: {len(ref_df.columns)}")
        else:
            st.info("Upload a reference file only if one or more accuracy rules use 'External Reference File'.")

    ref_df_for_rules = st.session_state.get('acc_ref_df')
    ref_cols_for_rules = ref_df_for_rules.columns.tolist() if ref_df_for_rules is not None else []
    ref_cols_with_none = ["(None)"] + ref_cols_for_rules

    acc_rules = []
    for i in range(int(acc_rule_count)):
        with st.expander(f"Accuracy Rule {i+1}", expanded=(i == 0)):
            r_col1, r_col2 = st.columns(2)
            rule_column = r_col1.selectbox(
                "Measured Value Column",
                cols_with_none,
                index=0,
                key=f"acc_rule_{i}_column",
                help="Select the measured VALUE column whose accuracy will be evaluated. This is the actual data value to compare, not a join key."
            )
            rule_method = r_col2.selectbox(
                "Method",
                ["Reference Mapping", "Threshold (Min/Max)", "Statistical Bounds (k-sigma)"],
                index=0,
                key=f"acc_rule_{i}_method",
                help="Choose the accuracy calculation method for this column."
            )

            # Every rule carries the full key set so downstream code can read
            # any field regardless of which method was chosen.
            rule_cfg = {
                'column': rule_column,
                'method': rule_method,
                'ref_mode': 'Column in Dataset',
                'reference_column': '(None)',
                'reference_value_col': '(None)',
                'meas_keys': [],
                'ref_keys': [],
                'mode': 'Direct Compare',
                'tol_type': 'absolute',
                'tol_value': 0.0,
                'thr_min': 0.0,
                'thr_max': 1.0,
                'thr_inclusive': True,
                'sigma_k': 3.0,
                'sigma_center': 'mean',
            }

            if rule_method == "Reference Mapping":
                ref_mode = st.selectbox(
                    "Reference Source Type",
                    ["Column in Dataset", "External Reference File"],
                    index=0,
                    key=f"acc_rule_{i}_ref_mode",
                    help="Use a column in the same dataset or match against an external reference dataset."
                )
                rule_cfg['ref_mode'] = ref_mode

                if ref_mode == "Column in Dataset":
                    rule_cfg['reference_column'] = st.selectbox(
                        "Reference Value Column",
                        cols_with_none,
                        index=0,
                        key=f"acc_rule_{i}_reference_column",
                        help="Select the REFERENCE VALUE (ground truth) column to compare against the measured value column."
                    )
                else:
                    mk_col, rk_col = st.columns(2)
                    rule_cfg['meas_keys'] = mk_col.multiselect(
                        "Measured dataset JOIN key column(s)",
                        options=cols,
                        default=[],
                        key=f"acc_rule_{i}_meas_keys",
                        help="Select one or more JOIN key columns in the current dataset. These are used only to match rows between datasets, not as the measured value itself."
                    )
                    rule_cfg['ref_keys'] = rk_col.multiselect(
                        "Reference dataset JOIN key column(s)",
                        options=ref_cols_for_rules,
                        default=[],
                        key=f"acc_rule_{i}_ref_keys",
                        help="Select the matching JOIN key columns in the reference dataset. These are used only for row matching."
                    )
                    rule_cfg['reference_value_col'] = st.selectbox(
                        "Reference Value Column",
                        ref_cols_with_none,
                        index=0,
                        key=f"acc_rule_{i}_ref_value_col",
                        help="Select the REFERENCE VALUE column from the uploaded reference dataset. This is the ground-truth value column that will be compared against the measured value column after row matching by JOIN keys."
                    )
                    st.caption("How this works: Measured Value Column ↔ Reference Value Column are compared after rows are matched using the JOIN key columns.")

            elif rule_method == "Threshold (Min/Max)":
                t1, t2, t3 = st.columns(3)
                rule_cfg['thr_min'] = float(t1.number_input(
                    "Min",
                    value=0.0,
                    step=0.1,
                    key=f"acc_rule_{i}_thr_min",
                ))
                rule_cfg['thr_max'] = float(t2.number_input(
                    "Max",
                    value=1.0,
                    step=0.1,
                    key=f"acc_rule_{i}_thr_max",
                ))
                rule_cfg['thr_inclusive'] = bool(t3.checkbox(
                    "Inclusive",
                    value=True,
                    key=f"acc_rule_{i}_thr_inclusive",
                ))
                st.caption("This rule evaluates whether measured values stay inside the configured min/max range.")

            else:
                s1, s2 = st.columns(2)
                rule_cfg['sigma_k'] = float(s1.number_input(
                    "k (sigma multiplier)",
                    min_value=0.5,
                    value=3.0,
                    step=0.5,
                    key=f"acc_rule_{i}_sigma_k",
                ))
                rule_cfg['sigma_center'] = s2.selectbox(
                    "Center",
                    ["mean", "median"],
                    index=0,
                    key=f"acc_rule_{i}_sigma_center",
                )
                st.caption("This rule evaluates whether measured values fall inside center ± k×std.")

            acc_rules.append(rule_cfg)

    config['acc_rules'] = acc_rules

    if acc_rules:
        # Mirror the first rule into the flat single-rule keys.
        first_acc = acc_rules[0]
        config['target'] = first_acc.get('column')
        config['acc_method'] = first_acc.get('method', 'Reference Mapping')
        config['acc_ref_mode'] = first_acc.get('ref_mode', 'Column in Dataset')
        config['true'] = first_acc.get('reference_column', '(None)')
        config['acc_ref_value_col'] = first_acc.get('reference_value_col', '(None)')
        config['acc_meas_keys'] = first_acc.get('meas_keys') or []
        config['acc_ref_keys'] = first_acc.get('ref_keys') or []
        config['acc_mode'] = first_acc.get('mode', 'Direct Compare')
        config['acc_tol_type'] = first_acc.get('tol_type', 'absolute')
        # 0.0 is a valid tolerance/threshold, so defaults apply only to None.
        # Previously a Max of 0.0 was rewritten to 1.0 here.
        config['acc_tol_value'] = _float_or_default(first_acc.get('tol_value'), 0.0)
        config['acc_thr_min'] = _float_or_default(first_acc.get('thr_min'), 0.0)
        config['acc_thr_max'] = _float_or_default(first_acc.get('thr_max'), 1.0)
        config['acc_thr_inclusive'] = bool(first_acc.get('thr_inclusive', True))
        config['acc_sigma_k'] = float(first_acc.get('sigma_k', 3.0) or 3.0)
        config['acc_sigma_center'] = str(first_acc.get('sigma_center', 'mean'))

    st.divider()

    return config
cons_ref_cols = cons_ref_df.columns.tolist() if cons_ref_df is not None else [] + cons_ref_cols_with_none = ['(None)'] + cons_ref_cols + + cons_rules = [] + for i in range(int(cons_rule_count)): + with st.expander(f"Consistency Rule {i+1}", expanded=(i == 0)): + c1, c2 = st.columns(2) + cons_mode_ui = c1.selectbox( + "Consistency Mode", + ["Expression", "Rule Builder", "Format / Regex", "Referential Integrity"], + index=0, + key=f"cons_rule_{i}_mode_ui", + help="Choose the consistency validation mode for this rule." + ) + target_col = c2.selectbox( + "Target Column (optional for expression-based rules)", + cols_with_none, + index=0, + key=f"cons_rule_{i}_target_col", + help="Set the main target column for this rule when applicable." + ) + + rule_cfg = { + 'mode': 'Format preset' if cons_mode_ui == 'Format / Regex' else cons_mode_ui, + 'column': target_col, + 'preset': 'email', + 'custom_regex': '', + 'allow_empty': True, + 'source_column': '(None)', + 'reference_column': '(None)', + 'ref_allow_empty': True, + 'rule': '', + } + + if cons_mode_ui == 'Expression': + helper_col = st.selectbox( + "Column helper (optional)", + cols_with_none, + index=0, + key=f"cons_rule_{i}_helper_col", + ) + default_rule = '' + if helper_col not in (None, '(None)'): + if pd.api.types.is_numeric_dtype(df[helper_col]): + default_rule = f"`{helper_col}` >= 0" + else: + default_rule = f"`{helper_col}` == `{helper_col}`" + rule_cfg['rule'] = st.text_input( + "Validation Rule (Python Expression)", + value=default_rule, + placeholder="e.g., `Speed` < 120 and `Pressure` > 0", + key=f"cons_rule_{i}_expr_rule", + help="Use backticks for column names." 
+ ) + + elif cons_mode_ui == 'Rule Builder': + rb1, rb2 = st.columns(2) + left_col = rb1.selectbox( + "Left Column", + cols_with_none, + index=0, + key=f"cons_rule_{i}_left_col", + ) + op = rb2.selectbox( + "Operator", + ["==", "!=", ">", ">=", "<", "<="], + index=0, + key=f"cons_rule_{i}_op", + ) + rhs1, rhs2 = st.columns(2) + rhs_mode = rhs1.radio( + "Compare Against", + ["Constant Value", "Another Column"], + horizontal=True, + key=f"cons_rule_{i}_rhs_mode", + ) + if rhs_mode == "Another Column": + right_col = rhs2.selectbox( + "Right Column", + cols_with_none, + index=0, + key=f"cons_rule_{i}_right_col", + ) + if left_col not in (None, '(None)') and right_col not in (None, '(None)'): + rule_cfg['rule'] = f"`{left_col}` {op} `{right_col}`" + else: + const_val = rhs2.text_input( + "Constant Value", + value='', + placeholder="e.g., 0, 100, OK", + key=f"cons_rule_{i}_const_val", + ) + if left_col not in (None, '(None)') and str(const_val).strip() != '': + if pd.api.types.is_numeric_dtype(df[left_col]): + rule_cfg['rule'] = f"`{left_col}` {op} {str(const_val).strip()}" + else: + rule_cfg['rule'] = f"`{left_col}` {op} {repr(str(const_val).strip())}" + if rule_cfg['rule']: + st.code(rule_cfg['rule'], language='python') + + elif cons_mode_ui == 'Format / Regex': + if rule_cfg['column'] in (None, '(None)'): + st.info("Select a target column for this format/regex rule.") + format_label_map = { + "Email": "email", + "Date": "date", + "Numeric Only": "numeric_only", + "Letters Only": "letters_only", + "Alphanumeric": "alphanumeric", + "Slash Format": "slash_format", + "Custom Regex": "custom_regex", + } + format_label = st.selectbox( + "Preset", + list(format_label_map.keys()), + index=0, + key=f"cons_rule_{i}_preset_label", + ) + rule_cfg['preset'] = format_label_map[format_label] + if rule_cfg['preset'] == 'custom_regex': + rule_cfg['custom_regex'] = st.text_input( + "Custom Regex Pattern", + value='', + placeholder=r"e.g., ^[A-Za-z0-9_/.-]+$", + 
key=f"cons_rule_{i}_custom_regex", + ) + rule_cfg['allow_empty'] = bool(st.checkbox( + "Allow empty values (treat empty as PASS)", + value=True, + key=f"cons_rule_{i}_allow_empty", + )) + + else: + rule_cfg['source_column'] = st.selectbox( + "Source Key Column", + cols_with_none, + index=0, + key=f"cons_rule_{i}_source_col", + ) + rule_cfg['reference_column'] = st.selectbox( + "Reference Key Column", + cons_ref_cols_with_none, + index=0, + key=f"cons_rule_{i}_ref_col", + ) + rule_cfg['ref_allow_empty'] = bool(st.checkbox( + "Allow empty values: enabled means empty/null values are treated as PASS", + value=True, + key=f"cons_rule_{i}_allow_empty", + )) + st.caption("If enabled, empty/null values are counted as valid records for this rule instead of violations.") + + cons_rules.append(rule_cfg) + + config['cons_rules'] = cons_rules + + if cons_rules: + first_cons = cons_rules[0] + config['cons_mode'] = first_cons.get('mode', 'Expression') + config['cons_format_col'] = first_cons.get('column', '(None)') + config['cons_format_preset'] = first_cons.get('preset', 'email') + config['cons_custom_regex'] = first_cons.get('custom_regex', '') + config['cons_allow_empty'] = bool(first_cons.get('allow_empty', True)) + config['cons_ref_src_col'] = first_cons.get('source_column', '(None)') + config['cons_ref_ref_col'] = first_cons.get('reference_column', '(None)') + config['cons_ref_allow_empty'] = bool(first_cons.get('ref_allow_empty', True)) + config['rule'] = first_cons.get('rule', '') + + st.divider() + + return config diff --git a/etsi_dq/utils.py b/etsi_dq/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..55b62175308dfc0204a86b6e5a8e1e9e3d400c22 --- /dev/null +++ b/etsi_dq/utils.py @@ -0,0 +1,531 @@ +from __future__ import annotations + +import io +import re +import zipfile +from pathlib import Path + +import numpy as np +import pandas as pd + + +# --------------------------------------------------------- +# [Dataset Loading / Cleaning Helpers] 
# Common missing-value sentinel codes seen in public datasets (e.g., UCI Air Quality uses -200).
SENTINEL_MISSING_VALUES = {-200, -999, -9999, 999, 9999, 99999}

# Delimiters probed when a comma-parsed file collapses into a single column.
_ALTERNATIVE_DELIMITERS = (";", "\t", "|")


def _split_single_column(file_like, frame: pd.DataFrame, encoding: str) -> pd.DataFrame:
    """Re-read a one-column frame using a delimiter found in its header.

    Returns *frame* unchanged unless it has exactly one column and a re-read
    with one of ``_ALTERNATIVE_DELIMITERS`` yields more than one column.
    """
    if frame.shape[1] != 1:
        return frame

    header = str(frame.columns[0])
    for delimiter in _ALTERNATIVE_DELIMITERS:
        if delimiter not in header:
            continue
        try:
            file_like.seek(0)
            candidate = pd.read_csv(file_like, encoding=encoding, sep=delimiter)
        except Exception:
            # Best-effort probe: keep the comma-parsed frame if the retry fails.
            continue
        if candidate.shape[1] > 1:
            return candidate
    return frame


def _read_csv_with_fallbacks(file_like) -> pd.DataFrame:
    """Read CSV with encoding and delimiter fallbacks.

    Encodings are tried in order. A successful comma parse that produced a
    single column is re-read with ``;``, tab or ``|`` when that character
    appears in the header, because the last encoding (latin1) never fails to
    decode and such files would otherwise never reach the delimiter fallback.
    If every encoding fails, the delimiter is sniffed with the python engine.

    Args:
        file_like: Seekable binary file object.

    Raises:
        Exception: The last encoding-attempt error when no strategy succeeds.
    """
    encodings = ("utf-8", "utf-8-sig", "cp949", "euc-kr", "latin1")
    last_err = None
    for enc in encodings:
        try:
            file_like.seek(0)
            frame = pd.read_csv(file_like, encoding=enc)
        except Exception as e:
            last_err = e
            continue
        return _split_single_column(file_like, frame, enc)

    try:
        file_like.seek(0)
        return pd.read_csv(file_like, engine="python", sep=None)
    except Exception as e:
        raise last_err or e


def clean_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """Clean dataset in a dataset-agnostic way.

    Works on a copy and:

    1. drops columns whose label starts with ``Unnamed``,
    2. drops columns that are entirely null,
    3. replaces ``SENTINEL_MISSING_VALUES`` with NaN in numeric columns,
    4. adds an ``__ingested_at`` column (current UTC time) if absent.
    """
    df2 = df.copy()
    keep = ~df2.columns.astype(str).str.contains(r"^Unnamed", regex=True)
    df2 = df2.loc[:, keep].dropna(axis=1, how="all").copy()

    num_cols = df2.select_dtypes(include=[np.number]).columns
    if len(num_cols) > 0:
        # Replace whole columns instead of writing through .loc: an integer
        # column must become float to hold NaN, and in-place upcasting via
        # .loc is deprecated in recent pandas.
        df2[num_cols] = df2[num_cols].replace(list(SENTINEL_MISSING_VALUES), np.nan)

    if "__ingested_at" not in df2.columns:
        df2["__ingested_at"] = pd.Timestamp.now(tz="UTC")

    return df2
def _is_loadable_zip_member(member: str) -> bool:
    """Tell whether a ZIP entry is a real CSV/Excel payload.

    Skips directory entries, macOS resource forks (``__MACOSX/`` folders and
    ``._name`` files) and Excel lock files (``~$name``), which carry a
    supported extension but are not datasets.
    """
    if member.endswith("/"):
        return False
    parts = member.split("/")
    base = parts[-1]
    if "__MACOSX" in parts or base.startswith("._") or base.startswith("~$"):
        return False
    return member.lower().endswith((".csv", ".xlsx", ".xls"))


def load_reference_dataset(uploaded_file) -> pd.DataFrame:
    """Load a single reference (ground truth) file into a DataFrame.

    Accepts a CSV, an Excel workbook, or a ZIP containing exactly one such
    file. The file type is decided from ``uploaded_file.name``.

    Args:
        uploaded_file: Seekable file-like object with a ``name`` attribute.

    Returns:
        The cleaned reference DataFrame.

    Raises:
        ValueError: If no file is given, the type is unsupported, or a ZIP
            holds zero or more than one loadable file.
    """
    if uploaded_file is None:
        raise ValueError("No reference file provided")

    name = getattr(uploaded_file, "name", "reference")
    lower = str(name).lower()

    if lower.endswith(".csv"):
        uploaded_file.seek(0)
        return clean_dataset(_read_csv_with_fallbacks(uploaded_file))

    if lower.endswith((".xlsx", ".xls")):
        uploaded_file.seek(0)
        return clean_dataset(pd.read_excel(uploaded_file))

    if lower.endswith(".zip"):
        uploaded_file.seek(0)
        raw = uploaded_file.read()

        # The context manager closes the archive even when validation raises.
        with zipfile.ZipFile(io.BytesIO(raw)) as zf:
            supported = [m for m in zf.namelist() if _is_loadable_zip_member(m)]

            if len(supported) == 0:
                raise ValueError("ZIP contained no supported reference files (.csv/.xlsx/.xls).")
            if len(supported) > 1:
                raise ValueError(
                    "ZIP reference must contain exactly ONE supported file (.csv/.xlsx/.xls). "
                    f"Found {len(supported)}: {', '.join([Path(x).name for x in supported[:5]])}"
                    + (" ..." if len(supported) > 5 else "")
                )

            member = supported[0]
            data = zf.read(member)

        bio = io.BytesIO(data)
        if member.lower().endswith(".csv"):
            df = _read_csv_with_fallbacks(bio)
        else:
            df = pd.read_excel(bio)
        return clean_dataset(df)

    raise ValueError("Unsupported reference file type. Please upload CSV, XLSX, or ZIP (single file).")


# ---------------------------------------------------------
# [Datetime / Timeliness Helpers]
# ---------------------------------------------------------


def parse_datetime_series(s: pd.Series) -> pd.Series:
    """Robust datetime parser for timeliness.

    Returns a UTC-aware datetime Series; unparseable entries become ``NaT``.

    * ``None`` gives an empty UTC datetime Series.
    * Datetime input is localized to UTC when naive, converted when aware.
    * Numeric input is read as epoch values; the unit is guessed from the
      magnitude of the median (>=1e18 ns, >=1e15 us, >=1e12 ms, >=1e9 s).
      Smaller values are passed to ``pd.to_datetime`` without a unit.
    * Anything else is stringified, stripped and parsed.
    """
    if s is None:
        return pd.Series(dtype="datetime64[ns, UTC]")

    if pd.api.types.is_datetime64_any_dtype(s):
        try:
            if getattr(s.dt, "tz", None) is None:
                return s.dt.tz_localize("UTC")
            return s.dt.tz_convert("UTC")
        except Exception:
            # Fall back to a generic coercing parse if tz handling fails.
            return pd.to_datetime(s, errors="coerce", utc=True)

    if pd.api.types.is_numeric_dtype(s):
        vals = pd.to_numeric(s, errors="coerce")
        med = float(vals.dropna().median()) if vals.notna().any() else np.nan

        unit = None
        if np.isfinite(med):
            magnitude = abs(med)
            # Thresholds checked from the finest unit to the coarsest.
            for floor, candidate in ((1e18, "ns"), (1e15, "us"), (1e12, "ms"), (1e9, "s")):
                if magnitude >= floor:
                    unit = candidate
                    break

        if unit is not None:
            return pd.to_datetime(vals, unit=unit, errors="coerce", utc=True)

        return pd.to_datetime(vals, errors="coerce", utc=True)

    return pd.to_datetime(s.astype(str).str.strip(), errors="coerce", utc=True)
def _parse_missing_tokens(text: str) -> list[str]:
    """Split user input into distinct missing-value tokens.

    Commas and newlines both act as separators. Tokens are stripped, blanks
    are dropped, and duplicates are removed while keeping first-seen order.
    ``None`` yields an empty list; other inputs are stringified first.
    """
    if text is None:
        return []

    pieces = (piece.strip() for piece in str(text).replace("\n", ",").split(","))
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(piece for piece in pieces if piece))
def find_col(df, keywords):
    """Return the first column label containing any of *keywords*.

    Matching is a case-sensitive substring test on ``str(label)``, so
    non-string labels (e.g. integer headers) no longer raise. Returns the
    literal ``"(None)"`` when nothing matches.
    """
    for c in df.columns:
        label = str(c)
        if any(x in label for x in keywords):
            return c
    return "(None)"


def _extract_rule_columns(rule: str, df_columns: list[str]) -> list[str]:
    """Extract columns referenced by a rule.

    Backtick-quoted names win: if any of them is a known column, only those
    are returned. Otherwise each string column label is searched for as a
    whole word in the rule text. Order follows first appearance.
    """
    text = rule or ""
    used = []

    for name in re.findall(r"`([^`]+)`", text):
        if name in df_columns and name not in used:
            used.append(name)

    if used:
        return used

    for c in df_columns:
        # Non-string labels cannot be written as bare identifiers, and
        # re.escape() rejects them with TypeError.
        if not isinstance(c, str):
            continue
        try:
            if re.search(rf"\b{re.escape(c)}\b", text):
                used.append(c)
        except re.error:
            pass

    return used


def _prepare_df_for_rule(df: pd.DataFrame, rule: str) -> pd.DataFrame:
    """Coerce text columns used in the rule to numeric when possible.

    Works on a copy. For every column the rule references:

    * text columns are parsed as numbers (thousands separators removed), and
      the parsed version is adopted only if at least one value parses. A
      purely textual column is left untouched so string comparisons such as
      ``status == 'OK'`` still work instead of comparing against all-NaN;
    * numeric columns have the common sentinel codes replaced by NaN.
    """
    df2 = df.copy()
    used_cols = _extract_rule_columns(rule, df2.columns.tolist())

    for c in used_cols:
        col = df2[c]
        if str(col.dtype) == "object" or pd.api.types.is_string_dtype(col):
            cleaned = col.astype(str).str.replace(",", "", regex=False).str.strip()
            parsed = pd.to_numeric(cleaned, errors="coerce")
            if parsed.notna().any() or not col.notna().any():
                df2[c] = parsed

    sentinel_values = {-200, -999, -9999, 999, 9999, 99999}
    for c in used_cols:
        if pd.api.types.is_numeric_dtype(df2[c]):
            df2[c] = df2[c].where(~df2[c].isin(sentinel_values), np.nan)

    return df2
def normalize_consistency_rule(rule: str, df_columns: list[str]) -> str:
    """Normalize a user-entered rule for pandas.DataFrame.query.

    Any quoted token (single or double quotes) whose content is exactly a
    column name is rewritten as a backtick reference. Other quoted tokens
    are left as string literals. Empty or ``None`` rules are returned as-is.
    """
    if not rule:
        return rule

    def _requote(match):
        quote, name = match.group(1), match.group(2)
        if name in df_columns:
            return f"`{name}`"
        return f"{quote}{name}{quote}"

    return re.sub(r"(['\"])([^'\"]+)\1", _requote, rule)


def safe_consistency_violations(df: pd.DataFrame, rule: str):
    """Safely evaluate a consistency rule and return violating rows.

    Returns:
        ``(violations, error)``. ``violations`` holds the rows of *df* that do
        not satisfy the rule; ``error`` is ``None`` on success or a message
        when no rule was given or evaluation failed (``violations`` is then
        empty). An empty or ``None`` *df* yields ``(empty frame, None)``.
    """
    if df is None or df.empty:
        return pd.DataFrame(), None

    if not rule or not str(rule).strip():
        return pd.DataFrame(), "No consistency rule provided."

    rule_norm = normalize_consistency_rule(str(rule).strip(), df.columns.tolist())
    df_prep = _prepare_df_for_rule(df, rule_norm)

    try:
        if df.index.is_unique:
            passed_idx = df_prep.query(rule_norm, engine="python").index
            failed = ~df.index.isin(passed_idx)
        else:
            # With repeated index labels, matching by label would hide a
            # failing row that shares its label with a passing one, so rows
            # are matched by position instead.
            positional = df_prep.reset_index(drop=True)
            passed_pos = positional.query(rule_norm, engine="python").index
            failed = np.ones(len(df), dtype=bool)
            failed[passed_pos.to_numpy()] = False
        violations = df.loc[failed].copy()
        return violations, None
    except Exception as e:
        # Rules are user input; report evaluation problems instead of raising.
        return pd.DataFrame(), str(e)


# ---------------------------------------------------------
# [Accuracy Helpers]
# ---------------------------------------------------------


def _likely_unit_mismatch(measured_col: str, true_col: str) -> bool:
    """Heuristic guard for obvious measured/reference unit mismatch cases.

    Flags the pairing of a column whose name starts with ``PT08.`` with a
    reference column whose name ends with ``(GT)``. Empty names never match.
    """
    if not measured_col or not true_col:
        return False

    return str(measured_col).startswith("PT08.") and str(true_col).endswith("(GT)")
s_clean = s.astype(str).str.replace(",", "", regex=False).str.strip() + nums = pd.to_numeric(s_clean, errors="coerce") + non_null = s.notna().sum() + if non_null == 0: + return 0.0 + return float(nums.notna().sum() / non_null) + + m_ratio = _numeric_like_ratio(m) + t_ratio = _numeric_like_ratio(t) + + if m_ratio >= 0.9 and t_ratio >= 0.9: + return "regression" + + return "classification" + except Exception: + return "unknown" + + +def compute_accuracy( + df: pd.DataFrame, + measured_col: str, + true_col: str, + mode: str = "Direct Compare", + tolerance_type: str = "absolute", + tolerance_value: float = 0.0, +): + """Compute Accuracy as threshold compliance against ground truth.""" + if measured_col in (None, "(None)") or true_col in (None, "(None)"): + return None, None, None, 0, 0, 0, {"task": "unknown"} + + if measured_col == true_col: + return None, None, None, 0, 0, 0, {"task": "unknown"} + + task = infer_accuracy_task_type(df, measured_col, true_col) + + if task == "classification": + y_true_raw = df[true_col] + y_pred_raw = df[measured_col] + + y_true = y_true_raw.replace(list(SENTINEL_MISSING_VALUES), np.nan) if hasattr(y_true_raw, "replace") else y_true_raw + y_pred = y_pred_raw.replace(list(SENTINEL_MISSING_VALUES), np.nan) if hasattr(y_pred_raw, "replace") else y_pred_raw + + y_true_s = y_true.astype("string") + y_pred_s = y_pred.astype("string") + + valid = y_true_s.notna() & y_pred_s.notna() + checked = int(valid.sum()) + if checked == 0: + return None, None, None, 0, 0, 0, {"task": "classification"} + + correct_mask = (y_true_s[valid] == y_pred_s[valid]) + accurate_count = int(correct_mask.sum()) + inaccurate_count = int(checked - accurate_count) + score = float((accurate_count / checked) * 100.0) + + meta = {"task": "classification"} + try: + yt_bin = pd.to_numeric(y_true_s[valid], errors="coerce") + yp_bin = pd.to_numeric(y_pred_s[valid], errors="coerce") + if yt_bin.notna().all() and yp_bin.notna().all(): + yt_set = set(yt_bin.unique().tolist()) + 
yp_set = set(yp_bin.unique().tolist()) + if yt_set.issubset({0, 1}) and yp_set.issubset({0, 1}): + TP = int(((yt_bin == 1) & (yp_bin == 1)).sum()) + TN = int(((yt_bin == 0) & (yp_bin == 0)).sum()) + FP = int(((yt_bin == 0) & (yp_bin == 1)).sum()) + FN = int(((yt_bin == 1) & (yp_bin == 0)).sum()) + meta.update({"TP": TP, "TN": TN, "FP": FP, "FN": FN}) + except Exception: + pass + + return score, None, None, checked, accurate_count, inaccurate_count, meta + + y_true = pd.to_numeric(df[true_col], errors="coerce") + y_meas = pd.to_numeric(df[measured_col], errors="coerce") + + if isinstance(y_true, pd.Series): + y_true = y_true.replace(list(SENTINEL_MISSING_VALUES), np.nan) + else: + y_true = np.nan if y_true in SENTINEL_MISSING_VALUES else y_true + + if isinstance(y_meas, pd.Series): + y_meas = y_meas.replace(list(SENTINEL_MISSING_VALUES), np.nan) + else: + y_meas = np.nan if y_meas in SENTINEL_MISSING_VALUES else y_meas + + valid = y_true.notna() & y_meas.notna() + checked = int(valid.sum()) + if checked == 0: + return None, None, None, 0, 0, 0, {"task": "regression"} + + calib_params = None + if mode == "Calibrated Compare (Linear)": + x = y_meas[valid].to_numpy() + y = y_true[valid].to_numpy() + A = np.c_[np.ones(len(x)), x] + coef, *_ = np.linalg.lstsq(A, y, rcond=None) + a, b = float(coef[0]), float(coef[1]) + calib_params = (a, b) + y_pred = a + b * y_meas + else: + y_pred = y_meas + + err = (y_pred[valid] - y_true[valid]).to_numpy() + abs_err = np.abs(err) + + mae = float(abs_err.mean()) + rmse = float(np.sqrt(np.mean(err ** 2))) + + tol_val = float(tolerance_value) if tolerance_value is not None else 0.0 + if tol_val <= 0: + return None, mae, rmse, checked, 0, checked, {"task": "regression", "calib_params": calib_params} + + if tolerance_type == "relative_percent": + tol = np.abs(y_true[valid].to_numpy()) * (tol_val / 100.0) + else: + tol = np.full_like(abs_err, tol_val, dtype=float) + + accurate_mask = abs_err <= tol + accurate_count = 
int(accurate_mask.sum()) + inaccurate_count = int(checked - accurate_count) + score = float((accurate_count / checked) * 100.0) + meta = {"task": "regression", "calib_params": calib_params} + return score, mae, rmse, checked, accurate_count, inaccurate_count, meta \ No newline at end of file diff --git a/expectations/.ge_store_backend_id b/expectations/.ge_store_backend_id deleted file mode 100644 index 856789955e130c696c0b20082f22f1dcd89b7aa5..0000000000000000000000000000000000000000 --- a/expectations/.ge_store_backend_id +++ /dev/null @@ -1 +0,0 @@ -store_backend_id = e70da95e-c461-47d6-93ef-4af79574f30a diff --git a/expectations/categories_suite.json b/expectations/categories_suite.json deleted file mode 100644 index ca3ccd939c028deee1f0c3abbb60a3dec1ddfa5b..0000000000000000000000000000000000000000 --- a/expectations/categories_suite.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "data_asset_type": null, - "expectation_suite_name": "categories_suite", - "expectations": [], - "ge_cloud_id": null, - "meta": { - "great_expectations_version": "0.18.12" - } -} \ No newline at end of file diff --git a/expectations/customers_quality_suite.json b/expectations/customers_quality_suite.json deleted file mode 100644 index 6269812d5ff685ec8c4f3ab80960286cee8c8f4a..0000000000000000000000000000000000000000 --- a/expectations/customers_quality_suite.json +++ /dev/null @@ -1,89 +0,0 @@ -{ - "data_asset_type": null, - "expectation_suite_name": "customers_quality_suite", - "expectations": [ - { - "expectation_type": "expect_column_values_to_not_be_null", - "kwargs": { - "column": "CustomerID" - }, - "meta": { - "etsi_section": "5.1", - "formula": "C = (Non-null values / Total values) \u00d7 100", - "metric": "Completeness" - } - }, - { - "expectation_type": "expect_column_values_to_not_be_null", - "kwargs": { - "column": "Name" - }, - "meta": { - "etsi_section": "5.1", - "formula": "C = (Non-null values / Total values) \u00d7 100", - "metric": "Completeness" - } - }, - { - 
"expectation_type": "expect_column_values_to_not_be_null", - "kwargs": { - "column": "Age" - }, - "meta": { - "etsi_section": "5.1", - "formula": "C = (Non-null values / Total values) \u00d7 100", - "metric": "Completeness" - } - }, - { - "expectation_type": "expect_column_values_to_not_be_null", - "kwargs": { - "column": "Email" - }, - "meta": { - "etsi_section": "5.1", - "formula": "C = (Non-null values / Total values) \u00d7 100", - "metric": "Completeness" - } - }, - { - "expectation_type": "expect_column_values_to_not_be_null", - "kwargs": { - "column": "PurchaseDate" - }, - "meta": { - "etsi_section": "5.1", - "formula": "C = (Non-null values / Total values) \u00d7 100", - "metric": "Completeness" - } - }, - { - "expectation_type": "expect_column_values_to_be_between", - "kwargs": { - "column": "Age", - "max_value": 100, - "min_value": 18 - }, - "meta": { - "etsi_section": "5.4", - "formula": "Accuracy Rate = (Correct values / Total values) \u00d7 100", - "metric": "Accuracy" - } - }, - { - "expectation_type": "expect_column_values_to_match_regex", - "kwargs": { - "column": "Email", - "regex": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$" - }, - "meta": { - "etsi_section": "5.4", - "metric": "Accuracy" - } - } - ], - "ge_cloud_id": null, - "meta": { - "great_expectations_version": "0.18.12" - } -} \ No newline at end of file diff --git a/expectations/products_quality_suite.json b/expectations/products_quality_suite.json deleted file mode 100644 index 51556529bac96a1a7b7404c7a5532495bffb6dac..0000000000000000000000000000000000000000 --- a/expectations/products_quality_suite.json +++ /dev/null @@ -1,91 +0,0 @@ -{ - "data_asset_type": null, - "expectation_suite_name": "products_quality_suite", - "expectations": [ - { - "expectation_type": "expect_column_values_to_not_be_null", - "kwargs": { - "column": "ProductID" - }, - "meta": { - "etsi_section": "5.1", - "formula": "C = (Non-null values / Total values) \u00d7 100", - "metric": "Completeness" - } - }, - 
{ - "expectation_type": "expect_column_values_to_not_be_null", - "kwargs": { - "column": "ProductName" - }, - "meta": { - "etsi_section": "5.1", - "formula": "C = (Non-null values / Total values) \u00d7 100", - "metric": "Completeness" - } - }, - { - "expectation_type": "expect_column_values_to_not_be_null", - "kwargs": { - "column": "ListedWeight_kg" - }, - "meta": { - "etsi_section": "5.1", - "formula": "C = (Non-null values / Total values) \u00d7 100", - "metric": "Completeness" - } - }, - { - "expectation_type": "expect_column_values_to_not_be_null", - "kwargs": { - "column": "TrueWeight_kg" - }, - "meta": { - "etsi_section": "5.1", - "formula": "C = (Non-null values / Total values) \u00d7 100", - "metric": "Completeness" - } - }, - { - "expectation_type": "expect_column_values_to_not_be_null", - "kwargs": { - "column": "IsAccurate" - }, - "meta": { - "etsi_section": "5.1", - "formula": "C = (Non-null values / Total values) \u00d7 100", - "metric": "Completeness" - } - }, - { - "expectation_type": "expect_column_values_to_be_between", - "kwargs": { - "column": "ListedWeight_kg", - "max_value": 100, - "min_value": 0 - }, - "meta": { - "etsi_section": "5.4", - "formula": "Accuracy Rate = (Correct values / Total values) \u00d7 100", - "metric": "Accuracy" - } - }, - { - "expectation_type": "expect_column_values_to_be_between", - "kwargs": { - "column": "TrueWeight_kg", - "max_value": 100, - "min_value": 0 - }, - "meta": { - "etsi_section": "5.4", - "formula": "Accuracy Rate = (Correct values / Total values) \u00d7 100", - "metric": "Accuracy" - } - } - ], - "ge_cloud_id": null, - "meta": { - "great_expectations_version": "0.18.12" - } -} \ No newline at end of file diff --git a/expectations/products_suite.json b/expectations/products_suite.json deleted file mode 100644 index 67801ddbe50f4e5e66129273a034886cf5215d35..0000000000000000000000000000000000000000 --- a/expectations/products_suite.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "data_asset_type": null, - 
"expectation_suite_name": "products_suite", - "expectations": [], - "ge_cloud_id": null, - "meta": { - "great_expectations_version": "0.18.12" - } -} \ No newline at end of file diff --git a/expectations/pump_performance_suite.json b/expectations/pump_performance_suite.json deleted file mode 100644 index 2d3b7eb4d4b335f946e8870b3e82f9632f9e52b3..0000000000000000000000000000000000000000 --- a/expectations/pump_performance_suite.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "data_asset_type": null, - "expectation_suite_name": "pump_performance_suite", - "expectations": [], - "ge_cloud_id": null, - "meta": { - "great_expectations_version": "0.18.12" - } -} \ No newline at end of file diff --git a/expectations/test/demo.json b/expectations/test/demo.json deleted file mode 100644 index 6b5384175c7249138897a349e2eaa5309156f186..0000000000000000000000000000000000000000 --- a/expectations/test/demo.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "data_asset_type": null, - "expectation_suite_name": "test.demo", - "expectations": [], - "ge_cloud_id": null, - "meta": { - "citations": [ - { - "batch_request": { - "data_asset_name": "default_data_asset_name", - "data_connector_name": "default_inferred_data_connector_name", - "datasource_name": "my_datasource", - "limit": 1000 - }, - "citation_date": "2025-09-30T05:37:36.938836Z", - "comment": "Created suite added via CLI" - } - ], - "great_expectations_version": "0.18.12" - } -} \ No newline at end of file diff --git a/expectations/test2/demo.json b/expectations/test2/demo.json deleted file mode 100644 index be56bf30d903cdf069a663e49e0f24280f2275a6..0000000000000000000000000000000000000000 --- a/expectations/test2/demo.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "data_asset_type": null, - "expectation_suite_name": "test2.demo", - "expectations": [ - { - "expectation_type": "expect_table_row_count_to_be_between", - "kwargs": { - "max_value": 6, - "min_value": 6 - }, - "meta": { - "profiler_details": { - "metric_configuration": { - 
"domain_kwargs": {}, - "metric_name": "table.row_count", - "metric_value_kwargs": null - }, - "num_batches": 1 - } - } - }, - { - "expectation_type": "expect_table_columns_to_match_set", - "kwargs": { - "column_set": [ - "ID", - "PumpID", - "Email", - "True_Weight", - "Vibration", - "Timestamp", - "Pressure", - "Listed_Weight", - "Temp_C" - ], - "exact_match": null - }, - "meta": { - "profiler_details": { - "success_ratio": 1.0 - } - } - } - ], - "ge_cloud_id": null, - "meta": { - "citations": [ - { - "citation_date": "2025-09-30T05:46:49.623306Z", - "comment": "Created by effective Rule-Based Profiler of OnboardingDataAssistant with the configuration included.\n" - } - ], - "great_expectations_version": "0.18.12" - } -} \ No newline at end of file diff --git a/great_expectations.yml b/great_expectations.yml deleted file mode 100644 index 621f72d0f28272f673deb652a0a9a487fc85da1c..0000000000000000000000000000000000000000 --- a/great_expectations.yml +++ /dev/null @@ -1,135 +0,0 @@ -# Welcome to Great Expectations! Always know what to expect from your data. -# -# Here you can define datasources, batch kwargs generators, integrations and -# more. This file is intended to be committed to your repo. For help with -# configuration please: -# - Read our docs: https://docs.greatexpectations.io/docs/guides/connecting_to_your_data/connect_to_data_overview/#2-configure-your-datasource -# - Join our slack channel: http://greatexpectations.io/slack - -# config_version refers to the syntactic version of this config file, and is used in maintaining backwards compatibility -# It is auto-generated and usually does not need to be changed. -config_version: 3.0 - -# Datasources tell Great Expectations where your data lives and how to get it. 
-# Read more at https://docs.greatexpectations.io/docs/guides/connecting_to_your_data/connect_to_data_overview -datasources: - my_datasource: - class_name: Datasource - module_name: great_expectations.datasource - execution_engine: - class_name: PandasExecutionEngine - module_name: great_expectations.execution_engine - data_connectors: - default_inferred_data_connector_name: - name: default_inferred_data_connector_name - class_name: InferredAssetFilesystemDataConnector - module_name: great_expectations.datasource.data_connector - base_directory: data - default_regex: - group_names: - - data_asset_name - pattern: (.*) - default_runtime_data_connector_name: - name: default_runtime_data_connector_name - class_name: RuntimeDataConnector - module_name: great_expectations.datasource.data_connector - assets: - my_runtime_asset_name: - class_name: Asset - module_name: great_expectations.datasource.data_connector.asset - batch_identifiers: - - runtime_batch_identifier_name - data_quality_datasource: - class_name: Datasource - module_name: great_expectations.datasource - execution_engine: - class_name: PandasExecutionEngine - module_name: great_expectations.execution_engine - data_connectors: - default_runtime_data_connector: - name: default_runtime_data_connector - class_name: RuntimeDataConnector - module_name: great_expectations.datasource.data_connector - batch_identifiers: - - default_identifier_name - etsi_datasource: - class_name: Datasource - module_name: great_expectations.datasource - execution_engine: - class_name: PandasExecutionEngine - module_name: great_expectations.execution_engine - data_connectors: - default_runtime_data_connector: - class_name: RuntimeDataConnector - module_name: great_expectations.datasource.data_connector - batch_identifiers: - - default_identifier -config_variables_file_path: uncommitted/config_variables.yml - -# The plugins_directory will be added to your python path for custom modules -# used to override and extend Great Expectations. 
-plugins_directory: plugins/ - -stores: -# Stores are configurable places to store things like Expectations, Validations -# Data Docs, and more. These are for advanced users only - most users can simply -# leave this section alone. -# -# Three stores are required: expectations, validations, and -# evaluation_parameters, and must exist with a valid store entry. Additional -# stores can be configured for uses such as data_docs, etc. - expectations_store: - class_name: ExpectationsStore - store_backend: - class_name: TupleFilesystemStoreBackend - base_directory: expectations/ - - validations_store: - class_name: ValidationsStore - store_backend: - class_name: TupleFilesystemStoreBackend - base_directory: uncommitted/validations/ - - evaluation_parameter_store: - class_name: EvaluationParameterStore - checkpoint_store: - class_name: CheckpointStore - store_backend: - class_name: TupleFilesystemStoreBackend - suppress_store_backend_id: true - base_directory: checkpoints/ - - profiler_store: - class_name: ProfilerStore - store_backend: - class_name: TupleFilesystemStoreBackend - suppress_store_backend_id: true - base_directory: profilers/ - -expectations_store_name: expectations_store -validations_store_name: validations_store -evaluation_parameter_store_name: evaluation_parameter_store -checkpoint_store_name: checkpoint_store - -data_docs_sites: - # Data Docs make it simple to visualize data quality in your project. These - # include Expectations, Validations & Profiles. The are built for all - # Datasources from JSON artifacts in the local repo including validations & - # profiles from the uncommitted directory. 
Read more at https://docs.greatexpectations.io/docs/terms/data_docs - local_site: - class_name: SiteBuilder - show_how_to_buttons: true - store_backend: - class_name: TupleFilesystemStoreBackend - base_directory: uncommitted/data_docs/local_site/ - site_index_builder: - class_name: DefaultSiteIndexBuilder - -anonymous_usage_statistics: - data_context_id: e70da95e-c461-47d6-93ef-4af79574f30a - enabled: true -notebooks: -include_rendered_content: - globally: false - expectation_suite: false - expectation_validation_result: false diff --git a/gx/gx/.gitignore b/gx/gx/.gitignore deleted file mode 100644 index f34131a7443969599b8a138f17d52351957fe72d..0000000000000000000000000000000000000000 --- a/gx/gx/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ - -uncommitted/ \ No newline at end of file diff --git a/gx/gx/expectations/.ge_store_backend_id b/gx/gx/expectations/.ge_store_backend_id deleted file mode 100644 index 672328f85ff5ceadcb9c63efdbab4bb07721db41..0000000000000000000000000000000000000000 --- a/gx/gx/expectations/.ge_store_backend_id +++ /dev/null @@ -1 +0,0 @@ -store_backend_id = 0f99771b-00ff-4f41-bb65-9f75096419bd diff --git a/gx/gx/expectations/categories_suite.json b/gx/gx/expectations/categories_suite.json deleted file mode 100644 index ca3ccd939c028deee1f0c3abbb60a3dec1ddfa5b..0000000000000000000000000000000000000000 --- a/gx/gx/expectations/categories_suite.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "data_asset_type": null, - "expectation_suite_name": "categories_suite", - "expectations": [], - "ge_cloud_id": null, - "meta": { - "great_expectations_version": "0.18.12" - } -} \ No newline at end of file diff --git a/gx/gx/expectations/products_suite.json b/gx/gx/expectations/products_suite.json deleted file mode 100644 index 67801ddbe50f4e5e66129273a034886cf5215d35..0000000000000000000000000000000000000000 --- a/gx/gx/expectations/products_suite.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "data_asset_type": null, - "expectation_suite_name": "products_suite", - 
"expectations": [], - "ge_cloud_id": null, - "meta": { - "great_expectations_version": "0.18.12" - } -} \ No newline at end of file diff --git a/gx/gx/expectations/pump_performance_suite.json b/gx/gx/expectations/pump_performance_suite.json deleted file mode 100644 index 2d3b7eb4d4b335f946e8870b3e82f9632f9e52b3..0000000000000000000000000000000000000000 --- a/gx/gx/expectations/pump_performance_suite.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "data_asset_type": null, - "expectation_suite_name": "pump_performance_suite", - "expectations": [], - "ge_cloud_id": null, - "meta": { - "great_expectations_version": "0.18.12" - } -} \ No newline at end of file diff --git a/gx/gx/great_expectations.yml b/gx/gx/great_expectations.yml deleted file mode 100644 index f7534be9e5b7f581e05a6fa84717c45b6eda8aa9..0000000000000000000000000000000000000000 --- a/gx/gx/great_expectations.yml +++ /dev/null @@ -1,115 +0,0 @@ -# Welcome to Great Expectations! Always know what to expect from your data. -# -# Here you can define datasources, batch kwargs generators, integrations and -# more. This file is intended to be committed to your repo. For help with -# configuration please: -# - Read our docs: https://docs.greatexpectations.io/docs/guides/connecting_to_your_data/connect_to_data_overview/#2-configure-your-datasource -# - Join our slack channel: http://greatexpectations.io/slack - -# config_version refers to the syntactic version of this config file, and is used in maintaining backwards compatibility -# It is auto-generated and usually does not need to be changed. -config_version: 3.0 - -# Datasources tell Great Expectations where your data lives and how to get it. 
-# Read more at https://docs.greatexpectations.io/docs/guides/connecting_to_your_data/connect_to_data_overview -datasources: - etsi_datasource: - class_name: Datasource - module_name: great_expectations.datasource - execution_engine: - class_name: PandasExecutionEngine - module_name: great_expectations.execution_engine - data_connectors: - default_runtime_data_connector: - class_name: RuntimeDataConnector - module_name: great_expectations.datasource.data_connector - batch_identifiers: - - default_identifier - -# This config file supports variable substitution which enables: 1) keeping -# secrets out of source control & 2) environment-based configuration changes -# such as staging vs prod. -# -# When GX encounters substitution syntax (like `my_key: ${my_value}` or -# `my_key: $my_value`) in the great_expectations.yml file, it will attempt -# to replace the value of `my_key` with the value from an environment -# variable `my_value` or a corresponding key read from this config file, -# which is defined through the `config_variables_file_path`. -# Environment variables take precedence over variables defined here. -# -# Substitution values defined here can be a simple (non-nested) value, -# nested value such as a dictionary, or an environment variable (i.e. ${ENV_VAR}) -# -# -# https://docs.greatexpectations.io/docs/guides/setup/configuring_data_contexts/how_to_configure_credentials - - -config_variables_file_path: uncommitted/config_variables.yml - -# The plugins_directory will be added to your python path for custom modules -# used to override and extend Great Expectations. -plugins_directory: plugins/ - -stores: -# Stores are configurable places to store things like Expectations, Validations -# Data Docs, and more. These are for advanced users only - most users can simply -# leave this section alone. -# -# Three stores are required: expectations, validations, and -# evaluation_parameters, and must exist with a valid store entry. 
Additional -# stores can be configured for uses such as data_docs, etc. - expectations_store: - class_name: ExpectationsStore - store_backend: - class_name: TupleFilesystemStoreBackend - base_directory: expectations/ - - validations_store: - class_name: ValidationsStore - store_backend: - class_name: TupleFilesystemStoreBackend - base_directory: uncommitted/validations/ - - evaluation_parameter_store: - class_name: EvaluationParameterStore - checkpoint_store: - class_name: CheckpointStore - store_backend: - class_name: TupleFilesystemStoreBackend - suppress_store_backend_id: true - base_directory: checkpoints/ - - profiler_store: - class_name: ProfilerStore - store_backend: - class_name: TupleFilesystemStoreBackend - suppress_store_backend_id: true - base_directory: profilers/ - -expectations_store_name: expectations_store -validations_store_name: validations_store -evaluation_parameter_store_name: evaluation_parameter_store -checkpoint_store_name: checkpoint_store - -data_docs_sites: - # Data Docs make it simple to visualize data quality in your project. These - # include Expectations, Validations & Profiles. The are built for all - # Datasources from JSON artifacts in the local repo including validations & - # profiles from the uncommitted directory. 
Read more at https://docs.greatexpectations.io/docs/terms/data_docs - local_site: - class_name: SiteBuilder - show_how_to_buttons: true - store_backend: - class_name: TupleFilesystemStoreBackend - base_directory: uncommitted/data_docs/local_site/ - site_index_builder: - class_name: DefaultSiteIndexBuilder - -anonymous_usage_statistics: - data_context_id: 0f99771b-00ff-4f41-bb65-9f75096419bd - enabled: true -notebooks: -include_rendered_content: - globally: false - expectation_suite: false - expectation_validation_result: false diff --git a/gx/gx/plugins/custom_data_docs/styles/data_docs_custom_styles.css b/gx/gx/plugins/custom_data_docs/styles/data_docs_custom_styles.css deleted file mode 100644 index 8bf5a15216a855a8681e0981a635a45131f2f2c7..0000000000000000000000000000000000000000 --- a/gx/gx/plugins/custom_data_docs/styles/data_docs_custom_styles.css +++ /dev/null @@ -1,22 +0,0 @@ -/*index page*/ -.ge-index-page-site-name-title {} -.ge-index-page-table-container {} -.ge-index-page-table {} -.ge-index-page-table-profiling-links-header {} -.ge-index-page-table-expectations-links-header {} -.ge-index-page-table-validations-links-header {} -.ge-index-page-table-profiling-links-list {} -.ge-index-page-table-profiling-links-item {} -.ge-index-page-table-expectation-suite-link {} -.ge-index-page-table-validation-links-list {} -.ge-index-page-table-validation-links-item {} - -/*breadcrumbs*/ -.ge-breadcrumbs {} -.ge-breadcrumbs-item {} - -/*navigation sidebar*/ -.ge-navigation-sidebar-container {} -.ge-navigation-sidebar-content {} -.ge-navigation-sidebar-title {} -.ge-navigation-sidebar-link {} diff --git a/init_gx_project.py b/init_gx_project.py deleted file mode 100644 index 353fcf9c4ed3c1c8b6accef0476c24a0fe3f2398..0000000000000000000000000000000000000000 --- a/init_gx_project.py +++ /dev/null @@ -1,32 +0,0 @@ -# init_gx_project.py -# GX 설정 파일(great_expectation.yml)에 데이터 소스(Datasource) 추가 -import great_expectations as gx -from great_expectations.checkpoint import 
Checkpoint -import os - -# GX 컨텍스트 생성 -context = gx.get_context() - -# 데이터 소스 추가 -datasource_name = "data_quality_datasource" - -# Pandas 데이터소스 설정 -datasource_config = { - "name": datasource_name, - "class_name": "Datasource", - "execution_engine": { - "class_name": "PandasExecutionEngine" - }, - "data_connectors": { - "default_runtime_data_connector": { - "class_name": "RuntimeDataConnector", - "batch_identifiers": ["default_identifier_name"] - } - } -} - -try: - context.add_datasource(**datasource_config) - print(f"데이터소스 '{datasource_name}' 생성 완료") -except Exception as e: - print(f"데이터소스가 이미 존재하거나 오류 발생: {e}") \ No newline at end of file diff --git a/plugins/custom_data_docs/styles/data_docs_custom_styles.css b/plugins/custom_data_docs/styles/data_docs_custom_styles.css deleted file mode 100644 index 8bf5a15216a855a8681e0981a635a45131f2f2c7..0000000000000000000000000000000000000000 --- a/plugins/custom_data_docs/styles/data_docs_custom_styles.css +++ /dev/null @@ -1,22 +0,0 @@ -/*index page*/ -.ge-index-page-site-name-title {} -.ge-index-page-table-container {} -.ge-index-page-table {} -.ge-index-page-table-profiling-links-header {} -.ge-index-page-table-expectations-links-header {} -.ge-index-page-table-validations-links-header {} -.ge-index-page-table-profiling-links-list {} -.ge-index-page-table-profiling-links-item {} -.ge-index-page-table-expectation-suite-link {} -.ge-index-page-table-validation-links-list {} -.ge-index-page-table-validation-links-item {} - -/*breadcrumbs*/ -.ge-breadcrumbs {} -.ge-breadcrumbs-item {} - -/*navigation sidebar*/ -.ge-navigation-sidebar-container {} -.ge-navigation-sidebar-content {} -.ge-navigation-sidebar-title {} -.ge-navigation-sidebar-link {} diff --git a/plugins/custom_expectations.py b/plugins/custom_expectations.py deleted file mode 100644 index 8005a58ea85675d10b7f34771b896e294c4807bd..0000000000000000000000000000000000000000 --- a/plugins/custom_expectations.py +++ /dev/null @@ -1,53 +0,0 @@ -# custom_expectations.py 
-#기초 품질 검증 (행 전체가 동일한 중복 데이터 찾아내는 규칙, 날짜 형식이 맞는 지 체크) -from great_expectations.execution_engine import PandasExecutionEngine -from great_expectations.expectations.expectation import ColumnMapExpectation -from great_expectations.expectations.metrics import ( - ColumnMapMetricProvider, - column_condition_partial, -) -import pandas as pd -from typing import Optional - -# 1. Redundancy (중복) 체크를 위한 커스텀 Expectation -class ExpectNoDuplicateRows(ColumnMapExpectation): - """전체 행의 중복을 체크하는 커스텀 Expectation""" - - metric_dependencies = ("table.row_count",) - - @classmethod - def _atomic_prescriptive_template(cls, **kwargs): - return "데이터셋에 중복된 행이 없어야 합니다." - - def validate_configuration(self, configuration: Optional[dict] = None) -> None: - super().validate_configuration(configuration) - - -# 2. Timeliness (적시성) - 날짜 형식 검증 -class ExpectColumnValuesToBeValidDate(ColumnMapExpectation): - """날짜 형식이 올바른지 검증""" - - map_metric = "column_values.valid_date" - - @classmethod - def _atomic_prescriptive_template(cls, **kwargs): - return "모든 날짜 값이 YYYY-MM-DD 형식이어야 합니다." - - -class ColumnValuesValidDate(ColumnMapMetricProvider): - """날짜 유효성을 체크하는 메트릭""" - - metric_name = "column_values.valid_date" - - @column_condition_partial(engine=PandasExecutionEngine) - def _pandas(cls, column, **kwargs): - def is_valid_date(val): - if pd.isna(val): - return False - try: - pd.to_datetime(val, format='%Y-%m-%d') - return True - except: - return False - - return column.apply(is_valid_date) \ No newline at end of file