# # # import numpy as np
# # # import pandas as pd

# # # def preprocess_input(raw_input, encoders, scaler, mean_opv, feature_order, df_temp_prep):
# # #     processed = {}

# # #     # OPV Doses
# # #     opv = pd.to_numeric(raw_input.get("OPV Doses"), errors="coerce")
# # #     processed["OPV Doses"] = int(opv) if not pd.isna(opv) else int(mean_opv)

# # #     # Specimen Number
# # #     specimen = pd.to_numeric(raw_input.get("Specimen Number"), errors="coerce")
# # #     processed["Specimen Number"] = int(specimen) if not pd.isna(specimen) else 1

# # #     # Body Temp
# # #     temp = pd.to_numeric(raw_input.get("Body Temp"), errors="coerce")
# # #     processed["Body Temp"] = float(temp) if not pd.isna(temp) else df_temp_prep["Body Temp"].mean()

# # #     # Clean Province / District
# # #     province = raw_input["Province"].title()
# # #     district = " ".join(w.capitalize() for w in raw_input["District"].split())

# # #     # Encode Province
# # #     le_province = encoders["Province"]
# # #     if province in le_province.classes_:
# # #         province_enc = le_province.transform([province])[0]
# # #     else:
# # #         province_enc = np.median(le_province.transform(le_province.classes_))

# # #     # Encode District
# # #     le_district = encoders["District"]
# # #     if district in le_district.classes_:
# # #         district_enc = le_district.transform([district])[0]
# # #     else:
# # #         district_enc = np.median(le_district.transform(le_district.classes_))

# # #     # Scale Province + District
# # #     scaled = scaler.transform(pd.DataFrame([{
# # #         "Province_encoded": province_enc,
# # #         "District_encoded": district_enc
# # #     }]))

# # #     processed["Province"] = scaled[0, 0]
# # #     processed["District"] = scaled[0, 1]

# # #     # Other categorical fields
# # #     for col in ["Case/Contact", "Specimen condition", "Sore Throat", "Fatigue", "Limb Discomfort"]:
# # #         val = raw_input[col].capitalize()
# # #         enc = encoders[col]

# # #         if val in enc.classes_:
# # #             processed[col] = int(enc.transform([val])[0])
# # #         else:
# # #             mode_val = df_temp_prep[col].mode()[0]
# # #             processed[col] = int(enc.transform([mode_val])[0])

# # #     # Final dataframe in correct order
# # #     df = pd.DataFrame([processed])
# # #     return df[feature_order]
# # # app/preprocessing.py


# # import numpy as np
# # import pandas as pd

# # # Helper: normalize input strings consistently
# # def _normalize_text(s: str):
# #     if s is None:
# #         return "-"
# #     s = str(s).strip()
# #     if s == "-" or s == "":
# #         return "-"
# #     # Title case words (keeps multi-word names)
# #     return " ".join([w.capitalize() for w in s.split()])

# # def preprocess_input(raw_input: dict,
# #                      encoders: dict,
# #                      scaler,
# #                      mean_opv_doses: float,
# #                      final_feature_order: list):
# #     """
# #     raw_input: dict with keys:
# #       Province, District, Case/Contact, OPV Doses, Specimen Number,
# #       Specimen condition, Body Temp, Sore Throat, Fatigue, Limb Discomfort
# #     encoders: dict loaded from your label_encoders.pkl (LabelEncoder objects)
# #     scaler: loaded StandardScaler
# #     mean_opv_doses: numeric mean for OPV doses (fallback)
# #     final_feature_order: list of columns in the order your model expects
# #     """

# #     processed = {}

# #     # OPV Doses -> numeric, fallback to mean_opv_doses
# #     opv = raw_input.get("OPV Doses", None)
# #     if isinstance(opv, str) and opv.strip() == "-":
# #         processed["OPV Doses"] = int(mean_opv_doses)
# #     else:
# #         n = pd.to_numeric(opv, errors="coerce")
# #         processed["OPV Doses"] = int(n) if not pd.isna(n) else int(mean_opv_doses)

# #     # Specimen Number -> numeric default 1
# #     spec_num = raw_input.get("Specimen Number", None)
# #     n = pd.to_numeric(spec_num, errors="coerce")
# #     processed["Specimen Number"] = int(n) if not pd.isna(n) else 1

# #     # Body Temp -> numeric, fallback to 37.0 if missing
# #     bt = raw_input.get("Body Temp", None)
# #     btn = pd.to_numeric(bt, errors="coerce")
# #     processed["Body Temp"] = float(btn) if not pd.isna(btn) else 37.0

# #     # Province / District normalization and encoding
# #     user_province = _normalize_text(raw_input.get("Province", "-"))
# #     user_district = _normalize_text(raw_input.get("District", "-"))

# #     le_province = encoders.get("Province")
# #     le_district = encoders.get("District")

# #     # If unseen, use first class as fallback (we print a warning)
# #     if user_province in getattr(le_province, "classes_", []):
# #         prov_enc = int(le_province.transform([user_province])[0])
# #     else:
# #         # fallback to first class
# #         prov_enc = int(le_province.transform([le_province.classes_[0]])[0])

# #     if user_district in getattr(le_district, "classes_", []):
# #         dist_enc = int(le_district.transform([user_district])[0])
# #     else:
# #         dist_enc = int(le_district.transform([le_district.classes_[0]])[0])

# #     # Scale province/district numeric pair. Scaler expects a 2d array-like
# #     arr = np.array([[prov_enc, dist_enc]])
# #     scaled = scaler.transform(arr)
# #     processed["Province"] = float(scaled[0, 0])
# #     processed["District"] = float(scaled[0, 1])

# #     # Other categorical features and their encoder keys in your dict
# #     # Map keys in pickle to friendly input names
# #     mapping = {
# #         "Case/Contact": "Case_or_Contact",
# #         "Specimen condition": "Specimen_condition_on_receipt",
# #         "Sore Throat": "Sore Throat",
# #         "Fatigue": "Fatigue",
# #         "Limb Discomfort": "Limb Discomfort"
# #     }

# #     # For each, normalize -> encode -> if unseen -> fallback to first class
# #     for input_col, encoder_key in mapping.items():
# #         enc = encoders.get(encoder_key)
# #         raw_val = raw_input.get(input_col, "-")
# #         if input_col in ["Case/Contact", "Specimen condition"]:
# #             normalized = _normalize_text(raw_val)
# #         else:
# #             # Sore Throat / Fatigue / Limb Discomfort
# #             normalized = _normalize_text(raw_val)

# #         if normalized in getattr(enc, "classes_", []):
# #             encoded = int(enc.transform([normalized])[0])
# #         else:
# #             # fallback to first class
# #             encoded = int(enc.transform([enc.classes_[0]])[0])
# #         processed[input_col] = encoded

# #     # Build final df in exact order
# #     df = pd.DataFrame([processed])
# #     # Ensure columns order exists in final_feature_order
# #     # final_feature_order is expected to contain the names like in training
# #     df = df[final_feature_order]
# #     return df


# # preprocessing.py

# import pandas as pd

# # Ordinal mappings (must match training notebook exactly)
# province_map = {
#     "Addis Ababa": 0, "Amhara": 1, "Afar": 2, "Benishangul-Gumuz": 3,
#     "Gambela": 4, "Harari": 5, "Oromia": 6, "Sidama": 7,
#     "SNNP": 8, "Somali": 9, "Tigray": 10
# }

# case_contact_map = {"Case": 2, "Contact": 1}
# specimen_num_map = {"1": 1, "2": 2}
# opv_map = {"0": 0, "1": 1, "2": 2, "3": 3}
# specimen_cond_map = {"Good": 2, "Poor": 1}
# sore_throat_map = {"Mild": 1, "Moderate": 2, "Severe": 3}
# fatigue_map = {"Mild": 1, "Moderate": 2, "Severe": 3}
# limb_discomfort_map = {"Yes": 1, "No": 0}


# def preprocess_input(data_dict: dict) -> pd.DataFrame:
#     """
#     Converts API input into the exact numeric format model expects.
#     """

#     df = pd.DataFrame([{
#         "Province": province_map.get(data_dict["Province"], 0),
#         "District": data_dict["District"],  # text left as-is
#         "Case/Contact": case_contact_map.get(data_dict["Case/Contact"], 1),
#         "OPV Doses": opv_map.get(data_dict["OPV Doses"], 0),
#         "Specimen Number": specimen_num_map.get(data_dict["Specimen Number"], 1),
#         "Specimen condition": specimen_cond_map.get(data_dict["Specimen condition"], 1),
#         "Body Temp": float(data_dict["Body Temp"]),
#         "Sore Throat": sore_throat_map.get(data_dict["Sore Throat"], 1),
#         "Fatigue": fatigue_map.get(data_dict["Fatigue"], 1),
#         "Limb Discomfort": limb_discomfort_map.get(data_dict["Limb Discomfort"], 0)
#     }])

#     return df


import pandas as pd
import numpy as np
from app.load_artifacts import (
    le_province, le_district, le_case_contact, le_specimen_condition,
    le_sore_throat, le_fatigue, le_limb_discomfort,
    loaded_scaler
)

# Fallback for "OPV Doses" when the submitted value is missing or not numeric.
mean_opv_doses = 2.0

# Column order of the one-row frame returned by preprocess_single_input.
# NOTE(review): presumably the order the model was trained with — confirm
# against the training notebook before changing it.
final_feature_order = [
    "Province",
    "District",
    "Case/Contact",
    "OPV Doses",
    "Specimen Number",
    "Specimen condition",
    "Body Temp",
    "Sore Throat",
    "Fatigue",
    "Limb Discomfort",
]


def _coerce_number(value, default):
    """Return ``value`` as a finite number, or ``default`` if that is impossible.

    Missing values, unparseable text and non-finite results (``inf``) all
    select ``default``; a non-finite value would otherwise make ``int()`` raise
    ``OverflowError`` in the caller.
    """
    number = pd.to_numeric(pd.Series([value]), errors="coerce").iloc[0]
    if pd.isna(number) or not np.isfinite(number):
        return default
    return number


def _encode_label(raw_value, encoder, normalize, field):
    """Encode ``raw_value`` with ``encoder``, tolerating case/whitespace noise.

    Matching is attempted in this order:
      1. the stripped value exactly as submitted (so classes that are not in
         title/capitalized form, e.g. acronyms, can still match),
      2. ``normalize(value)`` — the casing rule the caller used historically,
      3. a case-insensitive comparison with internal whitespace collapsed.

    If nothing matches, the encoder's first class is used (best-effort
    behaviour kept from the original code) and a warning is logged so the
    substitution is visible.

    Args:
        raw_value: Value submitted by the client; ``None`` and non-strings are
            accepted and treated as text via ``str()``.
        encoder: Object exposing ``classes_`` and ``transform``.
        normalize: Callable mapping a string to its expected class casing.
        field: Field name, used only in the warning message.

    Returns:
        The first element of ``encoder.transform([label])``.
    """
    import logging

    classes = list(encoder.classes_)
    text = "" if raw_value is None else str(raw_value).strip()

    label = next(
        (candidate for candidate in (text, normalize(text)) if candidate in classes),
        None,
    )
    if label is None:
        folded = " ".join(text.split()).casefold()
        label = next(
            (known for known in classes if str(known).casefold() == folded),
            None,
        )
    if label is None:
        label = classes[0]
        logging.getLogger(__name__).warning(
            "Unrecognised %s value %r; falling back to %r", field, raw_value, label
        )
    return encoder.transform([label])[0]


def preprocess_single_input(raw_input_dict):
    """Turn one API payload into the one-row feature frame.

    Args:
        raw_input_dict: Mapping with the keys ``Province``, ``District``,
            ``Case_Contact``, ``OPV_Doses``, ``Specimen_Number``,
            ``Specimen_condition``, ``Body_Temp``, ``Sore_Throat``,
            ``Fatigue`` and ``Limb_Discomfort``.

    Returns:
        A single-row ``pandas.DataFrame`` whose columns follow
        ``final_feature_order``.

    Raises:
        KeyError: If any of the expected keys is absent.
    """
    # Convert FastAPI keys to notebook-compatible keys (also validates that
    # every expected key is present before any work is done).
    raw_input_dict = {
        "Province": raw_input_dict["Province"],
        "District": raw_input_dict["District"],
        "Case/Contact": raw_input_dict["Case_Contact"],
        "OPV Doses": raw_input_dict["OPV_Doses"],
        "Specimen Number": raw_input_dict["Specimen_Number"],
        "Specimen condition": raw_input_dict["Specimen_condition"],
        "Body Temp": raw_input_dict["Body_Temp"],
        "Sore Throat": raw_input_dict["Sore_Throat"],
        "Fatigue": raw_input_dict["Fatigue"],
        "Limb Discomfort": raw_input_dict["Limb_Discomfort"],
    }

    # ---------- Numeric fields ----------
    # int() truncates toward zero, matching the original handling.
    processed_input = {
        "OPV Doses": int(_coerce_number(raw_input_dict["OPV Doses"], mean_opv_doses)),
        "Specimen Number": int(_coerce_number(raw_input_dict["Specimen Number"], 1)),
        "Body Temp": float(_coerce_number(raw_input_dict["Body Temp"], 37.0)),
    }

    # ---------- Encode province and district ----------
    province_code = _encode_label(
        raw_input_dict["Province"], le_province, str.title, "Province"
    )
    district_code = _encode_label(
        raw_input_dict["District"],
        le_district,
        lambda s: " ".join(word.capitalize() for word in s.split()),
        "District",
    )

    # Scale Province + District together; the column names are kept exactly
    # as before because the scaler may validate feature names.
    scaled = loaded_scaler.transform(pd.DataFrame([{
        "Province": province_code,
        "District": district_code
    }]))
    processed_input["Province"] = scaled[0, 0]
    processed_input["District"] = scaled[0, 1]

    # ---------- Other categorical fields ----------
    categorical_encoders = {
        "Case/Contact": le_case_contact,
        "Specimen condition": le_specimen_condition,
        "Sore Throat": le_sore_throat,
        "Fatigue": le_fatigue,
        "Limb Discomfort": le_limb_discomfort,
    }
    for col, encoder in categorical_encoders.items():
        processed_input[col] = _encode_label(
            raw_input_dict[col], encoder, str.capitalize, col
        )

    # ---------- Arrange DataFrame ----------
    return pd.DataFrame([processed_input])[final_feature_order]