Exercise 6.10
Data Quality Dashboard
The capstone: one function you run on any production export to decide whether it's fit to use. The starter builds a realistic field dataset carrying every fault from this chapter: a negative rate, a meter-failure blank, a duplicated export row, a reporting gap, and a spike.
Write data_quality_report(df) returning a dict with:
n_records: total rowsdate_range:(min_date, max_date)missing_by_col:{"oil_bopd": n, "water_bwpd": n}(NaN counts)n_impossible: rows with a negative oil or water raten_duplicates: repeatedwell_id+daterows (extra copies)gaps_total: total missing reporting months across all wellsoutliers_total: per-well oil readings beyond 3σreadiness_score: percent of records free of any hard fault
(missing or impossible or duplicate). Outliers and gaps are flagged for review but don't fail a record; a workover spike is real data.
A readiness score and a one-line fault breakdown is what tells a team in five seconds whether to trust the dataset or send it back.
Stuck? Reveal hints one at a time — they progress from nudge to near-solution.
visibilityReveal reference solutionexpand_more
Try solving it yourself first — the hints walk you through it. The solution below is one valid approach; yours may differ and still be correct.
import numpy as np
import pandas as pd
_d1 = pd.date_range("2023-01-01", periods=20, freq="MS")
_rows = []
for _i, _d in enumerate(_d1):
_rows.append({"well_id": "OD-001", "date": _d, "oil_bopd": 2400 - 12 * _i, "water_bwpd": 120 + 5 * _i, "days_on": 30})
for _i, _d in enumerate(_d1):
if _i == 15:
continue
_rows.append({"well_id": "OD-002", "date": _d, "oil_bopd": 1800 - 10 * _i, "water_bwpd": 300 + 8 * _i, "days_on": 30})
df = pd.DataFrame(_rows)
df.loc[(df["well_id"] == "OD-001") & (df["date"] == _d1[5]), "oil_bopd"] = -40
df.loc[(df["well_id"] == "OD-001") & (df["date"] == _d1[12]), "water_bwpd"] = np.nan
df.loc[(df["well_id"] == "OD-002") & (df["date"] == _d1[10]), "oil_bopd"] = 7000
_dup = df[(df["well_id"] == "OD-001") & (df["date"] == _d1[1])].copy()
df = pd.concat([df, _dup], ignore_index=True)
def data_quality_report(df):
n_records = len(df)
missing_by_col = {c: int(df[c].isna().sum()) for c in ["oil_bopd", "water_bwpd"]}
missing_mask = df["oil_bopd"].isna() | df["water_bwpd"].isna()
impossible_mask = (df["oil_bopd"] < 0) | (df["water_bwpd"] < 0)
duplicate_mask = df.duplicated(subset=["well_id", "date"], keep="first")
# Reporting gaps: missing months in each well's expected calendar.
gaps_total = 0
for _, g in df.groupby("well_id"):
present = set(pd.to_datetime(g["date"]))
expected = pd.date_range(g["date"].min(), g["date"].max(), freq="MS")
gaps_total += sum(1 for d in expected if d not in present)
# Statistical outliers: per-well oil beyond 3 sigma.
outliers_total = 0
for _, g in df.groupby("well_id"):
s = g["oil_bopd"].dropna()
mu, sd = s.mean(), s.std()
if sd and not pd.isna(sd):
outliers_total += int(((s - mu).abs() > 3 * sd).sum())
hard_fault = missing_mask | impossible_mask | duplicate_mask
readiness_score = 100.0 * (1 - int(hard_fault.sum()) / n_records)
return {
"n_records": n_records,
"date_range": (df["date"].min(), df["date"].max()),
"missing_by_col": missing_by_col,
"n_impossible": int(impossible_mask.sum()),
"n_duplicates": int(duplicate_mask.sum()),
"gaps_total": gaps_total,
"outliers_total": outliers_total,
"readiness_score": readiness_score,
}
r = data_quality_report(df)
print("=" * 48)
print("DATA QUALITY REPORT")
print("=" * 48)
print(f" Records: {r['n_records']}")
print(f" Date range: {r['date_range'][0].strftime('%Y-%m')} -> {r['date_range'][1].strftime('%Y-%m')}")
print(f" Missing: {r['missing_by_col']}")
print(f" Impossible: {r['n_impossible']}")
print(f" Duplicates: {r['n_duplicates']}")
print(f" Gaps: {r['gaps_total']} month(s)")
print(f" Outliers: {r['outliers_total']}")
print(f" READINESS: {r['readiness_score']:.1f}%")
lockCopying code is a Full Access feature.