QA from ground truth column
Quality assurance when you have fully labelled data
In this example, our data contains a fully-populated ground-truth column called `cluster`,
which enables us to analyse the accuracy of the final model.
import pandas as pd
import altair as alt
# Select Altair's "mimetype" renderer for chart output.
alt.renderers.enable("mimetype")

# Load the example dataset and preview its first two rows.
data_path = "./data/fake_1000.csv"
df = pd.read_csv(data_path)
df.head(2)
from splink.duckdb.duckdb_linker import DuckDBLinker
from splink.duckdb import duckdb_comparison_library as cl
# Rules that decide which record pairs are generated for scoring:
# pairs that agree on first name, or pairs that agree on surname.
prediction_blocking_rules = [
    "l.first_name = r.first_name",
    "l.surname = r.surname",
]

# One comparison per column. The name, dob and email columns use Levenshtein
# thresholds; city uses an exact match with term-frequency adjustments on.
column_comparisons = [
    cl.levenshtein_at_thresholds("first_name", 2),
    cl.levenshtein_at_thresholds("surname", 2),
    cl.levenshtein_at_thresholds("dob"),
    cl.exact_match("city", term_frequency_adjustments=True),
    cl.levenshtein_at_thresholds("email"),
]

# Model settings for deduplicating a single dataset.
settings = {
    "link_type": "dedupe_only",
    "blocking_rules_to_generate_predictions": prediction_blocking_rules,
    "comparisons": column_comparisons,
    # NOTE(review): presumably retained so later error inspection and the
    # waterfall chart can see the per-comparison columns; confirm.
    "retain_matching_columns": True,
    "retain_intermediate_calculation_columns": True,
}

# Build the DuckDB-backed linker over the loaded dataframe.
linker = DuckDBLinker(df, settings, set_up_basic_logging=False)
# Deterministic matching rules used to estimate the probability that two
# random records match. The recall passed alongside them is 0.7.
deterministic_rules = [
    "l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1",
    "l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1",
    "l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2",
    "l.email = r.email",
]
linker.estimate_probability_two_random_records_match(
    deterministic_rules,
    recall=0.7,
)

# Estimate the u parameters using random sampling.
linker.estimate_u_using_random_sampling(target_rows=1e6)

# Run expectation maximisation twice, once per rule below.
# NOTE(review): presumably two passes are needed so that every comparison,
# including dob and email themselves, gets estimated in at least one; confirm.
dob_rule = "l.dob = r.dob"
email_rule = "l.email = r.email"
session_dob = linker.estimate_parameters_using_expectation_maximisation(dob_rule)
session_email = linker.estimate_parameters_using_expectation_maximisation(email_rule)
# Truth-space table computed against the ground-truth "cluster" column,
# with match weights rounded to the nearest 0.1; preview up to five rows.
truth_space = linker.truth_space_table_from_labels_column(
    "cluster",
    match_weight_round_to_nearest=0.1,
)
truth_space.as_pandas_dataframe(limit=5)

# ROC chart from the same labels column.
linker.roc_chart_from_labels_column("cluster")

# Precision-recall chart from the same labels column.
linker.precision_recall_chart_from_labels_column("cluster")
# Inspect some prediction errors, both false positives and false negatives,
# judged against the ground-truth "cluster" column.
#
# The errors table is built once and reused. The original made the identical
# call twice, which duplicated the work and meant the previewed rows and the
# charted records came from two separate results.
prediction_errors = linker.prediction_errors_from_labels_column(
    "cluster", include_false_negatives=True, include_false_positives=True
)

# Preview up to five erroneous pairs as a pandas dataframe.
prediction_errors.as_pandas_dataframe(limit=5)

# Draw a waterfall chart for up to five of those pairs.
records = prediction_errors.as_record_dict(limit=5)
linker.waterfall_chart(records)