
Evaluation when you have fully labelled data

In this example, our data contains a fully populated ground-truth column called cluster, which allows us to assess the accuracy of the final model against known labels.

from splink.datasets import splink_datasets
import altair as alt
alt.renderers.enable("html")

df = splink_datasets.fake_1000

df.head(2)
unique_id first_name surname dob city email cluster
0 0 Robert Alan 1971-06-24 NaN robert255@smith.net 0
1 1 Robert Allen 1971-05-24 NaN roberta25@smith.net 0
from splink.duckdb.linker import DuckDBLinker
from splink.duckdb.blocking_rule_library import block_on
import splink.duckdb.comparison_template_library as ctl
import splink.duckdb.comparison_library as cl

settings = {
    "link_type": "dedupe_only",
    "blocking_rules_to_generate_predictions": [
        block_on("first_name"),
        block_on("surname"),
    ],
    "comparisons": [
        ctl.name_comparison("first_name"),
        ctl.name_comparison("surname"),
        ctl.date_comparison("dob", cast_strings_to_date=True),
        cl.exact_match("city", term_frequency_adjustments=True),
        ctl.email_comparison("email", include_username_fuzzy_level=False),
    ],
    "retain_matching_columns": True,
    "retain_intermediate_calculation_columns": True,
}
linker = DuckDBLinker(df, settings, set_up_basic_logging=False)
deterministic_rules = [
    "l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1",
    "l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1",
    "l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2",
    "l.email = r.email"
]
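
The recall argument passed to estimate_probability_two_random_records_match below encodes our belief about how complete these deterministic rules are. As a rough, illustrative sketch of the underlying arithmetic (the pair count is made up, not produced by this run): Splink counts the pairs linked by the rules, scales that count up by 1/recall to estimate the total number of true matches, and divides by the total number of pairwise comparisons.

# Illustrative arithmetic only - Splink performs the equivalent calculation internally
pairs_found_by_rules = 2000           # hypothetical number of pairs linked by the rules above
assumed_recall = 0.7                  # we believe the rules recover ~70% of true matches
estimated_true_matches = pairs_found_by_rules / assumed_recall
total_comparisons = 1000 * 999 / 2    # all pairwise comparisons among the 1,000 input records
print(estimated_true_matches / total_comparisons)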

linker.estimate_probability_two_random_records_match(deterministic_rules, recall=0.7)
linker.estimate_u_using_random_sampling(max_pairs=1e6, seed=5)
session_dob = linker.estimate_parameters_using_expectation_maximisation(block_on("dob"))
session_email = linker.estimate_parameters_using_expectation_maximisation(block_on("email"))
linker.truth_space_table_from_labels_column(
    "cluster", match_weight_round_to_nearest=0.1
).as_pandas_dataframe(limit=5)
truth_threshold match_probability row_count p n tp tn fp fn P_rate ... precision recall specificity npv accuracy f1 f2 f0_5 p4 phi
0 -24.3 4.841400e-08 4353.0 2031.0 2322.0 2031.0 0.0 2322.0 0.0 0.466575 ... 0.466575 1.000000 0.000000 1.000000 0.466575 0.636278 0.813898 0.522296 0.000000 0.000000
1 -23.8 6.846774e-08 4353.0 2031.0 2322.0 2030.0 0.0 2322.0 1.0 0.466575 ... 0.466452 0.999508 0.000000 0.000000 0.466345 0.636065 0.813562 0.522146 0.000000 -0.016208
2 -23.7 7.338190e-08 4353.0 2031.0 2322.0 2030.0 234.0 2088.0 1.0 0.466575 ... 0.492958 0.999508 0.100775 0.995745 0.520101 0.660270 0.829113 0.548560 0.286607 0.221379
3 -22.6 1.572975e-07 4353.0 2031.0 2322.0 2030.0 428.0 1894.0 1.0 0.466575 ... 0.517329 0.999508 0.184324 0.997669 0.564668 0.681780 0.842463 0.572573 0.427302 0.307690
4 -22.5 1.685873e-07 4353.0 2031.0 2322.0 2030.0 583.0 1739.0 1.0 0.466575 ... 0.538604 0.999508 0.251077 0.998288 0.600276 0.700000 0.853443 0.593324 0.510093 0.366792

5 rows × 25 columns
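
Because the truth space table is returned as an ordinary pandas DataFrame, a decision threshold can be read straight off it. The snippet below is a minimal sketch, not part of the original example: it re-materialises the full table and picks the match-weight threshold that maximises F1, using only columns shown above.

truth_space = linker.truth_space_table_from_labels_column(
    "cluster", match_weight_round_to_nearest=0.1
).as_pandas_dataframe()

# Row with the highest F1 score; its truth_threshold is the candidate match weight cutoff
best = truth_space.loc[truth_space["f1"].idxmax()]
print(best["truth_threshold"], best["match_probability"], best["f1"])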

linker.roc_chart_from_labels_column("cluster")
linker.precision_recall_chart_from_labels_column("cluster")
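
Once a threshold has been settled on, predictions can be generated at that threshold and inspected alongside the cluster labels. A brief sketch (the 0.9 match probability is illustrative, not derived from this run):

# Score all candidate pairs, keeping only those at or above the chosen probability
df_predictions = linker.predict(threshold_match_probability=0.9)
df_predictions.as_pandas_dataframe(limit=5)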