Evaluation from ground truth column
Evaluation when you have fully labelled data
In this example, our data contains a fully-populated ground-truth column called `cluster`
that enables us to perform accuracy analysis of the final model.
from splink.datasets import splink_datasets
import altair as alt
# Select Altair's "html" renderer for chart output.
alt.renderers.enable("html")
# Load the fake_1000 example dataset exposed by splink_datasets; its `cluster`
# column is the ground-truth label used for the accuracy analysis below.
df = splink_datasets.fake_1000
# Preview the first two records.
df.head(2)
 | unique_id | first_name | surname | dob | city | email | cluster |
---|---|---|---|---|---|---|---|
0 | 0 | Robert | Alan | 1971-06-24 | NaN | robert255@smith.net | 0 |
1 | 1 | Robert | Allen | 1971-05-24 | NaN | roberta25@smith.net | 0 |
from splink.duckdb.linker import DuckDBLinker
from splink.duckdb.blocking_rule_library import block_on
import splink.duckdb.comparison_template_library as ctl
import splink.duckdb.comparison_library as cl
# Model configuration: a single-table de-duplication job ("dedupe_only").
settings = dict(
    link_type="dedupe_only",
    # Two blocking rules are used when generating predictions: one on
    # first_name and one on surname.
    blocking_rules_to_generate_predictions=[
        block_on(column) for column in ("first_name", "surname")
    ],
    # One comparison per column that the model scores.
    comparisons=[
        ctl.name_comparison("first_name"),
        ctl.name_comparison("surname"),
        ctl.date_comparison("dob", cast_strings_to_date=True),
        cl.exact_match("city", term_frequency_adjustments=True),
        ctl.email_comparison("email", include_username_fuzzy_level=False),
    ],
    # Ask for both the matching columns and the intermediate calculation
    # columns to be retained.
    retain_matching_columns=True,
    retain_intermediate_calculation_columns=True,
)

# Create the DuckDB linker over the input frame with the settings above;
# basic logging set-up is switched off.
linker = DuckDBLinker(df, settings, set_up_basic_logging=False)
# SQL conditions over a record pair (aliases `l` and `r`). Each one combines an
# exact match on one field with a small Levenshtein distance on another, or an
# exact email match.
deterministic_rules = [
"l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1",
"l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1",
"l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2",
"l.email = r.email"
]
# Estimate the probability that two random records match from the rules above,
# passing recall=0.7 for those rules.
linker.estimate_probability_two_random_records_match(deterministic_rules, recall=0.7)
# Estimate the u parameters by random sampling (max_pairs=1e6); the seed is
# fixed, presumably so repeated runs sample the same pairs.
linker.estimate_u_using_random_sampling(max_pairs=1e6, seed=5)
# Run expectation maximisation twice, blocking on a different column each time
# (dob, then email). NOTE(review): presumably two passes are needed because a
# pass cannot estimate parameters for the column it blocks on - confirm against
# the Splink docs.
session_dob = linker.estimate_parameters_using_expectation_maximisation(block_on("dob"))
session_email = linker.estimate_parameters_using_expectation_maximisation(block_on("email"))
# Build the truth-space table from the `cluster` ground-truth column, rounding
# match weights to the nearest 0.1.
truth_space_table = linker.truth_space_table_from_labels_column(
    "cluster",
    match_weight_round_to_nearest=0.1,
)
# Display the first five rows as a pandas DataFrame.
truth_space_table.as_pandas_dataframe(limit=5)
 | truth_threshold | match_probability | row_count | p | n | tp | tn | fp | fn | P_rate | ... | precision | recall | specificity | npv | accuracy | f1 | f2 | f0_5 | p4 | phi |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -24.3 | 4.841400e-08 | 4353.0 | 2031.0 | 2322.0 | 2031.0 | 0.0 | 2322.0 | 0.0 | 0.466575 | ... | 0.466575 | 1.000000 | 0.000000 | 1.000000 | 0.466575 | 0.636278 | 0.813898 | 0.522296 | 0.000000 | 0.000000 |
1 | -23.8 | 6.846774e-08 | 4353.0 | 2031.0 | 2322.0 | 2030.0 | 0.0 | 2322.0 | 1.0 | 0.466575 | ... | 0.466452 | 0.999508 | 0.000000 | 0.000000 | 0.466345 | 0.636065 | 0.813562 | 0.522146 | 0.000000 | -0.016208 |
2 | -23.7 | 7.338190e-08 | 4353.0 | 2031.0 | 2322.0 | 2030.0 | 234.0 | 2088.0 | 1.0 | 0.466575 | ... | 0.492958 | 0.999508 | 0.100775 | 0.995745 | 0.520101 | 0.660270 | 0.829113 | 0.548560 | 0.286607 | 0.221379 |
3 | -22.6 | 1.572975e-07 | 4353.0 | 2031.0 | 2322.0 | 2030.0 | 428.0 | 1894.0 | 1.0 | 0.466575 | ... | 0.517329 | 0.999508 | 0.184324 | 0.997669 | 0.564668 | 0.681780 | 0.842463 | 0.572573 | 0.427302 | 0.307690 |
4 | -22.5 | 1.685873e-07 | 4353.0 | 2031.0 | 2322.0 | 2030.0 | 583.0 | 1739.0 | 1.0 | 0.466575 | ... | 0.538604 | 0.999508 | 0.251077 | 0.998288 | 0.600276 | 0.700000 | 0.853443 | 0.593324 | 0.510093 | 0.366792 |
5 rows × 25 columns
# ROC chart computed against the `cluster` ground-truth column.
linker.roc_chart_from_labels_column("cluster")
# Precision-recall chart computed against the same ground-truth column.
linker.precision_recall_chart_from_labels_column("cluster")