Febrl3 Dedupe
import pandas as pd
import altair as alt
alt.renderers.enable('mimetype')
df = pd.read_csv("./data/febrl/dataset3.csv", delimiter=", ", dtype={"date_of_birth":str}, engine="python")
df["cluster"] = df["rec_id"].apply(lambda x: "-".join(x.split('-')[:2]))
df.head(2)
from splink.duckdb.duckdb_linker import DuckDBLinker
settings = {
"unique_id_column_name": "rec_id",
"link_type": "dedupe_only",
}
linker = DuckDBLinker(df, settings)
linker.missingness_chart()
linker.profile_columns(list(df.columns))
potential_blocking_rules = [
"l.soc_sec_id = r.soc_sec_id",
"l.given_name = r.given_name",
"l.surname = r.surname",
"l.date_of_birth = r.date_of_birth",
"l.postcode = r.postcode"
]
linker.cumulative_num_comparisons_from_blocking_rules_chart(potential_blocking_rules)
from splink.duckdb.duckdb_linker import DuckDBLinker
import splink.duckdb.duckdb_comparison_level_library as cll
import splink.duckdb.duckdb_comparison_library as cl
settings = {
"unique_id_column_name": "rec_id",
"link_type": "dedupe_only",
"blocking_rules_to_generate_predictions": potential_blocking_rules,
"comparisons": [
{
"output_column_name": "Given name",
"comparison_levels": [
cll.null_level("given_name"),
cll.exact_match_level("given_name", term_frequency_adjustments=True),
{
"sql_condition": '"given_name_l" = "surname_r"',
"label_for_charts": "Exact match on reversed cols, l to r",
},
cll.distance_function_level(
"given_name", "jaro_winkler_similarity", 0.9
),
cll.distance_function_level(
"given_name", "jaro_winkler_similarity", 0.7
),
{
"sql_condition": "jaro_winkler_similarity(given_name_l, surname_r) > 0.7",
"label_for_charts": "Jar on reversed cols, l to r",
},
cll.else_level(),
],
},
{
"output_column_name": "Surname",
"comparison_levels": [
cll.null_level("surname"),
cll.exact_match_level("surname", term_frequency_adjustments=True),
{
"sql_condition": '"given_name_r" = "surname_l"',
"label_for_charts": "Exact match on reversed cols, r to l",
},
cll.distance_function_level("surname", "jaro_winkler_similarity", 0.9),
cll.distance_function_level("surname", "jaro_winkler_similarity", 0.7),
{
"sql_condition": "jaro_winkler_similarity(given_name_r, surname_l) > 0.7",
"label_for_charts": "Jaro on reversed cols, l to r",
},
cll.else_level(),
],
},
cl.levenshtein_at_thresholds(
"date_of_birth", [1, 2], term_frequency_adjustments=True
),
cl.levenshtein_at_thresholds("soc_sec_id", [2]),
cl.exact_match("street_number", term_frequency_adjustments=True),
cl.exact_match("postcode", term_frequency_adjustments=True),
],
"retain_intermediate_calculation_columns": True,
}
linker = DuckDBLinker(df, settings)
deterministic_rules = [
"l.soc_sec_id = r.soc_sec_id",
"l.given_name = r.given_name and l.surname = r.surname and l.date_of_birth = r.date_of_birth",
"l.given_name = r.surname and l.surname = r.given_name and l.date_of_birth = r.date_of_birth"
]
linker.estimate_probability_two_random_records_match(deterministic_rules, recall=0.9)
linker.estimate_u_using_random_sampling(target_rows=1e6)
comparison = linker._settings_obj.comparisons[2].as_dict()
session_dob = linker.estimate_parameters_using_expectation_maximisation("substr(l.date_of_birth,1,3) = substr(r.date_of_birth,1,3)")
session_postcode = linker.estimate_parameters_using_expectation_maximisation("substr(l.postcode,1,2) = substr(r.postcode,1,2)")
linker.match_weights_chart()
results = linker.predict(threshold_match_probability=0.2)
linker.roc_chart_from_labels_column("cluster")
pred_errors_df = linker.prediction_errors_from_labels_column("cluster").as_pandas_dataframe()
len(pred_errors_df)
pred_errors_df.head()
records = linker.prediction_errors_from_labels_column("cluster").as_record_dict(limit=10)
linker.waterfall_chart(records)