Linking two tables of persons
import pandas as pd
df_l = pd.read_parquet("./data/fake_df_l.parquet")
df_r = pd.read_parquet("./data/fake_df_r.parquet")
df_l.head(2)
from splink.duckdb.duckdb_linker import DuckDBLinker
from splink.duckdb import duckdb_comparison_library as cl
settings = {
"link_type": "link_only",
"blocking_rules_to_generate_predictions": [
"l.first_name = r.first_name",
"l.surname = r.surname",
],
"comparisons": [
cl.levenshtein_at_thresholds("first_name", 2),
cl.levenshtein_at_thresholds("surname"),
cl.levenshtein_at_thresholds("dob"),
cl.exact_match("city", term_frequency_adjustments=True),
cl.levenshtein_at_thresholds("email"),
],
}
linker = DuckDBLinker([df_l, df_r], settings, input_table_aliases=["df_left", "df_right"])
deterministic_rules = [
"l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1",
"l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1",
"l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2",
"l.email = r.email"
]
linker.estimate_probability_two_random_records_match(deterministic_rules, recall=0.7)
linker.estimate_u_using_random_sampling(target_rows=1e6)
session_dob = linker.estimate_parameters_using_expectation_maximisation("l.dob = r.dob")
session_email = linker.estimate_parameters_using_expectation_maximisation("l.email = r.email")
results = linker.predict(threshold_match_probability=0.9)
results.as_pandas_dataframe(limit=5)