Linking financial transactions

Linking banking transactions¶

This example shows how to perform a one-to-one link on banking transactions.

The data is fake data, and was generated has the following features:

Money shows up in the destination account with some time delay
The amount sent and the amount received are not always the same - there are hidden fees and foreign exchange effects
The memo is sometimes truncated and content is sometimes missing

Since each origin payment should end up in the destination account, the probability_two_random_records_match of the model is known.

from splink import DuckDBAPI, Linker, SettingsCreator, block_on, splink_datasets

df_origin = splink_datasets.transactions_origin
df_destination = splink_datasets.transactions_destination

display(df_origin.head(2))
display(df_destination.head(2))

	ground_truth	memo	transaction_date	amount	unique_id
0	0	MATTHIAS C paym	2022-03-28	36.36	0
1	1	M CORVINUS dona	2022-02-14	221.91	1

	ground_truth	memo	transaction_date	amount	unique_id
0	0	MATTHIAS C payment BGC	2022-03-29	36.36	0
1	1	M CORVINUS BGC	2022-02-16	221.91	1

In the following chart, we can see this is a challenging dataset to link:

There are only 151 distinct transaction dates, with strong skew
Some 'memos' are used multiple times (up to 48 times)
There is strong skew in the 'amount' column, with 1,400 transactions of around 60.00

from splink.exploratory import profile_columns

db_api = DuckDBAPI()
profile_columns(
    [df_origin, df_destination],
    db_api=db_api,
    column_expressions=[
        "memo",
        "transaction_date",
        "amount",
    ],
)

from splink import DuckDBAPI, block_on
from splink.blocking_analysis import (
    cumulative_comparisons_to_be_scored_from_blocking_rules_chart,
)

# Design blocking rules that allow for differences in transaction date and amounts
blocking_rule_date_1 = """
    strftime(l.transaction_date, '%Y%m') = strftime(r.transaction_date, '%Y%m')
    and substr(l.memo, 1,3) = substr(r.memo,1,3)
    and l.amount/r.amount > 0.7   and l.amount/r.amount < 1.3
"""

# Offset by half a month to ensure we capture case when the dates are e.g. 31st Jan and 1st Feb
blocking_rule_date_2 = """
    strftime(l.transaction_date+15, '%Y%m') = strftime(r.transaction_date, '%Y%m')
    and substr(l.memo, 1,3) = substr(r.memo,1,3)
    and l.amount/r.amount > 0.7   and l.amount/r.amount < 1.3
"""

blocking_rule_memo = block_on("substr(memo,1,9)")

blocking_rule_amount_1 = """
round(l.amount/2,0)*2 = round(r.amount/2,0)*2 and yearweek(r.transaction_date) = yearweek(l.transaction_date)
"""

blocking_rule_amount_2 = """
round(l.amount/2,0)*2 = round((r.amount+1)/2,0)*2 and yearweek(r.transaction_date) = yearweek(l.transaction_date + 4)
"""

blocking_rule_cheat = block_on("unique_id")


brs = [
    blocking_rule_date_1,
    blocking_rule_date_2,
    blocking_rule_memo,
    blocking_rule_amount_1,
    blocking_rule_amount_2,
    blocking_rule_cheat,
]


db_api = DuckDBAPI()

cumulative_comparisons_to_be_scored_from_blocking_rules_chart(
    table_or_tables=[df_origin, df_destination],
    blocking_rules=brs,
    db_api=db_api,
    link_type="link_only"
)

# Full settings for linking model
import splink.comparison_level_library as cll
import splink.comparison_library as cl

comparison_amount = {
    "output_column_name": "amount",
    "comparison_levels": [
        cll.NullLevel("amount"),
        cll.ExactMatchLevel("amount"),
        cll.PercentageDifferenceLevel("amount", 0.01),
        cll.PercentageDifferenceLevel("amount", 0.03),
        cll.PercentageDifferenceLevel("amount", 0.1),
        cll.PercentageDifferenceLevel("amount", 0.3),
        cll.ElseLevel(),
    ],
    "comparison_description": "Amount percentage difference",
}

# The date distance is one sided becaause transactions should only arrive after they've left
# As a result, the comparison_template_library date difference functions are not appropriate
within_n_days_template = "transaction_date_r - transaction_date_l <= {n} and transaction_date_r >= transaction_date_l"

comparison_date = {
    "output_column_name": "transaction_date",
    "comparison_levels": [
        cll.NullLevel("transaction_date"),
        {
            "sql_condition": within_n_days_template.format(n=1),
            "label_for_charts": "1 day",
        },
        {
            "sql_condition": within_n_days_template.format(n=4),
            "label_for_charts": "<=4 days",
        },
        {
            "sql_condition": within_n_days_template.format(n=10),
            "label_for_charts": "<=10 days",
        },
        {
            "sql_condition": within_n_days_template.format(n=30),
            "label_for_charts": "<=30 days",
        },
        cll.ElseLevel(),
    ],
    "comparison_description": "Transaction date days apart",
}


settings = SettingsCreator(
    link_type="link_only",
    probability_two_random_records_match=1 / len(df_origin),
    blocking_rules_to_generate_predictions=[
        blocking_rule_date_1,
        blocking_rule_date_2,
        blocking_rule_memo,
        blocking_rule_amount_1,
        blocking_rule_amount_2,
        blocking_rule_cheat,
    ],
    comparisons=[
        comparison_amount,
        cl.LevenshteinAtThresholds("memo", [2, 6, 10]),
        comparison_date,
    ],
    retain_intermediate_calculation_columns=True,
)

linker = Linker(
    [df_origin, df_destination],
    settings,
    input_table_aliases=["__ori", "_dest"],
    db_api=db_api,
)

linker.training.estimate_u_using_random_sampling(max_pairs=1e6)

You are using the default value for `max_pairs`, which may be too small and thus lead to inaccurate estimates for your model's u-parameters. Consider increasing to 1e8 or 1e9, which will result in more accurate estimates, but with a longer run time.
----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - amount (no m values are trained).
    - memo (no m values are trained).
    - transaction_date (no m values are trained).

linker.training.estimate_parameters_using_expectation_maximisation(block_on("memo"))

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l."memo" = r."memo"

Parameter estimates will be made for the following comparison(s):
    - amount
    - transaction_date

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - memo

Iteration 1: Largest change in params was -0.588 in the m_probability of amount, level `Exact match on amount`
Iteration 2: Largest change in params was -0.176 in the m_probability of transaction_date, level `1 day`
Iteration 3: Largest change in params was 0.00996 in the m_probability of amount, level `Percentage difference of 'amount' within 10.00%`
Iteration 4: Largest change in params was 0.0022 in the m_probability of transaction_date, level `<=30 days`
Iteration 5: Largest change in params was 0.000385 in the m_probability of transaction_date, level `<=30 days`
Iteration 6: Largest change in params was -0.000255 in the m_probability of amount, level `All other comparisons`
Iteration 7: Largest change in params was -0.000229 in the m_probability of amount, level `All other comparisons`
Iteration 8: Largest change in params was -0.000208 in the m_probability of amount, level `All other comparisons`
Iteration 9: Largest change in params was -0.00019 in the m_probability of amount, level `All other comparisons`
Iteration 10: Largest change in params was -0.000173 in the m_probability of amount, level `All other comparisons`
Iteration 11: Largest change in params was -0.000159 in the m_probability of amount, level `All other comparisons`
Iteration 12: Largest change in params was -0.000146 in the m_probability of amount, level `All other comparisons`
Iteration 13: Largest change in params was -0.000135 in the m_probability of amount, level `All other comparisons`
Iteration 14: Largest change in params was -0.000124 in the m_probability of amount, level `All other comparisons`
Iteration 15: Largest change in params was -0.000115 in the m_probability of amount, level `All other comparisons`
Iteration 16: Largest change in params was -0.000107 in the m_probability of amount, level `All other comparisons`
Iteration 17: Largest change in params was -9.92e-05 in the m_probability of amount, level `All other comparisons`

EM converged after 17 iterations

Your model is not yet fully trained. Missing estimates for:
    - memo (no m values are trained).





<EMTrainingSession, blocking on l."memo" = r."memo", deactivating comparisons memo>

session = linker.training.estimate_parameters_using_expectation_maximisation(block_on("amount"))

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l."amount" = r."amount"

Parameter estimates will be made for the following comparison(s):
    - memo
    - transaction_date

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - amount

Iteration 1: Largest change in params was -0.373 in the m_probability of memo, level `Exact match on memo`
Iteration 2: Largest change in params was -0.108 in the m_probability of memo, level `Exact match on memo`
Iteration 3: Largest change in params was 0.0202 in the m_probability of memo, level `Levenshtein distance of memo <= 10`
Iteration 4: Largest change in params was -0.00538 in the m_probability of memo, level `Exact match on memo`
Iteration 5: Largest change in params was 0.00482 in the m_probability of memo, level `All other comparisons`
Iteration 6: Largest change in params was 0.00508 in the m_probability of memo, level `All other comparisons`
Iteration 7: Largest change in params was 0.00502 in the m_probability of memo, level `All other comparisons`
Iteration 8: Largest change in params was 0.00466 in the m_probability of memo, level `All other comparisons`
Iteration 9: Largest change in params was 0.00409 in the m_probability of memo, level `All other comparisons`
Iteration 10: Largest change in params was 0.00343 in the m_probability of memo, level `All other comparisons`
Iteration 11: Largest change in params was 0.00276 in the m_probability of memo, level `All other comparisons`
Iteration 12: Largest change in params was 0.00216 in the m_probability of memo, level `All other comparisons`
Iteration 13: Largest change in params was 0.00165 in the m_probability of memo, level `All other comparisons`
Iteration 14: Largest change in params was 0.00124 in the m_probability of memo, level `All other comparisons`
Iteration 15: Largest change in params was 0.000915 in the m_probability of memo, level `All other comparisons`
Iteration 16: Largest change in params was 0.000671 in the m_probability of memo, level `All other comparisons`
Iteration 17: Largest change in params was 0.000488 in the m_probability of memo, level `All other comparisons`
Iteration 18: Largest change in params was 0.000353 in the m_probability of memo, level `All other comparisons`
Iteration 19: Largest change in params was 0.000255 in the m_probability of memo, level `All other comparisons`
Iteration 20: Largest change in params was 0.000183 in the m_probability of memo, level `All other comparisons`
Iteration 21: Largest change in params was 0.000132 in the m_probability of memo, level `All other comparisons`
Iteration 22: Largest change in params was 9.45e-05 in the m_probability of memo, level `All other comparisons`

EM converged after 22 iterations

Your model is fully trained. All comparisons have at least one estimate for their m and u values

linker.visualisations.match_weights_chart()

df_predict = linker.inference.predict(threshold_match_probability=0.001)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

linker.visualisations.comparison_viewer_dashboard(
    df_predict, "dashboards/comparison_viewer_transactions.html", overwrite=True
)
from IPython.display import IFrame

IFrame(
    src="./dashboards/comparison_viewer_transactions.html", width="100%", height=1200
)

pred_errors = linker.evaluation.prediction_errors_from_labels_column(
    "ground_truth", include_false_positives=True, include_false_negatives=False
)
linker.visualisations.waterfall_chart(pred_errors.as_record_dict(limit=5))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

pred_errors = linker.evaluation.prediction_errors_from_labels_column(
    "ground_truth", include_false_positives=False, include_false_negatives=True
)
linker.visualisations.waterfall_chart(pred_errors.as_record_dict(limit=5))