Comparison playground

# Uncomment and run this cell if you're running in Google Colab.
# !pip install ipywidgets
# !pip install splink
# !jupyter nbextension enable --py widgetsnbextension

Comparison values¶

Run the following cell to get an interactive interface for trying out comparisons from the comparison_library.

The interface uses the default arguments for each comparison, and can only be used for comparisons that refer to a single column

In the following cells, you can get similar results for custom comparisons with an arbitrary number of input columns.

import splink.comparison_library as cl
from splink.internals.testing import comparison_vector_value, is_in_level
from splink import DuckDBAPI
import ipywidgets as widgets
from IPython.display import Markdown


def report_comparison_levels(comparison, values_dict, column_name):
    db_api = DuckDBAPI()
    levels = comparison.create_comparison_levels()
    comparison_dict = comparison.get_comparison("duckdb").as_dict()

    table_rows = []
    table_rows.append("| Match | Level | Description | SQL Condition |")
    table_rows.append("|-------|-------|-------------|---------------|")

    total_levels = len(levels)
    matched_level = None
    for i, level in enumerate(levels):
        level_number = total_levels - i - 1
        label = level.create_label_for_charts()
        sql_condition = comparison_dict['comparison_levels'][i]['sql_condition']

        is_match = is_in_level(level, values_dict, db_api)

        if is_match and matched_level is None:
            matched_level = level_number
            match_indicator = "✓"
            row = f"| {match_indicator} | **{level_number}** | **{label}** | `{sql_condition}` |"
        else:
            match_indicator = ""
            row = f"| {match_indicator} | {level_number} | {label} | `{sql_condition}` |"

        table_rows.append(row)

    markdown_output = "\n".join(table_rows)
    return markdown_output

def create_comparison_playground(column_name):

    comparison_types = [
        'ExactMatch', 'LevenshteinAtThresholds', 'JaroAtThresholds',
        'JaroWinklerAtThresholds', 'DamerauLevenshteinAtThresholds',
        'JaccardAtThresholds',
        'AbsoluteDateDifferenceAtThresholds',
        'ArrayIntersectAtSizes', 'DateOfBirthComparison',
         'EmailComparison',
        'NameComparison', 'PostcodeComparison'
    ]

    default_values = {
        'ExactMatch': ('john', 'jon'),
        'LevenshteinAtThresholds': ('smith', 'smyth'),
        'JaroAtThresholds': ('martha', 'matha'),
        'JaroWinklerAtThresholds': ('williams', 'willaims'),
        'DamerauLevenshteinAtThresholds': ('receive', 'recieve'),
        'CosineSimilarityAtThresholds': ('data science', 'science data'),
        'JaccardAtThresholds': ('0123456789', '012345678'),
        'AbsoluteDateDifferenceAtThresholds': ('2023-01-01', '2023-01-15'),
        'ArrayIntersectAtSizes': ('apple,banana,cherry', 'banana,cherry,date'),
        'DateOfBirthComparison': ('1990-05-15', '1990-05-16'),
        'EmailComparison': ('john.doe@example.com', 'john.doe@other.com'),
        'NameComparison': ('Elizabeth', 'Elisabeth'),
        'PostcodeComparison': ('SW1A 1AA', 'SW1A 1AB')
    }

    db_api = DuckDBAPI()

    docstrings = {}
    for comp_type in comparison_types:
        class_obj = getattr(cl, comp_type)
        init_doc = getattr(class_obj.__init__, '__doc__', None)
        docstrings[comp_type] = init_doc if init_doc else class_obj.__doc__


    def get_comparison(comp_type):
        if comp_type in ['DateOfBirthComparison']:

            return getattr(cl, comp_type)(column_name, input_is_string=True)
        if comp_type == 'AbsoluteDateDifferenceAtThresholds':
            return getattr(cl, comp_type)(column_name, input_is_string=True, metrics=["day", "month"], thresholds=[1, 1])
        elif comp_type in ['EmailComparison', 'ForenameSurnameComparison', 'NameComparison', 'PostcodeComparison', 'ArrayIntersectAtSizes']:
            return getattr(cl, comp_type)(column_name)
        else:
            return getattr(cl, comp_type)(column_name)

    def run_comparison(change):
        left_value = left_input.value if left_input.value != "" else None
        right_value = right_input.value if right_input.value != "" else None
        comparison = get_comparison(comparison_select.value)

        if comparison_select.value == 'ArrayIntersectAtSizes':
            left_value = left_value.split(',') if left_value else None
            right_value = right_value.split(',') if right_value else None

        values_dict = {f"{column_name}_l": left_value, f"{column_name}_r": right_value}

        output.clear_output()
        markdown_output = report_comparison_levels(comparison, values_dict, column_name)
        with output:
            display(Markdown("### Comparison levels:"))
            display(Markdown(markdown_output))

            docstring = docstrings.get(comparison_select.value, "No docstring available")
            processed_docstring = "\n".join(line.strip() for line in docstring.split("\n"))
            display(Markdown("### Comparison Function Docstring:"))
            display(Markdown(processed_docstring))


        # Store the markdown output for later use
        playground.markdown_output = markdown_output



    def on_comparison_change(change):
        new_value = change['new']
        left_value, right_value = default_values.get(new_value, ('', ''))

        # Temporarily unobserve the input widgets
        left_input.unobserve(run_comparison, names='value')
        right_input.unobserve(run_comparison, names='value')

        # Update the values
        left_input.value = left_value
        right_input.value = right_value

        # Re-observe the input widgets
        left_input.observe(run_comparison, names='value')
        right_input.observe(run_comparison, names='value')

        # Run the comparison once after updating both inputs
        run_comparison(None)


    comparison_select = widgets.Dropdown(
        options=comparison_types,
        value='ExactMatch',
        description='Comparison:',
    )
    left_input = widgets.Text(description=f"{column_name} Left:", value=default_values['ExactMatch'][0])
    right_input = widgets.Text(description=f"{column_name} Right:", value=default_values['ExactMatch'][1])
    output = widgets.Output()

    comparison_select.observe(on_comparison_change, names='value')
    for widget in (comparison_select, left_input, right_input):
        widget.observe(run_comparison, names='value')

    # Call run_comparison immediately to compute initial output
    playground = widgets.VBox([comparison_select, left_input, right_input, output])

    run_comparison(None)

    return playground

playground = create_comparison_playground("column")
display(playground)

VBox(children=(Dropdown(description='Comparison:', options=('ExactMatch', 'LevenshteinAtThresholds', 'JaroAtTh…

import splink.comparison_level_library as cll

first_name_comparison = cl.CustomComparison(
    comparison_levels=[
        cll.NullLevel("first_name"),
        cll.ExactMatchLevel("first_name"),
        {
            "sql_condition": "first_name_l = surname_r",
            "label_for_charts": "Match on reversed cols: first_name and surname",
        },
        cll.JaroWinklerLevel("first_name", 0.8),
        cll.ElseLevel(),
    ]
)
# Need to be able to pass values in as a dict {"first_name_l": "Robin", "first_name_r": "Robyn", "surname_l": "Linacre", "surname_r": "Linacre"}
values = {
    "first_name_l": "Robin",
    "first_name_r": "Linacre",
    "surname_l": "Linacre",
    "surname_r": "Robin",
}
display(Markdown(report_comparison_levels(first_name_comparison, values, "first_name")))

Match	Level	Description	SQL Condition
	4	first_name is NULL	`"first_name_l" IS NULL OR "first_name_r" IS NULL`
	3	Exact match on first_name	`"first_name_l" = "first_name_r"`
✓	2	Match on reversed cols: first_name and surname	`first_name_l = surname_r`
	1	Jaro-Winkler distance of first_name >= 0.8	`jaro_winkler_similarity("first_name_l", "first_name_r") >= 0.8`
	0	All other comparisons	`ELSE`