Source code for meshed.tests.test_getitem

import pytest


from collections import Counter
from meshed import FuncNode
from meshed.dag import DAG
from pytest import fixture


def X_test(train_test_split):
    """Return the item at index 1 of ``train_test_split``.

    Presumably ``train_test_split`` follows the
    ``(X_train, X_test, y_train, y_test)`` ordering -- confirm against the caller.
    """
    test_inputs = train_test_split[1]
    return test_inputs


def y_test(train_test_split):
    """Return the item at index 3 of ``train_test_split``.

    Presumably ``train_test_split`` follows the
    ``(X_train, X_test, y_train, y_test)`` ordering -- confirm against the caller.
    """
    test_targets = train_test_split[3]
    return test_targets


def truth(y_test):
    """Return ``y_test`` unchanged.

    This identity function exists only to link the ``y_test`` output to the
    ``truth`` input expected by ``confusion_count``.
    """
    ground_truth = y_test
    return ground_truth



[docs]
def confusion_count(prediction, truth):
    """Count how often each (prediction, truth) pair occurs.

    The two iterables are paired up positionally with ``zip`` (so pairing stops
    at the shorter one), and the result is a ``Counter`` keyed by the pairs.
    """
    paired_outcomes = zip(prediction, truth)
    return Counter(paired_outcomes)




[docs]
def prediction(predict_proba, threshold):
    """Threshold the scores in ``predict_proba``.

    Returns a list holding, for each score in order, the result of
    ``score >= threshold``.
    """
    return [score >= threshold for score in predict_proba]




[docs]
def predict_proba(model, X_test):
    """Return whatever ``model.predict_proba`` produces for ``X_test``.

    The call is delegated as-is; no post-processing is applied to the scores.
    """
    scores = model.predict_proba(X_test)
    return scores



def _aligned_items(a, b):
    """Yield (k, a_value, b_value) triples for all k that are both a key of a and of b"""
    # reason for casting to dict is to make sure things like pd.Series use the right keys.
    # could also use k in a.keys() etc. to solve this.
    a = dict(a)
    b = dict(b)
    for k in a:
        if k in b:
            yield k, a[k], b[k]



[docs]
def dot_product(a, b):
    """
    Sum the products of the values that ``a`` and ``b`` hold under shared keys.

    Keys present in only one of the two inputs do not contribute.

    >>> dot_product({'a': 1, 'b': 2, 'c': 3}, {'b': 4, 'c': -1, 'd': 'whatever'})
    5
    """
    products = (
        a_value * b_value for _key, a_value, b_value in _aligned_items(a, b)
    )
    return sum(products)




[docs]
def classifier_score(confusion_count, confusion_value):
    """Score a classifier from its ``confusion_count`` and a ``confusion_value`` weighting.

    Intended to be curried by fixing the ``confusion_value`` dict.

    Nothing here is specific to binary classifiers, or to classifiers at all:
    the result is the dot product of the two inputs (values aligned by key,
    keys missing from either side contributing nothing) divided by the sum of
    the ``confusion_count`` values.
    """
    weighted_total = dot_product(confusion_count, confusion_value)
    outcome_total = sum(confusion_count.values())
    return weighted_total / outcome_total



@fixture
def bigger_dag():
    """Fixture providing a ``DAG`` built from this module's seven functions."""
    pipeline_funcs = [
        classifier_score,
        confusion_count,
        prediction,
        predict_proba,
        X_test,
        y_test,
        truth,
    ]
    return DAG(pipeline_funcs)


def test_full_subgraph(bigger_dag):
    """Slicing from ['truth', 'prediction'] to 'confusion_count' keeps only that node."""
    sub_dag = bigger_dag[['truth', 'prediction']:'confusion_count']
    expected_repr = (
        'DAG(func_nodes=['
        'FuncNode(prediction,truth -> confusion_count_ -> confusion_count)'
        '], name=None)'
    )
    assert repr(sub_dag) == expected_repr