SSVC CSV Analyzer

This module provides a script for analyzing an SSVC tree csv file.

usage: csv_analyzer.py [-h] [--outcol OUTCOL] [--permutation] csvfile

Analyze an SSVC tree csv file

positional arguments:
  csvfile          the csv file to analyze

options:
  -h, --help       show this help message and exit
  --outcol OUTCOL  the name of the outcome column
  --permutation    use permutation importance instead of drop column importance

Example

Given a test.csv file like this:

row,Exploitation,Exposure,Automatable,Human Impact,Priority
1,none,small,no,low,defer
2,none,small,no,medium,defer
3,none,small,no,high,scheduled
...

Analyze the csv file:

$ python csv_analyzer.py test.csv

Feature Importance after Dropping Each Feature in test.csv
         feature  feature_importance
0  exploitation_            0.347222
1  human_impact_            0.291667
2   automatable_            0.180556
3      exposure_            0.166667

Higher values imply more important features.

`_clean_table(df)`

Clean up a dataframe, normalizing column names and dropping columns we don't need

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	the dataframe to clean	required

Returns:

Type	Description
`DataFrame`	the cleaned dataframe

Source code in src/ssvc/csv_analyzer.py

def _clean_table(df: pd.DataFrame) -> pd.DataFrame:
    """
    Produce a tidied version of a dataframe: column names are normalized
    and bookkeeping columns are removed

    Args:
        df: the dataframe to clean

    Returns:
        the cleaned dataframe
    """
    # columns that are not wanted in the cleaned table
    unwanted = ["row"]
    # rename first so the names in `unwanted` are matched against
    # the normalized column names
    normalized = df.rename(columns=_col_norm)
    # errors="ignore" lets tables without these columns pass through untouched
    return normalized.drop(columns=unwanted, errors="ignore")

`_col_norm(c)`

Normalize a column name

Parameters:

Name	Type	Description	Default
`c`	`str`	the column name to normalize	required

Returns:

Type	Description
`str`	the normalized column name

Source code in src/ssvc/csv_analyzer.py

def _col_norm(c: str) -> str:
    """
    Normalize a column name

    Args:
        c: the column name to normalize

    Returns:
        the normalized column name
    """
    new_col = re.sub("[^0-9a-zA-Z]+", "_", c)
    new_col = new_col.lower()
    return new_col

`_imp_df(column_names, importances)`

Create a dataframe of feature importances

Parameters:

Name	Type	Description	Default
`column_names`	`list`	the names of the columns	required
`importances`	`list`	the feature importances	required

Returns:

Type	Description
`DataFrame`	a dataframe of feature importances

Source code in src/ssvc/csv_analyzer.py

def _imp_df(column_names: list, importances: list) -> pd.DataFrame:
    """
    Create a dataframe of feature importances

    Args:
        column_names: the names of the columns
        importances: the feature importances

    Returns:
        a dataframe of feature importances
    """
    df = (
        pd.DataFrame(
            {"feature": column_names, "feature_importance": importances}
        )
        .sort_values("feature_importance", ascending=False)
        .reset_index(drop=True)
    )
    return df

`_prepare_data(df, target, permute=False)`

Clean a dataframe and encode its feature columns as ordinals, ready for feature importance analysis

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	the dataframe to analyze	required
`target`	`str`	the name of the target column to analyze against	required
`permute`	`bool`	use permutation importance instead of drop column importance	`False`

Returns:

Type	Description
`(DataFrame, DataFrame)`	a tuple of (the ordinal-encoded feature dataframe, the target column)

Source code in src/ssvc/csv_analyzer.py

def _prepare_data(
    df: pd.DataFrame, target: str, permute: bool = False
) -> (pd.DataFrame, pd.Series):
    """
    Clean a dataframe and encode its features as ordinals for importance analysis

    The table is cleaned with `_clean_table`, split into features and target
    with `_split_data`, and every feature column is replaced by an integer
    coded column named `<column>_`.

    Args:
        df: the dataframe to analyze
        target: the name of the target column to analyze against
            (compared against the cleaned, i.e. normalized, column names)
        permute: currently unused by this function; kept so existing
            callers that pass it keep working

    Returns:
        a tuple of (the ordinal-encoded feature dataframe, the target column)

    Raises:
        KeyError: if the target column is not present after cleaning
    """

    df = _clean_table(df)
    # check for target column
    if target not in df.columns:
        raise KeyError(f"Column '{target}' not found in {list(df.columns)}")

    X, y = _split_data(df, target)
    # turn features into ordinals
    # this assumes that every column is an ordinal label
    # and that the ordinals are sorted in ascending order
    # (codes are handed out in order of first appearance in the column)
    encoded = {}
    for c in X.columns:
        mapper = {value: code for code, value in enumerate(X[c].unique())}
        # Series.map yields the integer codes directly; unlike
        # Series.replace it does not depend on implicit downcasting of an
        # object column. Collecting into a fresh dict instead of assigning
        # back into X avoids writing to a selection of the cleaned table
        # and keeps an existing "<c>_" column from being clobbered.
        encoded[f"{c}_"] = X[c].map(mapper)
    X2 = pd.DataFrame(encoded, index=X.index)

    return X2, y

`_split_data(df, target)`

Split a dataframe into features and target

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	the dataframe to split	required
`target`	`str`	the name of the target column	required

Returns:

Type	Description
`(DataFrame, DataFrame)`	a tuple of (features, target)

Source code in src/ssvc/csv_analyzer.py

def _split_data(df: pd.DataFrame, target: str) -> (pd.DataFrame, pd.DataFrame):
    """
    Split a dataframe into features and target

    Args:
        df: the dataframe to split
        target: the name of the target column

    Returns:
        a tuple of (features, target)
    """

    # construct feature list
    features = [c for c in df.columns if c != target]
    y = df[target]
    X = df[features]
    return X, y

`drop_col_feature_importance(df, target)`

Compute feature importance using drop column feature importance

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	the dataframe to analyze	required
`target`	`str`	the name of the target column to analyze against	required

Returns:

Type	Description
`DataFrame`	a dataframe of feature importances

Source code in src/ssvc/csv_analyzer.py

def drop_col_feature_importance(df: pd.DataFrame, target: str) -> pd.DataFrame:
    """
    Rank features by drop column feature importance

    Args:
        df: the dataframe to analyze
        target: the name of the target column to analyze against

    Returns:
        a dataframe of feature importances
    """
    features, labels = _prepare_data(df, target)
    # entropy-based decision tree with a pinned random_state
    tree = DecisionTreeClassifier(criterion="entropy", random_state=99)
    return _drop_col_feat_imp(tree, features, labels)

`permute_feature_importance(df, target)`

Compute feature importance using permutation feature importance

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	the dataframe to analyze	required
`target`	`str`	the name of the target column to analyze against	required

Returns:

Type	Description
`DataFrame`	a dataframe of feature importances

Source code in src/ssvc/csv_analyzer.py

def permute_feature_importance(df: pd.DataFrame, target: str) -> pd.DataFrame:
    """
    Rank features by permutation feature importance

    Args:
        df: the dataframe to analyze
        target: the name of the target column to analyze against

    Returns:
        a dataframe of feature importances
    """
    features, labels = _prepare_data(df, target)
    # entropy-based decision tree with a pinned random_state
    tree = DecisionTreeClassifier(criterion="entropy", random_state=99)
    return _perm_feat_imp(tree, features, labels)