Source code for cvanmf.reapply

#!/usr/bin/env python
"""
Reapply existing Enterosignature models to new abundance data.

The easiest way to do this is through the :func:`reapply` function, which is
most flexible about parameter types. The other functions perform individual
steps, which are useful if you want fine control of a given step, but
probably not necessary for most uses.
"""
import logging
import pathlib
import re
from typing import (Collection, Dict, Iterator, List,
                    Optional, Set, Tuple, Union, Protocol)

import click
import numpy as np
import pandas as pd
from sklearn.decomposition import non_negative_factorization

from cvanmf.models import Signatures

logger: logging.Logger = logging.getLogger(__name__)

# Compile regular expressions
RE_RANK: re.Pattern = re.compile("[a-zA-Z]__")
RE_SPLIT_GENUS: re.Pattern = re.compile(r"([\w]+)_\w$")
# Match the last named rank, plus any following ?s
RE_SHORTEN: re.Pattern = re.compile(r".*;([^\;?]+[;\?]*)$")


class CVANMFException(Exception):
    """cvanmf specific exception."""



[docs]
class FeatureMapping:
    """Connect new data table features to those in the model.

    Manage the mappings from input features to model features. Source features
    are the features in the new abundance table we want to fit to the model;
    target features are the features in the model we're trying to match to.
    User defined mappings can be provided via `hard_map`, any subsequent
    mappings for a source taxon in `hard_map` will be ignored. New mappings are
    added via :meth:`add`. When mappings are fully defined the model w matrix
    and the new data table can be matched using :meth:`transform_w` and
    :meth:`transform_abundance`
    """

    def __init__(self,
                 target_features: Set[str],
                 source_features: Set[str],
                 hard_map: Optional[Dict[str, str]] = None):
        """
        :param target_features: Model features to map to
        :type target_features: Set[str]
        :param source_features: Input features to be mapped from
        :type source_features: Set[str]
        :param hard_map: User defined mappings, as a dictionary with source
            as key and target as value.
        :type hard_map: Optional[Dict[str, str]]
        """

        # hard_map are user provided mappings which will never be altered
        self.__target_features = target_features
        self.__source_features = source_features
        self.__hard_map = {} if hard_map is None else hard_map
        self.__map: Dict[str, List[str]] = dict()


[docs]
    def add(self, feature_from: str, feature_to: str) -> None:
        """Add a mapping. If there is already a mapping from this feature, we
        will append this one. Use :meth:`conflicts` to identify where more than
        one mapping exists.

        :param feature_from: Feature in the new table
        :type feature_from: str
        :param feature_to: Model feature to map to
        :type feature_to: str
        :raises EnteroException: Feature not in the relevant sets
        """
        # If user has provided a hard mapping, ignore this mapping
        if feature_from in self.__hard_map:
            return
        # If the genus being mapped to is not valid, raise an exception
        if feature_to not in self.__target_features:
            raise CVANMFException(
                f"Mapping of '{feature_from}' to target '{feature_to}' not "
                f"possible as target does not exist.")
        if feature_from not in self.__source_features:
            raise CVANMFException(
                f"Mapping of '{feature_from}' to target '{feature_to}' not "
                f"possible as source does not exist.")
        if feature_from not in self.__map:
            self.__map[feature_from] = [feature_to]
        else:
            self.__map[feature_from].append(feature_to)



[docs]
    def to_df(self) -> pd.DataFrame:
        """Produce a dataframe of the mapping. Where mappings are amibiguous,
        multiple rows will be included. Where mappings are missing, one row with
        a blank target will be included.

        :return: DataFrame with two columns, first source feature, second target
            feature.
        :rtype: pd.DataFrame
        """

        # Could be more nicely done with some maps, but take a simple approach
        df_src: List[Tuple[str, str]] = []
        for source, targets in self.mapping.items():
            for target in targets:
                df_src.append((source, target))
        missing: List[Tuple[str, str]] = list(
            zip(self.missing(), [''] * len(self.missing())))
        df_src = df_src + missing
        df: pd.DataFrame = pd.DataFrame(
            df_src, columns=['input_feature', 'model_feature'])
        return df



[docs]
    def missing(self) -> Collection[str]:
        """Identify input features which currently have no mapping.

        :return: Source features which are not mapping to any model feature
        :rtype: Collection[str]
        """
        return self.__source_features.difference(set(self.mapping.keys()))



[docs]
    def transform_abundance(self, abd_tbl: pd.DataFrame) -> pd.DataFrame:
        """Applying mapping to the input table.

        Make a table with renamed and combined rows based on the
        identified mappings.

        :param abd_tbl: New table, samples on columns
        :type abd_tbl: pd.DataFrame
        :return: Table with mappings applied
        :rtype: pd.DataFrame
        """

        # Probably we could do some smarter changes and groupbys
        # TODO(apduncan): Refactor, slow and ugly
        abd: pd.DataFrame = abd_tbl.copy()
        for input_feature, feature_maps in self.mapping.items():
            # Where there are conflicting mappings, use the first one
            feature_map = feature_maps[0]
            if input_feature == feature_map:
                continue
            # Add or sum
            if feature_map not in abd.index:
                abd.loc[feature_map] = abd.loc[input_feature]
            else:
                abd.loc[feature_map] += abd.loc[input_feature]
            abd = abd.drop(labels=[input_feature])
        return abd



[docs]
    def transform_w(self,
                    w: pd.DataFrame,
                    abd_tbl: pd.DataFrame) -> pd.DataFrame:
        """Match the model w matrix to the new table.

        Make a W matrix which has features not in the abundance table removed,
        and rows added for features which are in the abundance table but not the
        model.

        :param w: Model W matrix
        :type w: pd.DataFrame
        :param abd_tbl: New matrix. Should `not` have been transformed
            with :meth:`transform_abundance`.
        :type abd_tbl: pd.DataFrame
        :return: W matrix matched to new table
        :rtype: pd.DataFrame
        """
        w_new: pd.DataFrame = w.copy()
        # Drop taxa which are not in the input matrix from the W matrix
        w_feat: Set[str] = set(w_new.index)
        w_new = w_new.drop(labels=w_feat.difference(set(abd_tbl.index)))
        # Add taxa which are in the input but not in the W matrix
        missing_features: Set[str] = set(abd_tbl.index).difference(w_feat)
        missing_df: pd.DataFrame = pd.DataFrame(
            np.zeros(shape=[len(missing_features), w.shape[1]]),
            index=list(missing_features),
            columns=w.columns
        )
        w_new = pd.concat([w_new, missing_df])
        return w_new


    @property
    def conflicts(self) -> List[Tuple[str, List[str]]]:
        """Features for which more than one target exists."""
        return list(filter(lambda x: len(x[1]) > 1, self.mapping.items()))

    @property
    def mapping(self) -> Dict[str, List[str]]:
        """Mapping from source to target features."""
        return {**self.__map, **{x: [y] for x, y in self.__hard_map.items()}}



def _is_taxon_unknown(taxon: str) -> bool:
    # Remove ?, if all entries in lineage blank, the is unknown
    return all(
        map(
            lambda x: len(x.strip()) == 0,
            taxon.replace("?", "").split(';')
        )
    )


def _contain_rank_indicators(taxon: str) -> bool:
    # Check if this taxonomy contains rank indicators
    return any(map(
        lambda x: RE_RANK.search(x.strip()) is not None,
        taxon.split(';')
    ))


def _shorten_genus(
        genus: str,
        genera_list: Collection[str],
        short_genera_list: Collection[str]
) -> str:
    # Copying the operation of Clemence's shorten_genus function
    # Get last named rank, and subsequent ?
    # Genera list should the genera list, but with RE_SHORTEN applied, not 
    # applying each call as computationally inefficient (especially if we want 
    # to run on streamlit's servers)
    # Count occurences in the genera list
    short: str = RE_SHORTEN.sub(r"\1", genus)
    nb_occ: int = sum(map(lambda x: short in x, genera_list))
    nb_occ_s: int = sum(map(lambda x: short in x, short_genera_list))
    # shorten2 is the last three bits of the lineage, if we observed short in 
    # the short form genera list more than once, else we just keep short
    shorten2: str = ";".join(genus.split(';')[-3:]) if nb_occ_s > 1 else short
    final_shorten: str = short
    if nb_occ != 1:
        if nb_occ_s == 1:
            # Extract the final 2
            final_shorten = ";".join(genus.split(";")[-2:])
        else:
            final_shorten = shorten2
    return final_shorten


def _final_rank(taxon: str) -> str:
    # Exctract final non-blank or ? rank
    return next(filter(
        lambda x: len(x.strip()) != 0,
        reversed(taxon.replace("?", "").split(';'))
    ))


def _final_rank_equal(a: str, b: str) -> bool:
    return _final_rank(a) == _final_rank(b)


def _pad_lineage(
        taxon: str,
        delim: str = ";",
        length: int = 6,
        unknown: str = "") -> str:
    """Pad lineage out to the expected length using delimiters. Default is to
    genus level.

    :param str taxon: Lineage string for taxon
    :param str delim: Delimiter between ranks
    :param int length: Rank to truncate to
    :param str unknown: Text to fill in unknown ranks
    """

    lineage_parts: List[str] = taxon.split(delim)
    diff: int = length - len(lineage_parts)
    if diff > 0:
        lineage_parts += [unknown] * diff
    lineage_parts = lineage_parts[0:length]
    # Replace any length 0 entries with the unknown indicator
    lineage_parts = list(map(
        lambda x: x if len(x) > 0 else unknown,
        lineage_parts
    ))
    return delim.join(lineage_parts)



[docs]
def validate_genus_table(abd_tbl: pd.DataFrame,
                         **kwargs
                         ) -> pd.DataFrame:
    """Basic checks and transformations of the abundance table. 
    
    Some transformations may be made here, such as transposition. Any 
    transformation will be written out to inform the user. Transformations are 
    done in place.
    
    :param abd_tbl: Abundance table to check
    :type abd_tbl: pd.DataFrame
    :param logger: Function to report errors
    :type logger: Callable[[str], None]
    :returns: Validated, potentially transformed, dataframe
    :rtype: pd.DataFrame
    """

    # Check the dimensions make sense
    if abd_tbl.shape[0] < 2 or abd_tbl.shape[1] < 2:
        logger.critical(
            "Table has one or fewer columns or rows. Check delimiters and "
            "newline formats are correct.")
        raise CVANMFException("Table incorrect format.")

    # Check that there were column names in the file. We're going to assume
    # that if column names are all numeric values, there were no headers.
    # Unfortunate for people who only gave their samples numbers, but alas.
    all_numeric: bool = all(map(lambda x: str(x).isnumeric(), abd_tbl.columns))
    if all_numeric:
        logger.error(
            "Table appear to lack sample IDs in the first row. Add sample "
            "IDs, or ensure all sample IDs are not numeric.")
        raise CVANMFException("Table lacks sample IDs in first row.")

    # Check that taxa are on rows. We'll do that by looking for "Bacteria;"
    # in the column names.
    count_bac: int = len(
        list(filter(lambda x: "BACTERIA;" in x.upper(), abd_tbl.columns)))
    if (count_bac / len(abd_tbl.columns)) > 0.2:
        logger.warning("Table appears to have taxa on columns, so we have "
                       "transposed it.""")
        abd_tbl = abd_tbl.T

    # Remove ? from lineages, unknown should be blank
    # abd_tbl.index = list(map(lambda x: x.replace("?", ""), abd_tbl.index))

    # See if there are rank indicators in the taxa names
    # We'll take a look at the first 10 taxa, and see if they do
    rank_indicated: bool = all(map(_contain_rank_indicators,
                                   abd_tbl.index[:10]))
    if rank_indicated:
        logger.info(
            "Taxa names appear to contain rank indicators (i.e k__, p__), "
            "these have been removed to match Enterosignature format.""")
        abd_tbl.index = map(
            lambda x: re.sub(RE_RANK, "", x),
            abd_tbl.index
        )

    # Delete any taxa which have numeric labels (basically dealing with
    # MATAFILER output which has a -1 row included). More useful for our
    # group, limited use for other maybe.
    numeric_taxa: List[str] = list(filter(
        lambda x: str(x).lstrip('-').isnumeric(), abd_tbl.index))
    if len(numeric_taxa) > 0:
        logger.info(
            f"{len(numeric_taxa)} taxa had numeric labels and were dropped.")
        abd_tbl = abd_tbl.drop(labels=numeric_taxa)

    # Pad lineages to standard length with semicolons
    new_index: pd.Index = pd.Index(map(_pad_lineage, abd_tbl.index))
    # If there are duplicated rows (same genus, or malformed strings) then
    # report an error
    duplicates: np.ndarray = new_index.duplicated(keep=False)
    if any(duplicates):
        orig_dups: List[str] = abd_tbl.index[duplicates]
        new_dups: List[str] = new_index[duplicates]
        logger.error(
            f"{len(orig_dups)} taxa are duplicates after trimming to "
            "genus length. This may be genuine duplicates, or could be "
            "due to erroneous semi-colons. Duplicates are:")
        for o, n in zip(orig_dups, new_dups):
            logger.error(f'{o} -> {n}')
        raise CVANMFException("Duplicate taxa in input after lineage " +
                              "truncated.")

    # Remove any taxa which are completely unknown (all ? or blank)
    bad_taxa: List = list(filter(_is_taxon_unknown, abd_tbl.index))
    abd_tbl = abd_tbl.drop(labels=bad_taxa)
    if len(bad_taxa) > 0:
        logger.info(f"Removed {len(bad_taxa)} unknown taxa: {bad_taxa}")

    # Remove any taxa which had 0 observations, and samples for which 0
    # taxa were observed (unlikely but check)
    zero_taxa: List = list(abd_tbl.loc[abd_tbl.sum(axis=1) == 0].index)
    zero_samples: List = list(
        abd_tbl.loc[:, abd_tbl.sum(axis=0) == 0].columns)
    if len(zero_taxa) > 0:
        abd_tbl = abd_tbl.drop(labels=zero_taxa)
        logger.info(
            f"Dropped {len(zero_taxa)} taxa with no observations: {zero_taxa}")
    if len(zero_samples) > 0:
        abd_tbl = abd_tbl.drop(columns=zero_samples)
        logger.info(
            f"Dropped {len(zero_samples)} sample with no observations")

    # Renormalise (TSS)
    abd_tbl = abd_tbl / abd_tbl.sum(axis=0)

    return abd_tbl




[docs]
def match_genera(
        w: pd.DataFrame,
        y: pd.DataFrame,
        hard_mapping: Optional[Dict[str, str]] = None,
        family_rollup: bool = True,
        **kwargs
) -> FeatureMapping:
    """Match taxonomic names in the input table and the Enterosignatures W 
    matrix.
    
    This function is currently based on the R script provided by Clemence in
    the Enterosignatures (ES) gitlab repo (prepare_matrices.R)
    https://gitlab.inria.fr/cfrioux/enterosignature-paper/. 
    This will attempt to match names. Mappings in the ``hard_mapping`` parameter
    are new names to ES names, and will be applied before any other matches
    identified.
    
    :param w: Enterosignatures W matrix
    :type w: pd.DataFrame
    :param y: Abundance table being transformed
    :type y: pd.DataFrame
    :param hard_mapping: Mapping from input to ES name
    :type hard_mapping: Dict[str, str]
    :param family_rollup: Move abundance of genera which are not matched to
        the family level entry if one exists
    :type family_rollup: bool
    :param logger: Function to log messages
    :type logger: Callable[[Any], None]
    :returns: Transformed abundance table, es W matrix, and mapping object
    :rtype: Tuple[pd.DataFrame, pd.DataFrame, List[str]]
    """

    # Set optional parameter values
    if hard_mapping is None:
        hard_mapping = dict()
    # Sets of taxa
    es_taxa: Set[str] = set(w.index)
    input_taxa: Set[str] = set(y.index)
    unmatched: Set[str] = set(y.index)

    # Create an object to collect all the mappings we identify
    # No tables will be altered until we've exhausted the ways in which we
    # could generate mappings
    mapping: FeatureMapping = FeatureMapping(
        hard_map=hard_mapping,
        source_features=input_taxa,
        target_features=es_taxa)

    # Add any exact matches. If there was an exact match, remove from unmatched
    # pool
    exact = es_taxa.intersection(input_taxa)
    logger.info(
        f"{len(exact)} of {len(input_taxa)} taxa names matched exactly")
    for genus in exact:
        mapping.add(genus, genus)
    unmatched = unmatched.difference(exact)

    # Merge some split genera (ie Clostridia_A -> Clostridia)
    trimmed: Set[str] = set()
    for taxon in filter(lambda x: "_" in x.split(';')[-1], unmatched):
        root: str = re.sub(RE_SPLIT_GENUS, r"\1", taxon)
        if root in es_taxa:
            trimmed.add(taxon)
            mapping.add(taxon, root)
    logger.info(
        f"{len(trimmed)} genera trimmed and matched (i.e Ruminococcus_C ->"
        " Ruminococcus)")
    unmatched = unmatched.difference(trimmed)

    # Homogenise taxa of input matrix and ES
    # This is adapted directly from Clemence's script, and identifies taxa
    # where the lowest known rank matches, but the preceding elements do not
    es_taxa_short: List[str] = list(map(
        lambda x: RE_SHORTEN.sub(r"\1", x), es_taxa))
    input_taxa = set(unmatched)
    input_taxa_short: List[str] = list(map(
        lambda x: RE_SHORTEN.sub(r"\1", x), input_taxa))
    es_only: Set[str] = es_taxa.difference(input_taxa)
    input_only: Set[str] = input_taxa.difference(es_taxa)

    es_only_genus: Set[str] = set(
        map(lambda x: _shorten_genus(
            x,
            genera_list=es_taxa,
            short_genera_list=es_taxa_short
        ),
            es_only
            )
    )
    input_only_genus: Set[str] = set(
        map(lambda x: _shorten_genus(
            x,
            genera_list=input_taxa,
            short_genera_list=input_taxa_short
        ),
            input_only
            )
    )

    # Which can be merged based on Clemence's critera
    fixable_taxa: Set[str] = es_only_genus.intersection(input_only_genus)
    clemence_merged: Set[str] = set()
    for taxon in fixable_taxa:
        t_from: List[str] = list(filter(lambda x: taxon in x, unmatched))
        t_to: List[str] = list(filter(lambda x: taxon in x, w))
        for f in t_from:
            for t in t_to:
                mapping.add(f, t)
            clemence_merged.add(f)
    logger.info(f"{len(clemence_merged)} taxa merged by final rank (e.g. "
                 "UBA1435, CAG-314)")
    unmatched = unmatched.difference(clemence_merged)

    # Final rank matching
    final_matched: Set[str] = set()
    for taxon in unmatched:
        final_match: Iterator[str] = filter(
            lambda x: _final_rank_equal(taxon, x),
            es_taxa
        )
        for match in final_match:
            mapping.add(taxon, match)
            final_matched.add(taxon)
    logger.info(
        f"{len(final_matched)} taxa matched by name of lowest rank alone")
    unmatched = unmatched.difference(final_matched)

    # Roll any genera which are unknown, but where we have a family entry,
    # up into the family entry
    if family_rollup:
        family_match: Set[str] = set()
        for taxon in unmatched:
            family: str = ";".join(taxon.split(";")[:-1]) + ";"
            if family in es_taxa:
                family_match.add(taxon)
                mapping.add(taxon, family)
        logger.info(
            f"{len(family_match)} genera rolled up to family "
            "(i.e Lachonspiraceae;CAG-95 -> Lachnospiraceae)")
        unmatched = unmatched.difference(family_match)

    # # Get updated abundance tables and W matrix
    # new_abd = mapping.transform_abundance(abd_tbl)
    # new_w = mapping.transform_w(es_w, new_abd)
    # # Match their ordering
    # new_abd = new_abd.loc[new_w.index]
    #
    # # Summarise loss of abundance and of W weights so user can assess whether
    # # this is acceptable
    # input_unique = new_w[new_w.sum(axis=1) == 0].index
    # input_abd_missed: float = new_abd.loc[input_unique].sum().sum()
    # input_abd_missed_prop: float = input_abd_missed / new_abd.sum().sum()
    # logger(f"{len(input_unique)} taxa unique to input, could not be matched" + \
    #        f" any entries in Enterosignatures matrix.")
    # logger(f"These unique taxa represent {input_abd_missed:.2f} abundance of" + \
    #        f" a total {new_abd.sum().sum():.2f} ({input_abd_missed_prop:.2%}).")
    #
    # # Loss of W weight
    # w_total: float = es_w.sum().sum()
    # new_w_total: float = new_w.sum().sum()
    # logger(f"Sum of weights in new W matrix is {new_w_total:.2f}; " + \
    #        f"Original Enterosignatures matrix sum is {w_total:.2f}. " + \
    #        f"{1 - (new_w_total / w_total):.2%} of ES matrix weight lost due to " + \
    #        "taxa mismatch.")
    return mapping




[docs]
def match_identical(w: pd.DataFrame, y: pd.DataFrame, **kwargs) -> (
        FeatureMapping):
    """Match features by identical labels only.

    :param w: W matrix from model
    :param y: Table of new data
    """
    target, source = set(w.index), set(y.index)
    mapping: FeatureMapping = FeatureMapping(
        target_features=target, source_features=source
    )
    for f in target.intersection(source):
        mapping.add(f, f)
    return mapping




[docs]
def nmf_transform(new_abd: pd.DataFrame,
                  w_prime: pd.DataFrame,
                  ) -> pd.DataFrame:
    """Transform the input data into model weights.
    
    Takes the matched up W matrix and feature matrix. Expects the row
    ordering of W and feature matrix to be the same. Any NA values will be
    filled with 0.

    :param new_abd: Feature matrix matched to W
    :param w_prime: Model weights
    :return: Model weights for the given model and abundances, note this is
        not relative abundance (do not sum to 1)
    """
    # TODO: Parameters for NNLS?
    h, _, _ = non_negative_factorization(
        new_abd.T.values,
        n_components=w_prime.shape[1],
        init="custom",  # Must be custom as providing H init
        verbose=False,
        solver="mu",
        max_iter=2000,
        random_state=None,
        l1_ratio=0.0,
        beta_loss="kullback-leibler",
        update_H=False,
        H=w_prime.T.values
    )
    h_df: pd.DataFrame = pd.DataFrame(
        h,
        index=new_abd.columns,
        columns=w_prime.columns
    )
    return h_df




[docs]
class FeatureMatch(Protocol):
    """Signature for functions which perform feature matching."""

    def __call__(self,
                 w: pd.DataFrame,
                 y: pd.DataFrame,
                 **kwargs) -> FeatureMapping:
        ...




[docs]
class InputValidation(Protocol):
    """Signature for functions which perform input validation."""

    def __call__(self,
                 y: pd.DataFrame,
                 **kwargs) -> pd.DataFrame:
        ...



def _reapply_model(
        y: pd.DataFrame,
        w: pd.DataFrame,
        colors: Optional[List[str]],
        input_validation: InputValidation,
        feature_match: FeatureMatch,
        **kwargs
) -> 'denovo.Decomposition':
    """Reapply a model to new data Y.

    New observations using the same features can be transformed to give
    signature weights for the new observations. However, for many types of
    data is unlikely the features will be in the exact same format, and
    that the exact same types of features will be observed.

    This function uses custom functions provided to validate and transform
    input, and then to match features. Note these are applied in this order if
    you apply any transformations such a log transform in validation this may
    have undesirable effects if features are combined and summed during
    feature matching.
    """
    from cvanmf.denovo import Decomposition, NMFParameters

    new_abd: pd.DataFrame = input_validation(y, **kwargs)
    mapping: FeatureMapping = feature_match(w, new_abd, **kwargs)

    # Make W' and Y' with matching row labels
    # Get updated abundance tables and W matrix
    new_abd = mapping.transform_abundance(new_abd)
    new_w = mapping.transform_w(w, new_abd)
    # Match their ordering
    new_abd = new_abd.loc[new_w.index]

    # Summarise loss of abundance and of W weights so user can assess whether
    # this is acceptable
    input_unique = new_w[new_w.sum(axis=1) == 0].index
    input_abd_missed: float = new_abd.loc[input_unique].sum().sum()
    input_abd_missed_prop: float = input_abd_missed / new_abd.sum().sum()
    logger.info(
        f"{len(input_unique)} features unique to input, could not be "
        f"matched any entries in signature W matrix.")
    logger.info(
        f"These unique features represent {input_abd_missed:.2f} weight of"
        f" a total {new_abd.sum().sum():.2f} ({input_abd_missed_prop:.2%}).")

    # Loss of W weight
    w_total: float = w.sum().sum()
    new_w_total: float = new_w.sum().sum()
    logger.info(
        f"Sum of weights in new W matrix is {new_w_total:.2f}; "
        f"Original signature matrix sum is {w_total:.2f}. "
        f"{1 - (new_w_total / w_total):.2%} of signature W matrix weight lost "
        f"due to feature mismatch.")

    # Transform to signature weights
    es: pd.DataFrame = nmf_transform(new_abd=new_abd,
                                     w_prime=new_w)
    decomp: Decomposition = Decomposition(
        w=new_w, h=es.T, feature_mapping=mapping,
        parameters=NMFParameters(
            x=new_abd,
            rank=new_w.shape[1],
            seed='Reapply'
        ),
    )
    decomp.colors = colors
    return decomp



[docs]
def reapply(y: Union[str, pd.DataFrame],
            model: Union[str, Signatures] = "5es",
            hard_mapping: Optional[Union[str, pd.DataFrame]] = None,
            separator: str = "\t",
            output_dir: Optional[str] = None,
            **kwargs
            ) -> 'denovo.Decomposition':
    """Load and transform abundances to an existing model.

    The new data must be annotated against the same taxonomy the model uses.
    Currently for the 5 ES models this is GTDB r207. Feature names
    will be automatically matched between the abundance table and model where
    possible, (see :func:`match_genera`). Most of the work is done in
    :func:`transform_table`, this mostly provides convenience of allowing
    parameters to be paths or DataFrames, or to specify models as string or
    object.

    :param y: Feature matrix to transform. Can be a string giving 
        path, or a DataFrame.
    :param model: Model to use. Can be a Signature object, or the name
        of one of the provded Signature objects. Currently this is '5es' for the
        5ES model of Frioux et al.
        (2023, https://doi.org/10.1016/j.chom.2023.05.024).
    :param hard_mapping: Define matchups between feature identifiers in
        y and those in model W matrix. These will be used in preference of any
        automated matches. Should be a table with index being y matrix
        identifier, and first column the model W identifier. Can be either a
        path, or DataFrame.
    :param separator: Separator to use when reading and writing matrices.
    :param output_dir: Directory to write results to. Directory will be created
        if it does not exist. Pass None for no output to disk.
    :param **kwargs: Passed to the Signature validate_input and match_feature
        functions.
    """

    # Load files if required
    abundance_df: pd.DataFrame
    if isinstance(y, str):
        abundance_df = pd.read_csv(y, sep=separator, index_col=0)
    else:
        abundance_df = y

    # Load model
    model_obj: Signatures
    if isinstance(model, str):
        # This is either a known model identifier, or a location for model
        # Check for known models
        from cvanmf import models
        if model.lower() == "5es":
            logger.info("Loading 5ES model")
            model_obj = models.five_es()
        else:
            raise CVANMFException(f"Model '{model}' not recognised.")
    else:
        model_obj = model

    # Load hard mapping
    mapping_dict: Dict[str, str]
    if hard_mapping is not None:
        if isinstance(hard_mapping, str):
            mapping_dict = pd.read_csv(hard_mapping, sep=separator).to_dict()
        else:
            mapping_dict = hard_mapping.to_dict()
    else:
        mapping_dict = {}

    res: 'denovo.Decomposition' = model_obj.reapply(
        y=abundance_df,
        hard_mapping=mapping_dict,
        **kwargs
    )

    if output_dir is not None:
        res.save(out_dir=pathlib.Path(output_dir), delim=separator)

    return res



@click.command()
@click.option("-i",
              "--input",
              required=True,
              type=click.Path(exists=True, dir_okay=False),
              help="""New feature matrix, with features on rows and samples on 
              columns.""")
@click.option("-m",
              "--model",
              required=False,
              # TODO: Define the list of models elsewhere
              type=click.Choice(['5es']),
              default="5es",
              help="""Name of the model to reapply.""")
@click.option("-h",
              "--hard_mapping",
              required=False,
              type=click.Path(exists=True, dir_okay=False),
              help="""Mapping between features in input table and model W 
              matrix. Provide as a csv, with first column features in input 
              table, second column the name in model W to map to.""")
@click.option("--rollup/--no-rollup",
              default=True,
              help="""Only used when genera are features. 
              Genera in abundance table which do not match the model W matrix, 
              add their abundance to a family level entry if one exists.""")
@click.option("-s", "--separator",
              default="\t", type=str,
              help="""Separator used in input and output files.""")
@click.option("-o", "--output_dir",
              required=True, type=click.Path(file_okay=False),
              help="""Directory to write output to.""")
def cli(input: str,
        model: str,
        hard_mapping: str,
        rollup: bool,
        separator: str,
        output_dir: str) -> None:
    """Command line interface to fit new data to an existing NMF Signatures
    model. The new data must use the same features as the model,
    though there can be some difference (features in now data not in model
    and vice versa). Currently this is GTDB r207 for the 5 Enterosignatures
    model.
    
    For more on Enterosignatures see:

    *  Frioux et al. 2023 (doi:10.1016/j.chom.2023.05.024)
    *  https://enterosignatures.quadram.ac.uk
    """

    if model is None:
        model = '5es'

    # Use transform function
    res: 'Decomposition' = reapply(
        y=input,
        model=model,
        hard_mapping=hard_mapping,
        rollup=rollup,
        separator=separator,
        output_dir=output_dir
    )