Source code for galaxychop.models._base

# This file is part of
# the galaxy-chop project (https://github.com/vcristiani/galaxy-chop)
# Copyright (c) 2021, Valeria Cristiani
# License: MIT
# Full Text: https://github.com/vcristiani/galaxy-chop/blob/master/LICENSE.txt

"""Common functionalities for galaxy decomposition."""


# =============================================================================
# IMPORTS
# =============================================================================

import abc
from collections import OrderedDict

import attr
from attr import validators as vldt

import numpy as np

import pandas as pd

from .. import data, utils
from ..utils import doc_inherit

# =============================================================================
# CONSTANTS
# =============================================================================

_CIRCULARITY_ATTRIBUTES = utils.JCirc.circularity_attributes()

_PTYPES_ORDER = tuple(p.name.lower() for p in data.ParticleSetType)


# =============================================================================
# RESULT
# =============================================================================


@attr.s(frozen=True, slots=True, repr=False)
class Components:
    """Class of components resulting from dynamic decomposition.

    This class creates the components of the galaxy from the result of the
    dynamic decomposition.

    Parameters
    ----------
    labels : np.ndarray
        1D array with the index of the component to which each particle
        belongs. Shape: (n,).
    ptypes : np.ndarray
        Indicates the type of particle: stars = 0, dark matter = 1,
        gas = 2. Shape: (n,).
    probabilities : np.ndarray or None
        Array with the probabilities of the particles belonging to each
        component, in case the dynamic decomposition model includes them.
        Otherwise it adopts the value None.

    """

    labels = attr.ib(validator=vldt.instance_of(np.ndarray))
    ptypes = attr.ib(validator=vldt.instance_of(np.ndarray))
    probabilities = attr.ib(
        validator=vldt.optional(vldt.instance_of(np.ndarray))
    )

    def __attrs_post_init__(self):
        """Length validator.

        This method validates that the lengths of labels and ptypes are
        equal. On the other hand, if probabilities is not None, its length
        must be the same as that of labels and ptypes.

        """
        lens = {len(self.labels), len(self.ptypes)}
        if self.probabilities is not None:
            lens.add(len(self.probabilities))
        if len(lens) > 1:
            raise ValueError("All lengths must be the same")

    def __len__(self):
        """x.__len__() <==> len(x)."""
        return len(self.labels)

    def __repr__(self):
        """x.__repr__() <==> repr(x)."""
        length = len(self)
        labels = np.unique(self.labels)
        probs = self.probabilities is not None
        return f"Components({length}, labels={labels}, probabilities={probs})"
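    # For example (an illustrative sketch with made-up arrays; exact numpy
    # float formatting may differ):
    #
    #   >>> comp = Components(
    #   ...     labels=np.array([0.0, 1.0, np.nan]),
    #   ...     ptypes=np.array(["stars", "stars", "gas"]),
    #   ...     probabilities=None,
    #   ... )
    #   >>> len(comp)
    #   3
    #   >>> print(comp)
    #   Components(3, labels=[ 0.  1. nan], probabilities=False)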
    def to_dataframe(self, attributes=None):
        """Convert to pandas data frame.

        This method builds a data frame with all the parameters of the
        Components.

        Returns
        -------
        DataFrame : pandas.DataFrame
            DataFrame of all the Components data.

        """
        columns_makers = {
            "labels": lambda: self.labels,
            "ptypes": lambda: self.ptypes,
        }
        attributes = (
            list(columns_makers) + ["probabilities"]
            if attributes is None
            else attributes
        )

        data = OrderedDict()
        probs_df = None

        for aname in attributes:
            if aname == "probabilities":
                if self.probabilities is not None:
                    probs_df = pd.DataFrame(self.probabilities)
                    probs_df.columns = [
                        f"probs_{c}" for c in probs_df.columns
                    ]
            else:
                mkcolumn = columns_makers[aname]
                data[aname] = mkcolumn()

        df = pd.DataFrame(data)
        if probs_df is not None:
            df = pd.concat([df, probs_df], axis=1)

        return df
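# For example (illustrative, continuing the ``comp`` sketch above; the exact
# pandas rendering may vary):
#
#   >>> comp.to_dataframe()
#      labels ptypes
#   0     0.0  stars
#   1     1.0  stars
#   2     NaN    gas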
# =============================================================================
# FUNCTIONS
# =============================================================================
def hparam(default, **kwargs):
    """Create a hyper-parameter for decomposers.

    By design decision, every hyper-parameter is required to have a
    sensible default value.

    Parameters
    ----------
    default :
        Sensible default value of the hyper-parameter.
    **kwargs :
        Additional keyword arguments are passed to, and documented in,
        ``attr.ib()``.

    Returns
    -------
    Hyper-parameter with a default value.

    Notes
    -----
    This function is a thin wrapper over the attrs function ``attr.ib()``.

    """
    metadata = kwargs.pop("metadata", {})
    metadata["__gchop_model_hparam__"] = True
    return attr.ib(default=default, metadata=metadata, kw_only=True, **kwargs)
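# For example (a minimal sketch; ``Fake`` is a made-up class, not part of
# galaxy-chop): hyper-parameters behave like keyword-only ``attr.ib``
# attributes with a default value:
#
#   >>> @attr.s
#   ... class Fake:
#   ...     n_clusters = hparam(default=2)
#   >>> Fake().n_clusters
#   2
#   >>> Fake(n_clusters=3).n_clusters
#   3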
# =============================================================================
# ABC
# =============================================================================
@attr.s(frozen=True, repr=False)
class GalaxyDecomposerABC(metaclass=abc.ABCMeta):
    """Abstract class to facilitate the creation of decomposers.

    This class requests the redefinition of three methods: get_attributes,
    get_rows_mask and split.

    Parameters
    ----------
    cbins : tuple
        It contains the two bin widths necessary for the calculation of the
        circular angular momentum. Shape: (2,).
        Default value = (0.05, 0.005).

    """

    __gchop_model_cls_config__ = {"repr": False, "frozen": True}

    cbins = hparam(default=utils.DEFAULT_CBIN)

    @cbins.validator
    def _bins_validator(self, attribute, value):
        if not (
            isinstance(value, tuple)
            and len(value) == 2
            and isinstance(value[0], float)
            and isinstance(value[1], float)
        ):
            raise ValueError("cbins must be a tuple of two floats.")

    # block meta checks =======================================================

    def __init_subclass__(cls):
        """Initialize subclasses.

        It ensures that every inherited class is decorated by ``attr.s()``
        and assigns as class configuration the parameters defined in the
        class variable `__gchop_model_cls_config__`.

        In other words, it is roughly equivalent to:

        .. code-block:: python

            @attr.s(**GalaxyDecomposerABC.__gchop_model_cls_config__)
            class Decomposer(GalaxyDecomposerABC):
                pass

        """
        model_config = getattr(cls, "__gchop_model_cls_config__")
        attr.s(maybe_cls=cls, **model_config)

    # block to implement in every method =====================================
    @abc.abstractmethod
    def get_attributes(self):
        """Attributes for the parameter space.

        Returns
        -------
        attributes : keys of ``ParticleSet class`` parameters
            Particle attributes used to operate the clustering.

        """
        raise NotImplementedError()
    @abc.abstractmethod
    def get_rows_mask(self, X, y, attributes):
        """Mask of the valid rows on which to operate the clustering.

        Parameters
        ----------
        X : np.ndarray(n_particles, attributes)
            2D array where each row is a different particle and each column
            is an attribute of the particles. n_particles is the total
            number of particles.
        y : np.ndarray(n_particles,)
            1D array that identifies the nature of each particle:
            0 = stars, 1 = dark matter, 2 = gas. n_particles is the total
            number of particles.
        attributes : tuple
            Dictionary keys of ``ParticleSet class`` parameters with the
            particle attributes used to operate the clustering.

        Returns
        -------
        mask : np.ndarray(n_particles)
            Boolean mask selecting only the rows with valid values to
            operate the clustering.

        """
        raise NotImplementedError()
    @abc.abstractmethod
    def split(self, X, y, attributes):
        """Compute the clustering.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training instances to cluster.
        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        labels : np.ndarray(m_particles)
            1D array with the index of the cluster to which each particle
            belongs. m_particles is the total number of particles with
            valid values to operate the clustering.
        probs : np.ndarray(m_particles) or None
            Probabilities of the particles belonging to each component, in
            case the dynamic decomposition model includes them. Otherwise
            it adopts the value None.

        """
        raise NotImplementedError()
    # internal ================================================================

    def __repr__(self):
        """x.__repr__() <==> repr(x)."""
        clsname = type(self).__name__
        selfd = attr.asdict(
            self,
            recurse=False,
            filter=lambda attr, _: attr.repr,
        )
        attrs_str = ", ".join([f"{k}={repr(v)}" for k, v in selfd.items()])
        return f"{clsname}({attrs_str})"

    # API =====================================================================

    def _get_jcirc_df(self, galaxy, attributes):
        # STARS
        # turn the galaxy into a jcirc dict; all the calculation comes
        # together, so we can't optimize here
        jcirc = utils.jcirc(galaxy, *self.cbins).as_dict()

        # we add the column with the types; all the values from jcirc
        # are stars
        jcirc["ptypev"] = data.ParticleSetType.STARS.value
        stars_df = pd.DataFrame({attr: jcirc[attr] for attr in attributes})

        # DARK_MATTER
        dm_rows = len(galaxy.dark_matter)
        dm_nans = np.full(dm_rows, np.nan)
        dm_columns = {attr: dm_nans for attr in attributes}
        dm_columns["ptypev"] = data.ParticleSetType.DARK_MATTER.value
        dm_df = pd.DataFrame(dm_columns)

        # GAS
        gas_rows = len(galaxy.gas)
        gas_nans = np.full(gas_rows, np.nan)
        gas_columns = {attr: gas_nans for attr in attributes}
        gas_columns["ptypev"] = data.ParticleSetType.GAS.value
        gas_df = pd.DataFrame(gas_columns)

        return pd.concat([stars_df, dm_df, gas_df], ignore_index=True)
    def attributes_matrix(self, galaxy, attributes):
        """Matrix of particle attributes.

        This method obtains the matrix with the particles and the
        attributes necessary to operate the clustering.

        Parameters
        ----------
        galaxy : ``Galaxy class`` object
            Instance of the Galaxy class.
        attributes : keys of ``ParticleSet class`` parameters
            Particle attributes used to operate the clustering.

        Returns
        -------
        X : np.ndarray(n_particles, attributes)
            2D array where each row is a different particle and each column
            is an attribute of the particles. n_particles is the total
            number of particles.
        y : np.ndarray(n_particles)
            1D array that identifies the nature of each particle:
            0 = STARS, 1 = DM, 2 = Gas. n_particles is the total number of
            particles.

        """
        # first we split the attributes between the ones from circularity
        # and the ones from "galaxy.to_dataframe()"
        circ_attrs, df_attrs = [], []
        for attr_name in attributes:
            container = (
                circ_attrs
                if attr_name in _CIRCULARITY_ATTRIBUTES
                else df_attrs
            )
            container.append(attr_name)

        # this list is going to collect all the dataframes that contain
        # each attribute as a column
        result = []

        # If we have attributes of "to_dataframe" =============================
        # now we take out all the attributes of "to_dataframe" and save
        # them in the list where all the resulting dataframes are stored
        if df_attrs:
            # we need this to create the array of classes
            if "ptypev" not in df_attrs:
                df_attrs.append("ptypev")
            dfgal = galaxy.to_dataframe(
                ptypes=_PTYPES_ORDER, attributes=df_attrs
            )
            result.append(dfgal)

        # If we have JCIRC attributes =========================================
        # we are going to need a lot of NaNs to represent that gas and dm
        # have no circularity
        if circ_attrs:
            circ_attrs.append("ptypev")
            dfcirc = self._get_jcirc_df(galaxy, circ_attrs)
            result.append(dfcirc)

        # the attributes as a dataframe
        df = pd.concat(result, axis=1)

        # remove ptypev if it is duplicated
        df = df.loc[:, ~df.columns.duplicated()]

        # split the matrix and the classes
        X = df[attributes].to_numpy()
        y = df.ptypev.to_numpy()

        # return them
        return X, y
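    # For example (an illustrative sketch; ``gal`` and ``decomposer`` are
    # placeholders for a ``Galaxy`` instance and a concrete decomposer):
    #
    #   >>> X, y = decomposer.attributes_matrix(gal, attributes=["eps"])
    #   >>> X.shape  # (n_particles, 1): one column per requested attribute
    #   >>> y        # 0 = stars, 1 = dark matter, 2 = gas, one per row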
    def complete_labels(self, X, labels, rows_mask):
        """Complete the labels of all the particles.

        This method assigns the labels obtained from the clustering to the
        particles used for that purpose. The rest are assigned
        label = NaN.

        Parameters
        ----------
        X : np.ndarray(n_particles, attributes)
            2D array where each row is a different particle and each column
            is a parameter of the particles. n_particles is the total
            number of particles.
        labels : np.ndarray(m_particles)
            1D array with the index of the cluster to which each particle
            belongs. m_particles is the total number of particles with
            valid values to operate the clustering.
        rows_mask : np.ndarray(n_particles)
            Boolean mask selecting the valid rows used to operate the
            clustering. m_particles is the number of True entries.

        Returns
        -------
        new_labels : np.ndarray(n_particles)
            1D array with the index of the cluster to which each particle
            belongs. Particles that do not belong to any of them are
            assigned the label NaN. n_particles is the total number of
            particles.

        """
        new_labels = np.full(len(X), np.nan)
        new_labels[rows_mask] = labels
        return new_labels
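    # For example (illustrative, with made-up arrays): given 4 particles of
    # which only the first and the third were clustered,
    #
    #   >>> X = np.zeros((4, 1))
    #   >>> labels = np.array([0, 1])
    #   >>> rows_mask = np.array([True, False, True, False])
    #   >>> decomposer.complete_labels(X, labels, rows_mask)
    #   array([ 0., nan,  1., nan])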
    def complete_probs(self, X, probs, rows_mask):
        """Complete the probabilities of all the particles.

        This method assigns the probabilities obtained from the clustering
        to the particles used for that purpose; the rest are assigned
        probability = NaN. This method returns None in case the clustering
        method returns None probabilities.

        Parameters
        ----------
        X : np.ndarray(n_particles, attributes)
            2D array where each row is a different particle and each column
            is a parameter of the particles. n_particles is the total
            number of particles.
        probs : np.ndarray(m_particles, n_clusters)
            2D array with the probabilities of belonging to each component.
            n_clusters is the number of components obtained. m_particles is
            the total number of particles with valid values to operate the
            clustering.
        rows_mask : np.ndarray(n_particles)
            Boolean mask selecting the valid rows used to operate the
            clustering. m_particles is the number of True entries.

        Returns
        -------
        new_probs : np.ndarray(n_particles, n_clusters)
            2D array with the probabilities of belonging to each component.
            n_clusters is the number of components obtained. n_particles is
            the total number of particles. Particles that do not belong to
            any component are assigned NaN probabilities. This method
            returns None in case the clustering method returns None
            probabilities.

        """
        if probs is None:
            return None

        # probs covers only the clustered particles, so we keep its
        # trailing (per-cluster) shape and rebuild the particle axis
        probs_shape = list(np.shape(probs)[1:])

        # we need this many rows
        complete_shape = tuple([len(X)] + probs_shape)

        # now we create the container for the probabilities
        new_probs = np.full(complete_shape, np.nan)

        # and now we inject the probs in the correct order
        new_probs[rows_mask] = probs

        return new_probs
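    # For example (illustrative, continuing the ``complete_labels`` sketch
    # above with two clusters):
    #
    #   >>> probs = np.array([[0.9, 0.1], [0.2, 0.8]])
    #   >>> decomposer.complete_probs(X, probs, rows_mask)
    #   array([[0.9, 0.1],
    #          [nan, nan],
    #          [0.2, 0.8],
    #          [nan, nan]])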
    def decompose(self, galaxy):
        """Decompose method.

        Assign the component of the galaxy to which each particle belongs.
        The input galaxy instance is validated.

        Parameters
        ----------
        galaxy : ``Galaxy class`` object
            Instance of the Galaxy class.

        Returns
        -------
        Components :
            Instance of the ``Components class`` with the result of the
            dynamic decomposition.

        """
        attributes = self.get_attributes()

        X, y = self.attributes_matrix(galaxy, attributes=attributes)

        # calculate only the valid values to operate the clustering
        rows_mask = self.get_rows_mask(X=X, y=y, attributes=attributes)
        X_clean, y_clean = X[rows_mask], y[rows_mask]

        # execute the clustering with the quantities of interest
        labels, probs = self.split(X=X_clean, y=y_clean, attributes=attributes)

        # retrieve and fix the labels
        final_labels = self.complete_labels(
            X=X, labels=labels, rows_mask=rows_mask
        )
        final_probs = self.complete_probs(
            X=X, probs=probs, rows_mask=rows_mask
        )

        final_y = np.array(
            [data.ParticleSetType.mktype(yi).humanize() for yi in y]
        )

        # return the instance
        return Components(
            labels=final_labels,
            ptypes=final_y,
            probabilities=final_probs,
        )
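# A minimal sketch of a concrete decomposer (illustrative only: the class
# name, the threshold logic and the attribute names below are assumptions
# for this example, not galaxy-chop API). ``attr.s()`` is applied
# automatically by ``GalaxyDecomposerABC.__init_subclass__``, and
# ``DynamicStarsDecomposerMixin`` (defined below) supplies get_rows_mask:
#
#   >>> class EpsCutDecomposer(DynamicStarsDecomposerMixin,
#   ...                        GalaxyDecomposerABC):
#   ...     eps_cut = hparam(default=0.6)  # made-up hyper-parameter
#   ...
#   ...     def get_attributes(self):
#   ...         # assumed circularity attribute names
#   ...         return ["normalized_star_energy", "eps", "eps_r"]
#   ...
#   ...     def split(self, X, y, attributes):
#   ...         # disk (1) above the circularity cut, spheroid (0) below;
#   ...         # no per-particle probabilities
#   ...         eps = X[:, attributes.index("eps")]
#   ...         return (eps >= self.eps_cut).astype(int), None
#   ...
#   >>> components = EpsCutDecomposer().decompose(gal)  # gal: a Galaxy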
# =============================================================================
# MIXIN
# =============================================================================
class DynamicStarsDecomposerMixin:
    """Dynamic Stars Decomposer Mixin Class.

    This class redefines the get_rows_mask method so that the dynamic
    decomposition is performed using only stellar particles.

    """
    @doc_inherit(GalaxyDecomposerABC.get_rows_mask)
    def get_rows_mask(self, X, y, attributes):
        """
        Note
        ----
        Only stellar particles are used to carry out the dynamic
        decomposition. In addition, the attributes of the parameter space,
        where the dynamic decomposition is carried out, must have finite
        values.

        """
        # stellar rows where every attribute value is finite
        only_stars = np.equal(y, data.ParticleSetType.STARS.value)
        finite_values = np.isfinite(X).all(axis=1)
        return only_stars & finite_values
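# For example (illustrative, with made-up arrays): a dark matter particle
# (y == 1) and a star with a non-finite attribute are both masked out:
#
#   >>> X = np.array([[0.5], [np.nan], [0.3]])
#   >>> y = np.array([0, 0, 1])
#   >>> DynamicStarsDecomposerMixin().get_rows_mask(X, y, None)
#   array([ True, False, False])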