# Source code for cgr_gwas_qc.parsers.graf

"""
Genetic Relationship and Fingerprinting
---------------------------------------

GRAF is a package that allows estimation of relatedness and ancestry.

Relatedness
+++++++++++

.. csv-table::
    :header: name, dtype, description

    ID1, string,
    ID2, string,
    HG_match, int, number of SNPs with matched genotypes when only homozygous SNPs are counted
    HG_miss, int, number of SNPs with mismatched genotypes when only homozygous SNPs are counted
    HGMR, float, Homozygous Genotype Mismatch Rate (%)
    AG_match, int, number of SNPs with matched genotypes when all SNPs are counted
    AG_miss, int, number of SNPs with mismatched genotypes when all SNPs are counted
    AGMR, float, All Genotype Mismatch Rate (%)
    relationship, string, relationship determined by sample genotypes.
    p_value, float, probability that the genetic relationship is NOT the predicted type

Relationship Values
*******************

Categories are assigned by GRAF.

.. csv-table::
    :header: name, description

    ID, duplicate or MZ twin
    PO, parent-offspring
    FS, full sibling
    D2, 2nd degree relative
    D3, 3rd degree relative
    UN, unrelated

References:
    - https://github.com/ncbi/graf
    - Jin Y, Schäffer AA, Sherry ST, and Feolo M (2017). Quickly identifying
      identical and closely related subjects in large databases using genotype
      data. PLoS One. 12(6):e0179106.
"""

import pandas as pd

from cgr_gwas_qc.typing import PathLike

# Maps each column of the relatedness table to its dtype label.
# Insertion order matters: ``read_relatedness`` reindexes its result by these
# keys, so this is also the column order of the returned DataFrame.
DTYPES = dict(
    # Sample identifiers of the pair
    ID1="string",
    ID2="string",
    # Homozygous-genotype counts and mismatch rate
    HG_match="UInt32",
    HG_miss="UInt32",
    HGMR="float",
    # All-genotype counts and mismatch rate
    AG_match="UInt32",
    AG_miss="UInt32",
    AGMR="float",
    # Relationship call and its p-value
    relationship="string",
    p_value="float",
)


def read_relatedness(filename: PathLike) -> pd.DataFrame:
    """Read the relatedness table generated by ``graf --out``.

    Lines starting with ``#`` are skipped as comments. For every pair the two
    sample IDs are ordered so that ``ID1 <= ID2``, the space-separated GRAF
    column names are renamed to their underscore form, and the result is
    restricted to (and ordered by) the keys of ``DTYPES``.

    Args:
        filename: Location of the tab-delimited GRAF relatedness output.

    Returns:
        pd.DataFrame

        - ID1
        - ID2
        - HG_match
        - HG_miss
        - HGMR
        - AG_match
        - AG_miss
        - AGMR
        - relationship {ID, PO, FS, D2, D3, UN}
        - p_value

        Any of these columns that is absent from the file is present in the
        result but filled with missing values.

    Raises:
        KeyError: If the table has no ``sample1`` or ``sample2`` column.

    References:
        - https://github.com/ncbi/graf#output-files
    """
    table = pd.read_csv(
        filename,
        sep="\t",
        comment="#",
        # Sample IDs are labels, not numbers. Without this, numeric-looking
        # IDs are parsed as integers/floats, which drops leading zeros and
        # makes the pair ordering numeric instead of lexical.
        dtype={"sample1": str, "sample2": str},
    )

    first, second = table["sample1"], table["sample2"]
    # Vectorised equivalent of ``sorted([sample1, sample2])`` per row: swap the
    # two IDs only where the first one compares greater than the second.
    swap = first > second

    return (
        table.assign(
            ID1=first.mask(swap, second),
            ID2=second.mask(swap, first),
        )
        .rename(
            {
                "HG match": "HG_match",
                "HG miss": "HG_miss",
                "AG match": "AG_match",
                "AG miss": "AG_miss",
                "geno relation": "relationship",
            },
            axis=1,
        )
        .reindex(list(DTYPES), axis=1)
    )