Hail - GWAS Tutorial

Load required libraries and initialize

[1]:
import hail as hl
from hail.plot import show
from pprint import pprint
import os
from IPython.display import HTML
# Start the Hail runtime (spins up a local Spark backend by default).
hl.init()
# Route Bokeh plot output into the notebook instead of standalone HTML files.
hl.plot.output_notebook()
Loading BokehJS ...
[2]:
hl.spark_context()
[2]:

SparkContext

Spark UI

Version
v2.4.5
Master
local[*]
AppName
pyspark-shell

Get the Spark UI URL for Binder

[59]:
# Build a link to the Spark UI. When running under Binder/JupyterHub
# (JUPYTERHUB_USER is set) the UI is only reachable through the hub's
# proxy, so construct that proxied URL; otherwise fall back to a plain
# reminder to use the SparkContext's own UI link above.
jupyterhub_user = os.environ.get('JUPYTERHUB_USER')
if jupyterhub_user is not None:
    spark_ui = (f'<a href="https://hub.gke.mybinder.org/user/{jupyterhub_user}'
                '/proxy/4040/jobs/">Spark UI for Binder</a>')
else:
    spark_ui = '<a>Use SparkUI from sparkContext</a>'
HTML(spark_ui)

Download public 1000 Genomes data

[4]:
hl.utils.get_1kg('/tmp/data/')

Importing data from VCF

[5]:
hl.import_vcf('/tmp/data/1kg.vcf.bgz').write('/tmp/data/1kg.mt', overwrite=True)
[8]:
mt = hl.read_matrix_table('/tmp/data/1kg.mt')

Explore data

[9]:
mt.rows().select().show(5)
locusalleles
locus<GRCh37>array<str>
1:904165["G","A"]
1:909917["G","A"]
1:986963["C","T"]
1:1563691["T","G"]
1:1707740["T","G"]

showing top 5 rows

[10]:
mt.row_key.show(5)
locusalleles
locus<GRCh37>array<str>
1:904165["G","A"]
1:909917["G","A"]
1:986963["C","T"]
1:1563691["T","G"]
1:1707740["T","G"]

showing top 5 rows

[11]:
mt.s.show(5)
s
str
"HG00096"
"HG00099"
"HG00105"
"HG00118"
"HG00129"

showing top 5 rows

[12]:
mt.entry.take(5)
[12]:
[Struct(GT=Call(alleles=[0, 0], phased=False), AD=[4, 0], DP=4, GQ=12, PL=[0, 12, 147]),
 Struct(GT=Call(alleles=[0, 0], phased=False), AD=[8, 0], DP=8, GQ=24, PL=[0, 24, 315]),
 Struct(GT=Call(alleles=[0, 0], phased=False), AD=[8, 0], DP=8, GQ=23, PL=[0, 23, 230]),
 Struct(GT=Call(alleles=[0, 0], phased=False), AD=[7, 0], DP=7, GQ=21, PL=[0, 21, 270]),
 Struct(GT=Call(alleles=[0, 0], phased=False), AD=[5, 0], DP=5, GQ=15, PL=[0, 15, 205])]

Adding column fields

[15]:
# Import the per-sample phenotype annotations; impute=True infers column
# types, and keying by 'Sample' lets us join this table onto mt's columns.
table = (hl.import_table('/tmp/data/1kg_annotations.txt', impute=True)
         .key_by('Sample'))
[16]:
table.describe()
----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'Sample': str
    'Population': str
    'SuperPopulation': str
    'isFemale': bool
    'PurpleHair': bool
    'CaffeineConsumption': int32
----------------------------------------
Key: ['Sample']
----------------------------------------
[17]:
table.show(width=100)
SamplePopulationSuperPopulationisFemalePurpleHairCaffeineConsumption
strstrstrboolboolint32
"HG00096""GBR""EUR"falsefalse4
"HG00097""GBR""EUR"truetrue4
"HG00098""GBR""EUR"falsefalse5
"HG00099""GBR""EUR"truefalse4
"HG00100""GBR""EUR"truefalse5
"HG00101""GBR""EUR"falsetrue1
"HG00102""GBR""EUR"truetrue6
"HG00103""GBR""EUR"falsetrue5
"HG00104""GBR""EUR"truefalse5
"HG00105""GBR""EUR"falsefalse4

showing top 10 rows

[18]:
print(mt.col.dtype)
struct{s: str}
[19]:
mt = mt.annotate_cols(pheno = table[mt.s])
[20]:
mt.col.describe()
--------------------------------------------------------
Type:
        struct {
        s: str,
        pheno: struct {
            Population: str,
            SuperPopulation: str,
            isFemale: bool,
            PurpleHair: bool,
            CaffeineConsumption: int32
        }
    }
--------------------------------------------------------
Source:
    <hail.matrixtable.MatrixTable object at 0x7f0f70afe470>
Index:
    ['column']
--------------------------------------------------------

Using query functions and the Hail Expression Language

[21]:
pprint(table.aggregate(hl.agg.counter(table.SuperPopulation)))
{'AFR': 1018, 'AMR': 535, 'EAS': 617, 'EUR': 669, 'SAS': 661}
[22]:
pprint(table.aggregate(hl.agg.stats(table.CaffeineConsumption)))
{'max': 10.0,
 'mean': 3.9837142857142855,
 'min': -1.0,
 'n': 3500,
 'stdev': 1.7021055628070711,
 'sum': 13943.0}
[23]:
table.count()
[23]:
3500
[24]:
mt.count_cols()
[24]:
284
[25]:
mt.aggregate_cols(hl.agg.counter(mt.pheno.SuperPopulation))
[25]:
{'AFR': 76, 'EAS': 72, 'AMR': 34, 'SAS': 55, 'EUR': 47}
[26]:
pprint(mt.aggregate_cols(hl.agg.stats(mt.pheno.CaffeineConsumption)))
{'max': 9.0,
 'mean': 4.415492957746479,
 'min': 0.0,
 'n': 284,
 'stdev': 1.577763427465917,
 'sum': 1254.0}
[27]:
# Count variants by (ref, alt) allele pair; transitions (C>T, G>A, A>G, T>C)
# are expected to dominate transversions.
snp_counts = mt.aggregate_rows(hl.agg.counter(hl.Struct(ref=mt.alleles[0], alt=mt.alleles[1])))
pprint(snp_counts)
{Struct(ref='C', alt='T'): 2436,
 Struct(ref='T', alt='G'): 468,
 Struct(ref='A', alt='T'): 76,
 Struct(ref='A', alt='C'): 454,
 Struct(ref='A', alt='G'): 1944,
 Struct(ref='G', alt='A'): 2387,
 Struct(ref='T', alt='A'): 79,
 Struct(ref='T', alt='C'): 1879,
 Struct(ref='G', alt='T'): 480,
 Struct(ref='C', alt='G'): 150,
 Struct(ref='C', alt='A'): 496,
 Struct(ref='G', alt='C'): 112}
[28]:
from collections import Counter
# Wrap the counter dict so we can list allele pairs from most to least common.
counts = Counter(snp_counts)
counts.most_common()
[28]:
[(Struct(ref='C', alt='T'), 2436),
 (Struct(ref='G', alt='A'), 2387),
 (Struct(ref='A', alt='G'), 1944),
 (Struct(ref='T', alt='C'), 1879),
 (Struct(ref='C', alt='A'), 496),
 (Struct(ref='G', alt='T'), 480),
 (Struct(ref='T', alt='G'), 468),
 (Struct(ref='A', alt='C'), 454),
 (Struct(ref='C', alt='G'), 150),
 (Struct(ref='G', alt='C'), 112),
 (Struct(ref='T', alt='A'), 79),
 (Struct(ref='A', alt='T'), 76)]
[29]:
# Distribution of per-entry sequencing depth (DP), truncated to [0, 30).
p = hl.plot.histogram(mt.DP, range=(0,30), bins=30, title='DP Histogram', legend='DP')
show(p)

Quality Control

[30]:
mt.col.describe()
--------------------------------------------------------
Type:
        struct {
        s: str,
        pheno: struct {
            Population: str,
            SuperPopulation: str,
            isFemale: bool,
            PurpleHair: bool,
            CaffeineConsumption: int32
        }
    }
--------------------------------------------------------
Source:
    <hail.matrixtable.MatrixTable object at 0x7f0f70afe470>
Index:
    ['column']
--------------------------------------------------------
[31]:
mt = hl.sample_qc(mt)
[32]:
mt.col.describe()
--------------------------------------------------------
Type:
        struct {
        s: str,
        pheno: struct {
            Population: str,
            SuperPopulation: str,
            isFemale: bool,
            PurpleHair: bool,
            CaffeineConsumption: int32
        },
        sample_qc: struct {
            dp_stats: struct {
                mean: float64,
                stdev: float64,
                min: float64,
                max: float64
            },
            gq_stats: struct {
                mean: float64,
                stdev: float64,
                min: float64,
                max: float64
            },
            call_rate: float64,
            n_called: int64,
            n_not_called: int64,
            n_filtered: int64,
            n_hom_ref: int64,
            n_het: int64,
            n_hom_var: int64,
            n_non_ref: int64,
            n_singleton: int64,
            n_snp: int64,
            n_insertion: int64,
            n_deletion: int64,
            n_transition: int64,
            n_transversion: int64,
            n_star: int64,
            r_ti_tv: float64,
            r_het_hom_var: float64,
            r_insertion_deletion: float64
        }
    }
--------------------------------------------------------
Source:
    <hail.matrixtable.MatrixTable object at 0x7f0f709cfb00>
Index:
    ['column']
--------------------------------------------------------

Plotting the QC metrics

[33]:
# Per-sample call-rate distribution (to pick a filtering threshold).
p = hl.plot.histogram(mt.sample_qc.call_rate, range=(.88,1), legend='Call Rate')
show(p)
[34]:
# Per-sample mean genotype-quality distribution.
p = hl.plot.histogram(mt.sample_qc.gq_stats.mean, range=(10,70), legend='Mean Sample GQ')
show(p)
[35]:
# Mean depth vs. call rate per sample; low-quality samples sit bottom-left.
p = hl.plot.scatter(mt.sample_qc.dp_stats.mean, mt.sample_qc.call_rate, xlabel='Mean DP', ylabel='Call Rate')
show(p)
[36]:
# Drop low-quality samples: require mean depth >= 4x and call rate >= 97%.
mt = mt.filter_cols((mt.sample_qc.dp_stats.mean >= 4) & (mt.sample_qc.call_rate >= 0.97))
# 284 is the pre-filter sample count reported by mt.count_cols() above.
print('After filter, %d/284 samples remain.' % mt.count_cols())
After filter, 250/284 samples remain.
[37]:
# Allele balance: fraction of reads supporting the alternate allele.
ab = mt.AD[1] / hl.sum(mt.AD)

# Keep only entries whose allele balance is consistent with the called
# genotype: hom-ref calls should have almost no alt reads (<= 10%),
# hets roughly half (25%-75%), and hom-alt calls nearly all (>= 90%).
filter_condition_ab = ((mt.GT.is_hom_ref() & (ab <= 0.1)) |
                        (mt.GT.is_het() & (ab >= 0.25) & (ab <= 0.75)) |
                        (mt.GT.is_hom_var() & (ab >= 0.9)))

# Report what fraction of entries fail the condition before removing them.
fraction_filtered = mt.aggregate_entries(hl.agg.fraction(~filter_condition_ab))
print(f'Filtering {fraction_filtered * 100:.2f}% entries out of downstream analysis.')
mt = mt.filter_entries(filter_condition_ab)
Filtering 3.64% entries out of downstream analysis.
[38]:
mt = hl.variant_qc(mt)
[39]:
mt.row.describe()
--------------------------------------------------------
Type:
        struct {
        locus: locus<GRCh37>,
        alleles: array<str>,
        rsid: str,
        qual: float64,
        filters: set<str>,
        info: struct {
            AC: array<int32>,
            AF: array<float64>,
            AN: int32,
            BaseQRankSum: float64,
            ClippingRankSum: float64,
            DP: int32,
            DS: bool,
            FS: float64,
            HaplotypeScore: float64,
            InbreedingCoeff: float64,
            MLEAC: array<int32>,
            MLEAF: array<float64>,
            MQ: float64,
            MQ0: int32,
            MQRankSum: float64,
            QD: float64,
            ReadPosRankSum: float64,
            set: str
        },
        variant_qc: struct {
            dp_stats: struct {
                mean: float64,
                stdev: float64,
                min: float64,
                max: float64
            },
            gq_stats: struct {
                mean: float64,
                stdev: float64,
                min: float64,
                max: float64
            },
            AC: array<int32>,
            AF: array<float64>,
            AN: int32,
            homozygote_count: array<int32>,
            call_rate: float64,
            n_called: int64,
            n_not_called: int64,
            n_filtered: int64,
            n_het: int64,
            n_non_ref: int64,
            het_freq_hwe: float64,
            p_value_hwe: float64
        }
    }
--------------------------------------------------------
Source:
    <hail.matrixtable.MatrixTable object at 0x7f0f7097f390>
Index:
    ['row']
--------------------------------------------------------

GWAS Analysis

[40]:
mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)
[41]:
mt = mt.filter_rows(mt.variant_qc.p_value_hwe > 1e-6)
[42]:
print('Samples: %d  Variants: %d' % (mt.count_cols(), mt.count_rows()))
Samples: 250  Variants: 7849
[43]:
# Naive GWAS: regress caffeine consumption on alt-allele count per variant,
# with only an intercept covariate (no confounder correction yet).
gwas = hl.linear_regression_rows(y=mt.pheno.CaffeineConsumption,
                                 x=mt.GT.n_alt_alleles(),
                                 covariates=[1.0])
gwas.row.describe()
--------------------------------------------------------
Type:
        struct {
        locus: locus<GRCh37>,
        alleles: array<str>,
        n: int32,
        sum_x: float64,
        y_transpose_x: float64,
        beta: float64,
        standard_error: float64,
        t_stat: float64,
        p_value: float64
    }
--------------------------------------------------------
Source:
    <hail.table.Table object at 0x7f0f7092e358>
Index:
    ['row']
--------------------------------------------------------
[44]:
# Manhattan plot of the uncorrected association p-values.
p = hl.plot.manhattan(gwas.p_value)
show(p)
[45]:
# Q-Q plot; deviation from the diagonal indicates confounding
# (here, population stratification).
p = hl.plot.qq(gwas.p_value)
show(p)

Confounded

[46]:
eigenvalues, pcs, _ = hl.hwe_normalized_pca(mt.GT)
[47]:
pprint(eigenvalues)
[18.02377947184681,
 9.98894555036326,
 3.538312262917122,
 2.6577590783730023,
 1.5966032147658327,
 1.5416611649602379,
 1.5029872248781773,
 1.4720816378531194,
 1.4678188487330746,
 1.4477835201334939]
[48]:
pcs.show(5, width=100)
sscores
strarray<float64>
"HG00096"[-1.22e-01,-2.81e-01,1.11e-01,-1.28e-01,6.81e-02,-3.72e-03,-2.66e-02,4.99e-03,-9.33e-02,-1.48...
"HG00099"[-1.13e-01,-2.90e-01,1.08e-01,-7.04e-02,4.20e-02,3.33e-02,1.61e-02,-1.15e-03,3.29e-02,2.33e-02]
"HG00105"[-1.08e-01,-2.80e-01,1.03e-01,-1.05e-01,9.40e-02,1.27e-02,3.14e-02,3.08e-02,1.06e-02,-1.93e-02]
"HG00118"[-1.25e-01,-2.98e-01,7.21e-02,-1.07e-01,2.89e-02,8.09e-03,-4.70e-02,-3.32e-02,-2.59e-04,8.49e...
"HG00129"[-1.07e-01,-2.87e-01,9.72e-02,-1.16e-01,1.38e-02,1.87e-02,-8.37e-02,-4.87e-02,3.73e-02,2.11e-02]

showing top 5 rows

[49]:
mt = mt.annotate_cols(scores = pcs[mt.s].scores)
[50]:
# PC1 vs. PC2 colored by super-population — samples should cluster by ancestry.
p = hl.plot.scatter(mt.scores[0],
                    mt.scores[1],
                    label=mt.pheno.SuperPopulation,
                    title='PCA', xlabel='PC1', ylabel='PC2')
show(p)
[51]:
# Re-run the GWAS controlling for sex and the first three principal
# components to correct for population stratification.
gwas = hl.linear_regression_rows(
    y=mt.pheno.CaffeineConsumption,
    x=mt.GT.n_alt_alleles(),
    covariates=[1.0, mt.pheno.isFemale, mt.scores[0], mt.scores[1], mt.scores[2]])
[52]:
# Q-Q plot of the corrected p-values — should now track the diagonal.
p = hl.plot.qq(gwas.p_value)
show(p)
[53]:
# Manhattan plot of the corrected association results.
p = hl.plot.manhattan(gwas.p_value)
show(p)

Rare variant analysis

[54]:
# Flatten the matrix table to an entries table, then count heterozygous
# calls per (super-population, chromosome) group.
entries = mt.entries()
results = (entries.group_by(pop = entries.pheno.SuperPopulation, chromosome = entries.locus.contig)
      .aggregate(n_het = hl.agg.count_where(entries.GT.is_het())))
[55]:
results.show()
popchromosomen_het
strstrint64
"AFR""1"11276
"AFR""10"7160
"AFR""11"6875
"AFR""12"7048
"AFR""13"4678
"AFR""14"4313
"AFR""15"3904
"AFR""16"4593
"AFR""17"3718
"AFR""18"4171

showing top 10 rows

[56]:
# Bin entries by the variant's alt-allele frequency. hl.cond is deprecated
# in Hail 0.2 (renamed hl.if_else); a case chain is the idiomatic form for
# a three-way binning and avoids the nested-conditional shape.
# NOTE(review): this bins on raw alt-allele frequency (info.AF[0]), not the
# folded minor-allele frequency — a variant with AF > 0.95 lands in ">5%"
# even though its minor allele is rare. Confirm that is intended.
entries = entries.annotate(
    maf_bin = hl.case()
                .when(entries.info.AF[0] < 0.01, "< 1%")
                .when(entries.info.AF[0] < 0.05, "1%-5%")
                .default(">5%"))

# Mean genotype quality and depth per (frequency bin, PurpleHair) group.
results2 = (entries.group_by(af_bin = entries.maf_bin, purple_hair = entries.pheno.PurpleHair)
      .aggregate(mean_gq = hl.agg.stats(entries.GQ).mean,
                 mean_dp = hl.agg.stats(entries.DP).mean))
[57]:
results2.show()
af_binpurple_hairmean_gqmean_dp
strboolfloat64float64
"1%-5%"false2.48e+017.43e+00
"1%-5%"true2.46e+017.47e+00
"< 1%"false2.35e+017.55e+00
"< 1%"true2.35e+017.53e+00
">5%"false3.70e+017.65e+00
">5%"true3.73e+017.70e+00

Citation

[58]:
hl.citation()
[58]:
'Hail Team. Hail 0.2.38-9d0be8b84f35. https://github.com/hail-is/hail/commit/9d0be8b84f35.'
[ ]: