Hail - GWAS Tutorial

Load required libraries and initialize

[1]:
import hail as hl
from hail.plot import show
from pprint import pprint
import os
from IPython.display import HTML
# Start the Hail runtime (spins up a local Spark backend by default).
hl.init()
# Route Bokeh plot output into the notebook instead of standalone HTML files.
hl.plot.output_notebook()
Loading BokehJS ...
[2]:
hl.spark_context()
[2]:

SparkContext

Spark UI

Version
v2.4.5
Master
local[*]
AppName
pyspark-shell

Get the Spark UI URL for Binder

[59]:
# Build a link to the Spark UI. When running under Binder/JupyterHub
# (JUPYTERHUB_USER is set) the UI is only reachable through the hub's
# proxy, so construct that proxied URL; otherwise fall back to a plain
# reminder to use the SparkContext's own UI link above.
jupyterhub_user = os.environ.get('JUPYTERHUB_USER')
if jupyterhub_user is not None:
    spark_ui = (f'<a href="https://hub.gke.mybinder.org/user/{jupyterhub_user}'
                '/proxy/4040/jobs/">Spark UI for Binder</a>')
else:
    spark_ui = '<a>Use SparkUI from sparkContext</a>'
HTML(spark_ui)

Download public 1000 Genomes data

[4]:
hl.utils.get_1kg('/tmp/data/')

Importing data from VCF

[5]:
hl.import_vcf('/tmp/data/1kg.vcf.bgz').write('/tmp/data/1kg.mt', overwrite=True)
[8]:
mt = hl.read_matrix_table('/tmp/data/1kg.mt')

Explore data

[9]:
mt.rows().select().show(5)
locusalleles
locus<GRCh37>array<str>
1:904165["G","A"]
1:909917["G","A"]
1:986963["C","T"]
1:1563691["T","G"]
1:1707740["T","G"]

showing top 5 rows

[10]:
mt.row_key.show(5)
locusalleles
locus<GRCh37>array<str>
1:904165["G","A"]
1:909917["G","A"]
1:986963["C","T"]
1:1563691["T","G"]
1:1707740["T","G"]

showing top 5 rows

[11]:
mt.s.show(5)
s
str
"HG00096"
"HG00099"
"HG00105"
"HG00118"
"HG00129"

showing top 5 rows

[12]:
mt.entry.take(5)
[12]:
[Struct(GT=Call(alleles=[0, 0], phased=False), AD=[4, 0], DP=4, GQ=12, PL=[0, 12, 147]),
 Struct(GT=Call(alleles=[0, 0], phased=False), AD=[8, 0], DP=8, GQ=24, PL=[0, 24, 315]),
 Struct(GT=Call(alleles=[0, 0], phased=False), AD=[8, 0], DP=8, GQ=23, PL=[0, 23, 230]),
 Struct(GT=Call(alleles=[0, 0], phased=False), AD=[7, 0], DP=7, GQ=21, PL=[0, 21, 270]),
 Struct(GT=Call(alleles=[0, 0], phased=False), AD=[5, 0], DP=5, GQ=15, PL=[0, 15, 205])]

Adding column fields

[15]:
# Import the per-sample phenotype annotations; impute=True infers column
# types, and keying by 'Sample' lets us join this table onto mt's columns.
table = (hl.import_table('/tmp/data/1kg_annotations.txt', impute=True)
         .key_by('Sample'))
[16]:
table.describe()
----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'Sample': str
    'Population': str
    'SuperPopulation': str
    'isFemale': bool
    'PurpleHair': bool
    'CaffeineConsumption': int32
----------------------------------------
Key: ['Sample']
----------------------------------------
[17]:
table.show(width=100)
SamplePopulationSuperPopulationisFemalePurpleHairCaffeineConsumption
strstrstrboolboolint32
"HG00096""GBR""EUR"falsefalse4
"HG00097""GBR""EUR"truetrue4
"HG00098""GBR""EUR"falsefalse5
"HG00099""GBR""EUR"truefalse4
"HG00100""GBR""EUR"truefalse5
"HG00101""GBR""EUR"falsetrue1
"HG00102""GBR""EUR"truetrue6
"HG00103""GBR""EUR"falsetrue5
"HG00104""GBR""EUR"truefalse5
"HG00105""GBR""EUR"falsefalse4

showing top 10 rows

[18]:
print(mt.col.dtype)
struct{s: str}
[19]:
mt = mt.annotate_cols(pheno = table[mt.s])
[20]:
mt.col.describe()
--------------------------------------------------------
Type:
        struct {
        s: str,
        pheno: struct {
            Population: str,
            SuperPopulation: str,
            isFemale: bool,
            PurpleHair: bool,
            CaffeineConsumption: int32
        }
    }
--------------------------------------------------------
Source:
    <hail.matrixtable.MatrixTable object at 0x7f0f70afe470>
Index:
    ['column']
--------------------------------------------------------

Using query functions and the Hail Expression Language

[21]:
pprint(table.aggregate(hl.agg.counter(table.SuperPopulation)))
{'AFR': 1018, 'AMR': 535, 'EAS': 617, 'EUR': 669, 'SAS': 661}
[22]:
pprint(table.aggregate(hl.agg.stats(table.CaffeineConsumption)))
{'max': 10.0,
 'mean': 3.9837142857142855,
 'min': -1.0,
 'n': 3500,
 'stdev': 1.7021055628070711,
 'sum': 13943.0}
[23]:
table.count()
[23]:
3500
[24]:
mt.count_cols()
[24]:
284
[25]:
mt.aggregate_cols(hl.agg.counter(mt.pheno.SuperPopulation))
[25]:
{'AFR': 76, 'EAS': 72, 'AMR': 34, 'SAS': 55, 'EUR': 47}
[26]:
pprint(mt.aggregate_cols(hl.agg.stats(mt.pheno.CaffeineConsumption)))
{'max': 9.0,
 'mean': 4.415492957746479,
 'min': 0.0,
 'n': 284,
 'stdev': 1.577763427465917,
 'sum': 1254.0}
[27]:
# Count variants by (ref, alt) allele pair; transitions (C>T, G>A, A>G, T>C)
# are expected to dominate transversions.
snp_counts = mt.aggregate_rows(hl.agg.counter(hl.Struct(ref=mt.alleles[0], alt=mt.alleles[1])))
pprint(snp_counts)
{Struct(ref='C', alt='T'): 2436,
 Struct(ref='T', alt='G'): 468,
 Struct(ref='A', alt='T'): 76,
 Struct(ref='A', alt='C'): 454,
 Struct(ref='A', alt='G'): 1944,
 Struct(ref='G', alt='A'): 2387,
 Struct(ref='T', alt='A'): 79,
 Struct(ref='T', alt='C'): 1879,
 Struct(ref='G', alt='T'): 480,
 Struct(ref='C', alt='G'): 150,
 Struct(ref='C', alt='A'): 496,
 Struct(ref='G', alt='C'): 112}
[28]:
from collections import Counter
# Wrap the counter dict so we can list allele pairs from most to least common.
counts = Counter(snp_counts)
counts.most_common()
[28]:
[(Struct(ref='C', alt='T'), 2436),
 (Struct(ref='G', alt='A'), 2387),
 (Struct(ref='A', alt='G'), 1944),
 (Struct(ref='T', alt='C'), 1879),
 (Struct(ref='C', alt='A'), 496),
 (Struct(ref='G', alt='T'), 480),
 (Struct(ref='T', alt='G'), 468),
 (Struct(ref='A', alt='C'), 454),
 (Struct(ref='C', alt='G'), 150),
 (Struct(ref='G', alt='C'), 112),
 (Struct(ref='T', alt='A'), 79),
 (Struct(ref='A', alt='T'), 76)]
[29]:
# Distribution of per-entry sequencing depth (DP), truncated to [0, 30).
p = hl.plot.histogram(mt.DP, range=(0,30), bins=30, title='DP Histogram', legend='DP')
show(p)

Quality Control

[30]:
mt.col.describe()
--------------------------------------------------------
Type:
        struct {
        s: str,
        pheno: struct {
            Population: str,
            SuperPopulation: str,
            isFemale: bool,
            PurpleHair: bool,
            CaffeineConsumption: int32
        }
    }
--------------------------------------------------------
Source:
    <hail.matrixtable.MatrixTable object at 0x7f0f70afe470>
Index:
    ['column']
--------------------------------------------------------
[31]:
mt = hl.sample_qc(mt)
[32]:
mt.col.describe()
--------------------------------------------------------
Type:
        struct {
        s: str,
        pheno: struct {
            Population: str,
            SuperPopulation: str,
            isFemale: bool,
            PurpleHair: bool,
            CaffeineConsumption: int32
        },
        sample_qc: struct {
            dp_stats: struct {
                mean: float64,
                stdev: float64,
                min: float64,
                max: float64
            },
            gq_stats: struct {
                mean: float64,
                stdev: float64,
                min: float64,
                max: float64
            },
            call_rate: float64,
            n_called: int64,
            n_not_called: int64,
            n_filtered: int64,
            n_hom_ref: int64,
            n_het: int64,
            n_hom_var: int64,
            n_non_ref: int64,
            n_singleton: int64,
            n_snp: int64,
            n_insertion: int64,
            n_deletion: int64,
            n_transition: int64,
            n_transversion: int64,
            n_star: int64,
            r_ti_tv: float64,
            r_het_hom_var: float64,
            r_insertion_deletion: float64
        }
    }
--------------------------------------------------------
Source:
    <hail.matrixtable.MatrixTable object at 0x7f0f709cfb00>
Index:
    ['column']
--------------------------------------------------------

Plotting the QC metrics

[33]:
# Per-sample call-rate distribution (to pick a filtering threshold).
p = hl.plot.histogram(mt.sample_qc.call_rate, range=(.88,1), legend='Call Rate')
show(p)
[34]:
# Per-sample mean genotype-quality distribution.
p = hl.plot.histogram(mt.sample_qc.gq_stats.mean, range=(10,70), legend='Mean Sample GQ')
show(p)
[35]:
# Mean depth vs. call rate per sample; low-quality samples sit bottom-left.
p = hl.plot.scatter(mt.sample_qc.dp_stats.mean, mt.sample_qc.call_rate, xlabel='Mean DP', ylabel='Call Rate')
show(p)
[36]:
# Drop low-quality samples: require mean depth >= 4x and call rate >= 97%.
mt = mt.filter_cols((mt.sample_qc.dp_stats.mean >= 4) & (mt.sample_qc.call_rate >= 0.97))
# 284 is the pre-filter sample count reported by mt.count_cols() above.
print('After filter, %d/284 samples remain.' % mt.count_cols())
After filter, 250/284 samples remain.
[37]:
# Allele balance: fraction of reads supporting the alternate allele.
ab = mt.AD[1] / hl.sum(mt.AD)

# Keep only entries whose allele balance is consistent with the called
# genotype: hom-ref calls should have almost no alt reads (<= 10%),
# hets roughly half (25%-75%), and hom-alt calls nearly all (>= 90%).
filter_condition_ab = ((mt.GT.is_hom_ref() & (ab <= 0.1)) |
                        (mt.GT.is_het() & (ab >= 0.25) & (ab <= 0.75)) |
                        (mt.GT.is_hom_var() & (ab >= 0.9)))

# Report what fraction of entries fail the condition before removing them.
fraction_filtered = mt.aggregate_entries(hl.agg.fraction(~filter_condition_ab))
print(f'Filtering {fraction_filtered * 100:.2f}% entries out of downstream analysis.')
mt = mt.filter_entries(filter_condition_ab)
Filtering 3.64% entries out of downstream analysis.
[38]:
mt = hl.variant_qc(mt)
[39]:
mt.row.describe()
--------------------------------------------------------
Type:
        struct {
        locus: locus<GRCh37>,
        alleles: array<str>,
        rsid: str,
        qual: float64,
        filters: set<str>,
        info: struct {
            AC: array<int32>,
            AF: array<float64>,
            AN: int32,
            BaseQRankSum: float64,
            ClippingRankSum: float64,
            DP: int32,
            DS: bool,
            FS: float64,
            HaplotypeScore: float64,
            InbreedingCoeff: float64,
            MLEAC: array<int32>,
            MLEAF: array<float64>,
            MQ: float64,
            MQ0: int32,
            MQRankSum: float64,
            QD: float64,
            ReadPosRankSum: float64,
            set: str
        },
        variant_qc: struct {
            dp_stats: struct {
                mean: float64,
                stdev: float64,
                min: float64,
                max: float64
            },
            gq_stats: struct {
                mean: float64,
                stdev: float64,
                min: float64,
                max: float64
            },
            AC: array<int32>,
            AF: array<float64>,
            AN: int32,
            homozygote_count: array<int32>,
            call_rate: float64,
            n_called: int64,
            n_not_called: int64,
            n_filtered: int64,
            n_het: int64,
            n_non_ref: int64,
            het_freq_hwe: float64,
            p_value_hwe: float64
        }
    }
--------------------------------------------------------
Source:
    <hail.matrixtable.MatrixTable object at 0x7f0f7097f390>
Index:
    ['row']
--------------------------------------------------------

GWAS Analysis

[40]:
mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)
[41]:
mt = mt.filter_rows(mt.variant_qc.p_value_hwe > 1e-6)
[42]:
print('Samples: %d  Variants: %d' % (mt.count_cols(), mt.count_rows()))
Samples: 250  Variants: 7849
[43]:
# Naive GWAS: regress caffeine consumption on alt-allele count per variant,
# with only an intercept covariate (no confounder correction yet).
gwas = hl.linear_regression_rows(y=mt.pheno.CaffeineConsumption,
                                 x=mt.GT.n_alt_alleles(),
                                 covariates=[1.0])
gwas.row.describe()
--------------------------------------------------------
Type:
        struct {
        locus: locus<GRCh37>,
        alleles: array<str>,
        n: int32,
        sum_x: float64,
        y_transpose_x: float64,
        beta: float64,
        standard_error: float64,
        t_stat: float64,
        p_value: float64
    }
--------------------------------------------------------
Source:
    <hail.table.Table object at 0x7f0f7092e358>
Index:
    ['row']
--------------------------------------------------------
[44]:
# Manhattan plot of the uncorrected association p-values.
p = hl.plot.manhattan(gwas.p_value)
show(p)
[45]:
# Q-Q plot; deviation from the diagonal indicates confounding
# (here, population stratification).
p = hl.plot.qq(gwas.p_value)
show(p)

Confounded

[46]:
eigenvalues, pcs, _ = hl.hwe_normalized_pca(mt.GT)
[47]:
pprint(eigenvalues)
[18.02377947184681,
 9.98894555036326,
 3.538312262917122,
 2.6577590783730023,
 1.5966032147658327,
 1.5416611649602379,
 1.5029872248781773,
 1.4720816378531194,
 1.4678188487330746,
 1.4477835201334939]
[48]:
pcs.show(5, width=100)
sscores
strarray<float64>
"HG00096"[-1.22e-01,-2.81e-01,1.11e-01,-1.28e-01,6.81e-02,-3.72e-03,-2.66e-02,4.99e-03,-9.33e-02,-1.48...
"HG00099"[-1.13e-01,-2.90e-01,1.08e-01,-7.04e-02,4.20e-02,3.33e-02,1.61e-02,-1.15e-03,3.29e-02,2.33e-02]
"HG00105"[-1.08e-01,-2.80e-01,1.03e-01,-1.05e-01,9.40e-02,1.27e-02,3.14e-02,3.08e-02,1.06e-02,-1.93e-02]
"HG00118"[-1.25e-01,-2.98e-01,7.21e-02,-1.07e-01,2.89e-02,8.09e-03,-4.70e-02,-3.32e-02,-2.59e-04,8.49e...
"HG00129"[-1.07e-01,-2.87e-01,9.72e-02,-1.16e-01,1.38e-02,1.87e-02,-8.37e-02,-4.87e-02,3.73e-02,2.11e-02]

showing top 5 rows

[49]:
mt = mt.annotate_cols(scores = pcs[mt.s].scores)
[50]:
# PC1 vs. PC2 colored by super-population — samples should cluster by ancestry.
p = hl.plot.scatter(mt.scores[0],
                    mt.scores[1],
                    label=mt.pheno.SuperPopulation,
                    title='PCA', xlabel='PC1', ylabel='PC2')
show(p)
[51]:
# Re-run the GWAS controlling for sex and the first three principal
# components to correct for population stratification.
gwas = hl.linear_regression_rows(
    y=mt.pheno.CaffeineConsumption,
    x=mt.GT.n_alt_alleles(),
    covariates=[1.0, mt.pheno.isFemale, mt.scores[0], mt.scores[1], mt.scores[2]])
[52]:
# Q-Q plot of the corrected p-values — should now track the diagonal.
p = hl.plot.qq(gwas.p_value)
show(p)
[53]:
# Manhattan plot of the corrected association results.
p = hl.plot.manhattan(gwas.p_value)
show(p)

Rare variant analysis

[54]:
# Flatten the matrix table to an entries table, then count heterozygous
# calls per (super-population, chromosome) group.
entries = mt.entries()
results = (entries.group_by(pop = entries.pheno.SuperPopulation, chromosome = entries.locus.contig)
      .aggregate(n_het = hl.agg.count_where(entries.GT.is_het())))
[55]:
results.show()
popchromosomen_het
strstrint64
"AFR""1"11276
"AFR""10"7160
"AFR""11"6875
"AFR""12"7048
"AFR""13"4678
"AFR""14"4313
"AFR""15"3904
"AFR""16"4593
"AFR""17"3718
"AFR""18"4171

showing top 10 rows

[56]:
# Bin entries by the variant's alt-allele frequency. hl.cond is deprecated
# in Hail 0.2 (renamed hl.if_else); a case chain is the idiomatic form for
# a three-way binning and avoids the nested-conditional shape.
# NOTE(review): this bins on raw alt-allele frequency (info.AF[0]), not the
# folded minor-allele frequency — a variant with AF > 0.95 lands in ">5%"
# even though its minor allele is rare. Confirm that is intended.
entries = entries.annotate(
    maf_bin = hl.case()
                .when(entries.info.AF[0] < 0.01, "< 1%")
                .when(entries.info.AF[0] < 0.05, "1%-5%")
                .default(">5%"))

# Mean genotype quality and depth per (frequency bin, PurpleHair) group.
results2 = (entries.group_by(af_bin = entries.maf_bin, purple_hair = entries.pheno.PurpleHair)
      .aggregate(mean_gq = hl.agg.stats(entries.GQ).mean,
                 mean_dp = hl.agg.stats(entries.DP).mean))
[57]:
results2.show()
af_binpurple_hairmean_gqmean_dp
strboolfloat64float64
"1%-5%"false2.48e+017.43e+00
"1%-5%"true2.46e+017.47e+00
"< 1%"false2.35e+017.55e+00
"< 1%"true2.35e+017.53e+00
">5%"false3.70e+017.65e+00
">5%"true3.73e+017.70e+00

Citation

[58]:
hl.citation()
[58]:
'Hail Team. Hail 0.2.38-9d0be8b84f35. https://github.com/hail-is/hail/commit/9d0be8b84f35.'
[ ]: