# use a tabfor i inrange(3):print(i)# use 2 spacesfor i inrange(3):print(i)# use 4 spacesfor i inrange(3):print(i)
Best practice: always use 4 spaces. You can set whether to use spaces(soft tabs) or tabs for indentation.
In vim editor, use :set list to inspect incorrect number of spaces/tabs.
Add Shebang and encoding at the beginning of executable scripts
Create a file named welcome.py
#! /usr/bin/env python# -*- coding: UTF-8 -*-print('welcome to python!')
Then set the python script as executable:
chmod+xwelcome.py
Now you can run the script without specifying the Python interpreter:
./welcome.py
All variables, functions, classes are dynamic objects
classMyClass():def__init__(self,name): self.name = name# assign an integer to aa =1print(type(a))# assign a string to aa ='abc'print(type(a))# assign a function to aa =rangeprint(type(a))print(a(10))# assign a class to aa = MyClassprint(type(a))b =a('myclass')print(b.name)# assign an instance of a class to aa =MyClass('myclass')print(b.name)# get type of aprint(type(a))
All python variables are pointers/references
a = [1,2,3]print('a = ', a)# this add another refrence to the listb = aprint('b = ', b)# this will change contents of both a and bb[2]=4print('a = ', a)print('b = ', b)
Use deepcopy if you really want to COPY a variable
from copy import deepcopya ={'A': [1],'B': [2],'C': [3]}print(a)# shallow copyb =dict(a)# modify elements of b will change contents of ab['A'].append(2)print('a = ', a)print('b = ', b)# this also does not workc ={k:v for k, v in a}c['A'].append(3)print('a = ', a)print('c = ', c)# recurrently copy every object of ad =deepcopy(a)# modify elements of c will not change contents of ad['A'].append(2)print('a = ', a)print('d = ', d)
What if I accidentally overwrite my builtin functions?
A = [1,2,3,4]# Ops! the builtin function sum is overwritten by a numbersum=sum(A)# this will raise an error because sum is not a function nowprint(sum(A))# recover the builtin function into the current environmentfrom __builtin__ importsum# this works because sum is a functionprint(sum(A))
Note: in Python 3, you should import from builtins rather than __builtin__
from builtins importsum
int is of arbitrary precision in Python!
In Pyhton:
print(2**10000)
In R:
print(2^10000)
Easiest way to swap values of two variables
In C/C++:
int a =1, b =2, t;t = a;a = b;b = t;
In Python:
a =1b =2b, a = a, bprint(a, b)
List comprehension
Use for-loops:
a = []for i inrange(10): a.append(i +10)print(a)
Use list comprehension
a = [i +10for i inrange(10)]print(a)
Dict comprehension
Use for-loops:
a ={}for i inrange(10): a[i]=chr(ord('A') + i)print(a)
Use dict comprehension:
a ={i:chr(ord('A') + i)for i inrange(10)}print(a)
For the one-liners
Use ';' instead of '\n':
# print the first column of each linepython-c'import sys; print("\n".join(line.split("\t")[0] for line in sys.stdin))'
import sys# read line by linefor line in sys.stdin:print(line)
Order of dict keys are NOT as you expected
a ={'A':1,'B':2,'C':3,'D':4,'E':5,'F':6}# not in lexicographical orderprint([key for key in a])# now in lexicographical orderprint([key for key insorted(a)])
Use enumerate() to add a number during iteration
A = ['a','b','c','d']for i, a inenumerate(A):print(i, a)
a ='ABCDF'# will raise an Errora[4]='E'# convert str to bytearrayb =bytearray(a)# bytearray are mutableb[4]='E'# convert bytearray to strprint(str(b))
tuples are hashable while lists are not hashable
# create dict using tuples as keysd ={ ('chr1',1000,2000):'featureA', ('chr1',2000,3000):'featureB', ('chr1',3000,4000):'featureC', ('chr1',4000,5000):'featureD', ('chr1',5000,6000):'featureE', ('chr1',6000,7000):'featureF'}# query the dict using tuplesprint(d[('chr1', 3000, 4000)])print(d[('chr1', 6000, 7000)])# will raise an errord ={['chr1',1000,2000]:'featureA'}
Use itertools
Nested loops in a more concise way:
A = [1,2,3]B = ['a','b','c']C = ['i','j','k']D = ['x','y','z']# Use nested for-loopsfor a in A:for b in B:for c in C:for d in D:print(a, b, c, d)# Use itertools.productimport itertoolsfor a, b, c, d in itertools.product(A, B, C, D):print(a, b, c, d)
Get all combinations of a list:
A = ['A','B','C','D']# Use itertools.combinationsimport itertoolsfor a, b, c in itertools.combinations(A, 3):print(a, b, c)
Convert iterables to lists
import itertoolsA = [1,2,3]B = ['a','b','c']a = itertools.product(A, B)# a is a iterable rather than a listprint(a)# a is a list nowa =list(a)print(a)
Use the zip() function to transpose nested lists/tuples/iterables
records = [ ('chr1',1000,2000), ('chr1',2000,3000), ('chr1',3000,4000), ('chr1',4000,5000), ('chr1',5000,6000), ('chr1',6000,7000)]# iterate by rowsfor chrom, start, end in records:print(chrom, start, end)# extract columnschroms, starts, ends =zip(*records)# build records from columns# now records2 is the same as recordsrecords2 =zip(chroms, starts, ends)print(records)
Global and local variables
# a is globala =1defprint_local():# a is local a =2print(a)defprint_global():# a is globalglobal a a =2print(a)# print global variableprint(a)# print local variable from functionprint_local()# a is unchangedprint(a)# change and print global from functionprint_global()# a is changedprint(a)
Use defaultdict
Use dict:
d ={}d['a']= []d['b']= []d['c']= []# extend list with new elementsd['a']+= [1,2]d['b']+= [3,4,5]d['c']+= [6]for key, val in d.items():print(key, val)
Use defaultdict:
from collections import defaultdict# a new list is created automatically when new elements are addedd =defaultdict(list)# extend list with new elementsd['a']+= [1,2]d['b']+= [3,4,5]d['c']+= [6]for key, val in d.items():print(key, val)
Use generators
Example: read a large FASTA file
defappend_extra_line(f):"""Yield an empty line after the last line in the file """for line in f:yield lineyield''defread_fasta(filename):withopen(filename, 'r')as f: name =None seq =''for line inappend_extra_line(f):if line.startswith('>')or (len(line)==0):if (len(seq)>0) and (name isnotNone):yield (name, seq)if line.startswith('>'): name = line.strip()[1:].split()[0] seq =''else:if name isNone:raiseValueError('the first line does not start with ">"') seq += line.strip()# print sequence name and length of each for name, seq inread_fasta('test.fa'):print(name, len(seq))
Turn off annoying KeyboardInterrupt and BrokenPipe Error
import numpy as np# create an empty matrix of shape (5, 4)X = np.zeros((5, 4), dtype=np.int32)# create an array of length 5: [0, 1, 2, 3, 4]y = np.arange(5)# create an array of length 4: [0, 1, 2, 3]z = np.arange(4)# set Row 1 to [0, 1, 2, 3]X[0]= np.arange(4)# set Row 2 to [1, 1, 1, 1]X[1]=1# add 1 to all elementsX +=1# add y to each row of XX += y.reshape((-1, 1))# add z to each column of XX += z.reshape((1, -1))# get row sums => row_sums = X.sum(axis=1)# get column sumscol_sums = X.sum(axis=0)# matrix multiplicationA = X.dot(X.T)# save matrix to text filenp.savetxt('data.txt', A)
Numerical analysis (probability distribution, signal processing, etc.): scipy
Compile python for-loops to native code to achive similar performance to C/C++ code. Example:
from numba import jitfrom numpy import arange# jit decorator tells Numba to compile this function.# The argument types will be inferred by Numba when function is called.@jitdefsum2d(arr): M, N = arr.shape result =0.0for i inrange(M):for j inrange(N): result += arr[i,j]return resulta =arange(9).reshape(3,3)print(sum2d(a))
import pandas as pd# read a bed filegenes = pd.read_table('gene.bed', header=None, sep='\t', names=('chrom', 'start', 'end', 'gene_id', 'score', 'strand', 'biotype'))# get all gene IDsgene_ids = genes['gene_id']# set gene_id as indexgenes.index = genes['gene_id']# get row with given gene_idgene = genes.loc['ENSG00000212325.1']# get rows with biotype = 'protein_coding'genes_selected = genes[genes['biotype']=='protein_coding']]# get protein coding genes in chr1genes_selected = genes.query('(biotype == "protein_coding") and (chrom == "chr1")')# count genes for each biotypebiotype_counts = genes.groupby('biotype')['gene_id'].count()# add a column for gene lengthgenes['length']= genes['end']- genes['start']# calculate average gene length for each chromosome and biotypelength_table = genes.pivot_table(values='length', index='biotype', columns='chrom')# save DataFrame to Excel filelength_table.to_excel('length_table.xlsx')
from sklearn.datasets import make_classificationfrom sklearn.linear_model import LogisticRegressionfrom sklearn.model_selection import train_test_splitfrom sklearn.metrics import roc_auc_score, accuracy_score# generate ramdom dataX, y =make_classification(n_samples=1000, n_classes=2, n_features=10)# split dataset into training and test datasetX_train, X_test, y_train, y_test =train_test_split(X, y, test_size=0.2)# create an classifier objectmodel =LogisticRegression()# training the classifiermodel.fit(X_train, y_train)# predict outcomes on the test datasety_pred = model.predict(X_test)# evalualte the classification performanceprint('roc_auc_score = %f'%roc_auc_score(y_test, y_pred))print('accuracy_score = %f'%accuracy_score(y_test, y_pred))
File accession File format Output type Experiment accession Assay Biosample term id
ENCFF983DFB fastq reads ENCSR429XTR ChIP-seq EFO:0002067
ENCFF590TBW fastq reads ENCSR429XTR ChIP-seq EFO:0002067
ENCFF258RWG bam unfiltered alignments ENCSR429XTR ChIP-seq EFO:0002067
ENCFF468LRV bam unfiltered alignments ENCSR429XTR ChIP-seq EFO:0002067
ENCFF216EBS bam alignments ENCSR429XTR ChIP-seq EFO:0002067
ENCFF232QFN bam unfiltered alignments ENCSR429XTR ChIP-seq EFO:0002067
ENCFF682NGE bam alignments ENCSR429XTR ChIP-seq EFO:0002067
ENCFF328UKA bam unfiltered alignments ENCSR429XTR ChIP-seq EFO:0002067
ENCFF165COO bam alignments ENCSR429XTR ChIP-seq EFO:0002067
ENCFF466OLG bam alignments ENCSR429XTR ChIP-seq EFO:0002067
ENCFF595HIY bigBed narrowPeak peaks ENCSR429XTR ChIP-seq EFO:0002067
ENCFF494CKB bigWig fold change over control ENCSR429XTR ChIP-seq EFO:0002067
ENCFF308BXW bigWig fold change over control ENCSR429XTR ChIP-seq EFO:0002067
ENCFF368IHM bed narrowPeak peaks ENCSR429XTR ChIP-seq EFO:0002067
Now display the table more clearly:
head-n15metadata.tsv|tvi-d$'\t'-jcenter
Output:
File accession File format Output type Experiment accession Assay Biosample term id
ENCFF983DFB fastq reads ENCSR429XTR ChIP-seq EFO:0002067
ENCFF590TBW fastq reads ENCSR429XTR ChIP-seq EFO:0002067
ENCFF258RWG bam unfiltered alignments ENCSR429XTR ChIP-seq EFO:0002067
ENCFF468LRV bam unfiltered alignments ENCSR429XTR ChIP-seq EFO:0002067
ENCFF216EBS bam alignments ENCSR429XTR ChIP-seq EFO:0002067
ENCFF232QFN bam unfiltered alignments ENCSR429XTR ChIP-seq EFO:0002067
ENCFF682NGE bam alignments ENCSR429XTR ChIP-seq EFO:0002067
ENCFF328UKA bam unfiltered alignments ENCSR429XTR ChIP-seq EFO:0002067
ENCFF165COO bam alignments ENCSR429XTR ChIP-seq EFO:0002067
ENCFF466OLG bam alignments ENCSR429XTR ChIP-seq EFO:0002067
ENCFF595HIY bigBed narrowPeak peaks ENCSR429XTR ChIP-seq EFO:0002067
ENCFF494CKB bigWig fold change over control ENCSR429XTR ChIP-seq EFO:0002067
ENCFF308BXW bigWig fold change over control ENCSR429XTR ChIP-seq EFO:0002067
ENCFF368IHM bed narrowPeak peaks ENCSR429XTR ChIP-seq EFO:0002067
You can also get some help by typing tvi -h:
usage: tvi [-h] [-d DELIMITER] [-j {left,right,center}] [-s SEPARATOR]
[infile]
Print tables pretty
positional arguments:
infile input file, default is stdin
optional arguments:
-h, --help show this help message and exit
-d DELIMITER delimiter of fields of input. Default is white space.
-j {left,right,center}
justification, either left, right or center. Default
is left
-s SEPARATOR separator of fields in output
tvi.py
#! /usr/bin/env pythonimport sysimport argparseimport osfrom cStringIO import StringIOdefmain(): parser = argparse.ArgumentParser(description='Print tables pretty') parser.add_argument('infile', type=str, nargs='?', help='input file, default is stdin') parser.add_argument('-d', dest='delimiter', type=str, required=False, help='delimiter of fields of input. Default is white space.') parser.add_argument('-j', dest='justify', type=str, required=False, default='left', choices=['left', 'right', 'center'], help='justification, either left, right or center. Default is left') parser.add_argument('-s', dest='separator', type=str, required=False, default=' ', help='separator of fields in output') args = parser.parse_args() table = [] maxwidth = []# default is to read from stdin fin = sys.stdinif args.infile:try: fin =open(args.infile, 'rt')exceptIOErroras e: sys.stderr.write('Error: %s: %s\n'%(e.strerror, args.infile)) sys.exit(e.errno)for line in fin: fields =None# split line by delimiterif args.delimiter: fields = line.strip().split(args.delimiter)else: fields = line.strip().split()for i inxrange(len(fields)): width =len(fields[i])if (i+1) >len(maxwidth): maxwidth.append(width)else:if width > maxwidth[i]: maxwidth[i]= width table.append(fields) fin.close()try:for fields in table: line =StringIO()for i inxrange(len(fields)):# format field with different justification nSpace = maxwidth[i]-len(fields[i])if args.justify =='left': line.write(fields[i])for j inxrange(nSpace): line.write(' ')elif args.justify =='right':for j inxrange(nSpace): line.write(' ') line.write(fields[i])elif args.justify =='center':for j inxrange(nSpace/2): line.write(' ') line.write(fields[i])for j inxrange(nSpace - nSpace/2): line.write(' ') line.write(args.separator)print line.getvalue() line.close()exceptIOError: sys.exit(-1)if__name__=='__main__':main()
Generate a random FASTA file
seqgen.py
#! /usr/bin/env pythonimport sysimport argparseimport textwrapimport randomdefwrite_fasta(fout,seq,name='seq',description=None):if description: fout.write('>'+ name +' '+ description +'\n')else: fout.write('>'+ name +'\n') fout.write(textwrap.fill(seq) +'\n')defmain(): parser = argparse.ArgumentParser(description='Generate sequences and output in various formats') parser.add_argument('-n', '--number', dest='number', type=int, required=False, default=10, help='Number of sequences to generate') parser.add_argument('--min-length', dest='min_length', type=int, required=False, default=30, help='Minimal length') parser.add_argument('--max-length', dest='max_length', type=int, required=False, default=50, help='Maximal length') parser.add_argument('-l', '--length', type=int, required=False, help='Fixed length. If specified, --min-length and --max-length will be ignored.') parser.add_argument('-a', '--alphabet', type=str, required=False, default='ATGC', help='Letters to used in the sequences') parser.add_argument('-f', '--format', type=str, required=False, choices=['fasta', 'text'], default='fasta', help='Output formats') parser.add_argument('-o', '--outfile', type=argparse.FileType('w'), required=False, default=sys.stdout, help='Output file name') parser.add_argument('-p', '--prefix', type=str, required=False, default='RN_', help='Prefix of sequence names for fasta format') args = parser.parse_args() rand = random.Random()for i inxrange(args.number):if args.length: length = args.lengthelse: length = rand.randint(args.min_length, args.max_length) seq =bytearray(length)for j inxrange(length): seq[j]= rand.choice(args.alphabet)if args.format =='fasta':write_fasta(args.outfile, str(seq), args.prefix +'%08d'%i)else: args.outfile.write(seq +'\n') args.outfile.close()if__name__=='__main__':main()
Weekly tasks
All files you need for completing the tasks can be found at: weekly_tasks.zip
Task 1: run examples (Python tips, numpy, pandas) in this tutorial
Install Anaconda on your PC. Try to understand example code and run in Jupyter or IPython.
Task 2: write a Python program to convert a GTF file to BED12 format