def embed_pdf_figure(width='640', height='480', title='Image'):
data = BytesIO()
plt.savefig(data, format='pdf', metadata={'Title': title})
data = data.getvalue()
data = 'data:application/pdf;base64,'+ str(b64encode(data), encoding='utf-8')
display(HTML('<object width="{}" height="{}" data="{}" download="{}.pdf"></object>'.format(
width, height, data, title)))
plt.close()
from matplotlib.backends.backend_pdf import PdfPages, PdfFile
from IPython.display import HTML, display, FileLink
from base64 import b64encode, b64decode
from io import StringIO, BytesIO
from contextlib import contextmanager
@contextmanager
def embed_pdf_pages(width=960, height=480, title='Image'):
data = BytesIO()
try:
pdf = PdfPages(data, metadata={'Title': title})
yield pdf
finally:
pdf.close()
data = data.getvalue()
data = 'data:application/pdf;base64,'+ str(b64encode(data), encoding='utf-8')
display(HTML('<object width="{}" height="{}" data="{}"></object>'.format(width, height, data)))
plt.close()
@contextmanager
def embed_pdf_data(width=640, height=480, title='Image'):
try:
data = BytesIO()
yield data
finally:
data = data.getvalue()
data = 'data:application/pdf;base64,'+ str(b64encode(data), encoding='utf-8')
display(HTML('<object width="{}" height="{}" data="{}"></object>'.format(width, height, data)))
plt.close()
def gradient_func(val):
return '<span style="background: linear-gradient(90deg, #d65f5f {0}%, transparent 0%)">{0:.3f}</span>'.format(val)
def display_dataframe(df, filename=None, encoding='utf-8', format='csv', type='button',gradientfunc=False, **kwargs):
#display(df)
#if isinstance(df, pd.DataFrame):
# display(df.style.set_caption(filename))
#else:
if gradientfunc == False:
display(df.style.set_caption(filename))
else:
display(df.style.format(gradient_func).set_caption(filename))
if filename is None:
filename = "dataframe"
if format == 'csv':
data = df.to_csv(**kwargs)
mime_type = 'text/csv'
filename = filename + '.csv'
elif format == 'tsv':
data = df.to_csv(**kwargs)
mime_type = 'text/plain'
filename = filename + '.txt'
else:
raise ValueError('unknown file format: {}'.format(format))
data = 'data:{mime_type};base64,'.format(mime_type=mime_type) + str(b64encode(bytes(data, encoding=encoding)), encoding=encoding)
if type == 'hyperlink':
display(HTML('<a href=" " download={filename} target="_blank">{filename}</a >'.format(
mime_type=mime_type, filename=filename, data=data)))
elif type == 'button':
button_id = 'button_{}'.format(np.random.randint(1000000000))
display(HTML(r'<input type="button" id="{0}" value="Download">'.format(button_id)))
display(HTML('''<script>
document.getElementById("{button_id}").addEventListener("click", function(event){{
var filename = "{filename}";
var data = "{data}";
const element = document.createElement('a');
element.setAttribute('href', data);
element.setAttribute('download', filename);
element.style.display = 'none';
document.body.appendChild(element);
element.click();
document.body.removeChild(element);
}});
</script>'''.format(button_id=button_id, filename=filename, data=data)))
def log_transfrom(data,small=0.01):
return np.log2(data+small)
fontsize = 6.5
fontscale = 1
fontweight = 'normal'
fonttitle = {'family':'Arial',
'weight' : fontweight,
'size' : fontsize*fontscale}
fontlabel = {'family':'Arial',
'weight' : fontweight,
'size' : fontsize*fontscale}
fontticklabel = {'family':'Arial',
'weight' : fontweight,
'size' : fontsize*fontscale}
fontlegend = {'family':'Arial',
'weight' : fontweight,
#'linewidth':0.5,
'size' : fontsize*fontscale}
fontcbarlabel = {'family':'Arial',
'weight' : fontweight,
#'Rotation' : 270,
#'labelpad' : 25,
'size' : fontsize*fontscale}
fontcbarticklabel = {'family':'Arial',#Helvetica
'weight' : fontweight,
'size' : (fontsize-1)*fontscale}
def std_plot(ax,xlabel=None,ylabel=None,title=None,
legendtitle=None,bbox_to_anchor=None,
labelspacing=0.2,borderpad=0.2,handletextpad=0.2,legendsort=False,markerscale=None,
xlim=None,ylim=None,
xbins=None,ybins=None,
cbar=None,cbarlabel=None,
moveyaxis=False,sns=False,left=True,rotation=None,xticklabel=None,legendscale=True,h=None,l=None,row=1,legend_adj=True,**kwards):
# height = 2 font = 6.5
def autoscale(fig):
if isinstance(fig,matplotlib.figure.Figure):
width,height = fig.get_size_inches()
elif isinstance(fig,matplotlib.axes.Axes):
width,height = fig.figure.get_size_inches()
fontscale = height/(2*row)
if width/fontscale > 8:
warnings.warn("Please reset fig's width. When scaling the height to 2 in, the scaled width '%.2f' is large than 8"%(width/fontscale),UserWarning)
return fontscale
class fontprop:
def init(self,fonttitle=None,fontlabel=None,fontticklabel=None,fontlegend=None,fontcbarlabel=None,fontcbarticklabel=None):
self.fonttitle = fonttitle
self.fontlabel = fontlabel
self.fontticklabel = fontticklabel
self.fontlegend = fontlegend
self.fontcbarlabel = fontcbarlabel
self.fontcbarticklabel = fontcbarticklabel
def update(self,fontscale):
self.fonttitle['size'] = self.fonttitle['size']*fontscale
self.fontlabel['size'] = self.fontlabel['size']*fontscale
self.fontticklabel['size'] = self.fontticklabel['size']*fontscale
self.fontlegend['size'] = self.fontlegend['size']*fontscale
self.fontcbarlabel['size'] = self.fontcbarlabel['size']*fontscale
self.fontcbarticklabel['size'] = self.fontcbarticklabel['size']*fontscale
def reset(self,fontscale):
self.fonttitle['size'] = self.fonttitle['size']/fontscale
self.fontlabel['size'] = self.fontlabel['size']/fontscale
self.fontticklabel['size'] = self.fontticklabel['size']/fontscale
self.fontlegend['size'] = self.fontlegend['size']/fontscale
self.fontcbarlabel['size'] = self.fontcbarlabel['size']/fontscale
self.fontcbarticklabel['size'] = self.fontcbarticklabel['size']/fontscale
fontscale = autoscale(ax)
font = fontprop()
font.init(fonttitle,fontlabel,fontticklabel,fontlegend,fontcbarlabel,fontcbarticklabel)
font.update(fontscale)
pyplot.draw()
#plt.figure(linewidth=30.5)
if xlim is not None:
ax.set(xlim=xlim)
if ylim is not None:
ax.set(ylim=ylim)
#pyplot.draw()
if xbins is not None:
locator = MaxNLocator(nbins=xbins)
locator.set_axis(ax.xaxis)
ax.set_xticks(locator())
if ybins is not None:
locator = MaxNLocator(nbins=ybins)
locator.set_axis(ax.yaxis)
ax.set_yticks(locator())
pyplot.draw()
ax.set_xticks(ax.get_xticks())
ax.set_yticks(ax.get_yticks())
ax.set_xlabel(xlabel,fontdict = font.fontlabel,labelpad=(fontsize-1)*fontscale)
ax.set_ylabel(ylabel,fontdict = font.fontlabel,labelpad=(fontsize-1)*fontscale)
if (rotation is not None) & (xticklabel is not None) :
ax.set_xticklabels(xticklabel,fontticklabel,rotation=rotation)
elif (xticklabel is not None) &(rotation is None):
ax.set_xticklabels(xticklabel,fontticklabel)
elif (xticklabel is None) &(rotation is None):
ax.set_xticklabels(ax.get_xticklabels(),fontticklabel)
elif (rotation is not None) & (xticklabel is None):
ax.set_xticklabels(ax.get_xticklabels(),fontticklabel,rotation=rotation)
ax.set_yticklabels(ax.get_yticklabels(),font.fontticklabel)
if moveyaxis is True:
#fontticklabel
ax.spines['left'].set_position(('data',0))
ax.spines['left'].set_visible(left)
ax.spines['right'].set_visible(not left)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_linewidth(0.5*fontscale)
ax.spines['bottom'].set_linewidth(0.5*fontscale)
ax.spines['left'].set_linewidth(0.5*fontscale)
ax.spines['bottom'].set_color('k')
ax.spines['left'].set_color('k')
ax.spines['right'].set_color('k')
ax.tick_params(direction='out', pad=2*fontscale,width=0.5*fontscale)
#ax.spines['bottom']._edgecolor="#000000"
#ax.spines['left']._edgecolor="#000000"
if title is not None:
ax.set_title(title,fontdict = font.fonttitle)
if legendscale is True:
if (h is None)&(l is None):
legend = ax.legend(prop=font.fontlegend,
bbox_to_anchor=bbox_to_anchor,
labelspacing=labelspacing,borderpad=borderpad,handletextpad=handletextpad,
edgecolor="#000000",fancybox=False,markerscale=markerscale,**kwards)
else:
legend = ax.legend(h,l,prop=font.fontlegend,
bbox_to_anchor=bbox_to_anchor,
labelspacing=labelspacing,borderpad=borderpad,handletextpad=handletextpad,
edgecolor="#000000",fancybox=False,markerscale=markerscale,**kwards)
if legendtitle is not None:
#if legendloc is None:
# legendloc="best"
legend = ax.legend(title=legendtitle,prop=font.fontlegend,
bbox_to_anchor=bbox_to_anchor,
labelspacing=labelspacing,borderpad=borderpad,handletextpad=handletextpad,
edgecolor="#000000",fancybox=False,markerscale=markerscale,**kwards)
ax.legend_.get_frame()._linewidth=0.5*fontscale
legend.get_title().set_fontweight('normal')
legend.get_title().set_fontsize(fontscale*fontsize)
if legendsort is True:
# h: handle l:label
h,l = ax.get_legend_handles_labels()
l,h = zip(*sorted(zip(l,h), key=lambda t: int(t[0])))
legend = ax.legend(h,l,title=legendtitle,prop=font.fontlegend,
bbox_to_anchor=bbox_to_anchor,
labelspacing=labelspacing,borderpad=borderpad,handletextpad=handletextpad,
edgecolor="#000000",fancybox=False,markerscale=markerscale,**kwards)
ax.legend_.get_frame()._linewidth=0.5*fontscale
legend.get_title().set_fontweight('normal')
legend.get_title().set_fontsize(fontscale*fontsize)
if sns is True:
h,l = ax.get_legend_handles_labels()
#l,h = zip(*sorted(zip(l,h), key=lambda t: int(t[0])))
legend = ax.legend(h[1:],l[1:],title=legendtitle,prop=font.fontlegend,
bbox_to_anchor=bbox_to_anchor,
labelspacing=labelspacing,borderpad=borderpad,handletextpad=handletextpad,
edgecolor="#000000",fancybox=False,markerscale=markerscale,**kwards)
ax.legend_.get_frame()._linewidth=0.5*fontscale
legend.get_title().set_fontweight('normal')
legend.get_title().set_fontsize(fontscale*fontsize)
elif legend_adj is True:
legend = ax.legend(handles=h,labels=l,title=legendtitle,prop=font.fontlegend,
bbox_to_anchor=bbox_to_anchor,
labelspacing=labelspacing,borderpad=borderpad,handletextpad=handletextpad,
edgecolor="#000000",fancybox=False,markerscale=markerscale,**kwards)
ax.legend_.get_frame()._linewidth=0.5*fontscale
legend.get_title().set_fontweight('normal')
legend.get_title().set_fontsize(fontscale*fontsize)
if cbar is not None:
#locator, formatter = cbar._get_ticker_locator_formatter()
#ticks, ticklabels, offset_string = cbar._ticker(locator, formatter)
#cbar.ax.spines['top'].set_visible(False)
#cbar.ax.spines['right'].set_visible(False)
#cbar.ax.spines['bottom'].set_visible(False)
#cbar.ax.spines['left'].set_visible(False)
cbar.ax.tick_params(direction='out', pad=3*fontscale,width=0*fontscale,length=0*fontscale)
cbar.set_label(cbarlabel,fontdict = font.fontcbarlabel,Rotation=270,labelpad=fontscale*(fontsize+1))
cbar.ax.set_yticks(cbar.ax.get_yticks())
cbar.ax.set_yticklabels(cbar.ax.get_yticklabels(),font.fontcbarticklabel)
font.reset(fontscale)
return ax
#setup figure template
figure_template_path = 'bin'
if figure_template_path not in sys.path:
sys.path.append(figure_template_path)
from importlib import reload
import figure_template
#force reload of the module
reload(figure_template)
from figure_template import display_dataframe, embed_pdf_figure, embed_pdf_pages,std_plot
# use a tab
for i in range(3):
print(i)
# use 2 spaces
for i in range(3):
print(i)
# use 4 spaces
for i in range(3):
print(i)
print("The \n makes a new line")
print("The \t is a tab")
print('I\'m going to the movies')
import numpy as np
a = np.array([1, 2, 3]) # Create a rank 1 array
print(type(a)) # Prints "<class 'numpy.ndarray'>"
print(a.shape) # Prints "(3,)"
print(a[0], a[1], a[2]) # Prints "1 2 3"
a[0] = 5 # Change an element of the array
print(a) # Prints "[5, 2, 3]"
b = np.array([[1,2,3],[4,5,6]]) # Create a rank 2 array
print(b.shape) # Prints "(2, 3)"
print(b[0, 0], b[0, 1], b[1, 0]) # Prints "1 2 4"
a = np.zeros((2,2)) # Create an array of all zeros
print(a) # Prints "[[ 0. 0.]
# [ 0. 0.]]"
b = np.ones((1,2)) # Create an array of all ones
print(b) # Prints "[[ 1. 1.]]"
c = np.full((2,2), 7) # Create a constant array
print(c) # Prints "[[ 7. 7.]
# [ 7. 7.]]"
d = np.eye(2) # Create a 2x2 identity matrix
print(d) # Prints "[[ 1. 0.]
# [ 0. 1.]]"
e = np.random.random((2,2)) # Create an array filled with random values
print(e) # Might print "[[ 0.91940167 0.08143941]
# [ 0.68744134 0.87236687]]"
x = np.linspace(0, 2 * np.pi, 50)
plt.plot(x, np.sin(x))
#use ax and figure
fig,ax=plt.subplots(figsize=(8,3))
x = np.linspace(0, 2 * np.pi, 50)
ax.plot(x, np.sin(x))
#use ax and figure
fig,ax=plt.subplots(2,3,figsize=(18,6))
for i in range(2):
for j in range(3):
x = np.linspace(0, 2 * np.pi * (i*3+j+1), 50)
ax[i,j].plot(x, np.sin(x))
fig,ax=plt.subplots(figsize=(4,4))
x = np.random.rand(100)
y = np.random.rand(100)
size = np.random.rand(100) * 50
colour = np.random.rand(100)
scatter = ax.scatter(x, y, size, colour)
fig.colorbar(scatter)
fig,ax=plt.subplots(figsize=(6,3))
x = np.random.randn(1000)
ax.hist(x, 50)
import numpy as np
import pandas as pd
np.random.seed(44)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
# Let us also get tableau colors we defined earlier:
tableau_20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),
(44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),
(148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),
(227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),
(188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]
# Scaling above RGB values to [0, 1] range, which is Matplotlib acceptable format:
for i in range(len(tableau_20)):
r, g, b = tableau_20[i]
tableau_20[i] = (r / 255., g / 255., b / 255.)
widgets.SelectMultiple(
options=['Ubuntu', 'CentOS','RedHat','Raspberry','Windows10', 'Mac OS Majove'],
description='OS:',
)
import datetime
dates = [datetime.date(2015,i,1) for i in range(1,13)]
options = [(i.strftime('%b'), i) for i in dates]
widgets.SelectionRangeSlider(
options=options,
index=(0,11),
description='Months (2015)',
disabled=False
)
def plot_bar_by_rna(fig,ax,table,rnaname,savefig=1, statistics = 'ratio',height = 4, width=20):
'''
table: ratio or count table, rows are rna type
statistics: ratio or count
'''
table = table.T
count = np.array(table[table.index ==rnaname]).ravel()
#fig,ax=plt.subplots(1,figsize=(width,height))
counttable = pd.DataFrame(np.concatenate((np.arange(1,table.shape[1]+1).reshape(-1,1),
count[np.argsort(-count)].reshape(-1,1)),axis=1),columns=['sample',statistics])
sns.barplot(ax=ax,x='sample',y=statistics,data = counttable,color=Category20c[20][np.random.randint(0,20)],alpha=1)
ax.set_xticks(np.arange(0,table.shape[1],5))
ax.set_xticklabels(np.arange(0,table.shape[1],5))
ax.set_title(statistics+' of '+rnaname,fontsize=15)
if savefig:
fig.savefig(save_path+'sample_'+rnaname+'_'+statistics+'_bar_plot'+saveformat, bbox_inches='tight')
def plot_bar_by_rna_total(table,datatype='ratio'):
fignum= table.columns.shape[0]
fig,ax=plt.subplots(fignum ,1,figsize=(7 , 24 ))
for i in range(fignum):
plot_bar_by_rna(fig,ax[i],table,table.columns[i],statistics = datatype)
std_plot(ax[i],'sample','ratio',datatype+' of '+table.columns[i])
fig.tight_layout()
def volcano_plot():
for compare_group in compare_group_list:
if np.isin( compare_group,compare_list_use):
detable = pd.read_table('output/'+dataset+'/differential_expression/'+exp_mx_name+'/'+compare_group+'/deseq2.txt'
,index_col=0)
de_plot_mx = pd.DataFrame(data={'feature':[name.split('|')[0] for name in detable.index.values],
'log2FoldChange':detable['log2FoldChange'].tolist(),
'padj':detable['padj'].tolist()})
de_plot_mx.set_index('feature',inplace=True)
de_plot_mx['threshold'] = (abs(de_plot_mx['log2FoldChange'])>1) & (de_plot_mx['padj']<0.05)
de_plot_mx['-log10(q values)'] = [-math.log10(qvalue) for qvalue in de_plot_mx['padj'].tolist()]
#de_plot_mx.iloc[np.where(de_plot_mx['threshold']==True)]
de_plot_mx['color'] = de_plot_mx['threshold']
for i in np.where(de_plot_mx['threshold']==True):
de_plot_mx['color'][i]='#DA706F'
for i in np.where(de_plot_mx['threshold']==False):
de_plot_mx['color'][i]='#5876B9'
fig,ax=plt.subplots(figsize=(4,4))
ax.scatter(de_plot_mx['log2FoldChange'], de_plot_mx['-log10(q values)'], c=de_plot_mx['color'],s=10,alpha=0.8,edgecolors='none')
ax.vlines(-1,-1,10,linewidth=0.5)
ax.vlines(1,-1,10,linewidth=0.5)
ax.hlines(-math.log10(0.05),-4,4,linewidth=0.5)
std_plot(ax,'log2FoldChange','-log10(q values)','volcano plot of '+compare_group,ylim=[-2,18])
ax.tick_params(direction='out', pad=2)
fig.tight_layout()
fig.savefig(save_path+'volcano_plot_of_'+compare_group+saveformat)
#embed_pdf_figure()
volcano_plot()
def heamap_plot(mat,sample_class=sample_classes):
if sequencing_type=='short':
norm_mx = mat/mat.sum(axis=0)*10e6
norm_mx = (norm_mx+0.01).apply(np.log2,0)
if sequencing_type=='long':
norm_mx = mat/mat.sum(axis=0)*10e6
length = np.array([mat.index[i].split('|')[-1] for i in range(mat.index.shape[0])]).astype('int')-\
np.array([mat.index[i].split('|')[-2] for i in range(mat.index.shape[0])]).astype('int')
norm_mx = (norm_mx.T/length).T*1000
norm_mx = (norm_mx+0.01).apply(np.log2,0)
for compare_group in tqdm(compare_group_list):
if np.isin( compare_group,compare_list_use):
if dataset=='scirep':
if compare_group=='Normal-CRC_S1':
class_compare = np.array([ 'Colorectal Cancer Stage 1', 'Healthy Control'])
if compare_group=='Normal-CRC_S1':
class_compare = np.array([ 'Colorectal Cancer Stage 1', 'Healthy Control'])
if compare_group=='Normal-CRC_S1':
class_compare = np.array([ 'Colorectal Cancer Stage 1', 'Healthy Control'])
if compare_group=='Normal-CRC_S1':
class_compare = np.array([ 'Colorectal Cancer Stage 1', 'Healthy Control'])
if compare_group == 'Normal-CRC':
class_select = ['Healthy Control','Colorectal Cancer']
if compare_group == 'Normal-PAAD':
class_select = ['Healthy Control','Pancreatic Cancer']
if compare_group == 'Normal-PRAD':
class_select = ['Healthy Control','Prostate Cancer']
elif dataset=='exorbase':
if compare_group == 'Normal-CRC':
class_select = ['Healthy','CRC']
if compare_group == 'Normal-HCC':
class_select = ['Healthy','HCC']
if compare_group == 'Normal-PAAD':
class_select = ['Healthy','PAAD']
elif dataset=='lulab_hcc':
if compare_group == 'Normal-HCC':
class_select = ['Normal','HCC']
sample_class = sample_classes
if compare_group == 'Normal-stage_A':
class_select = ['Normal','stage_A']
sample_class = sample_class_stage
elif dataset=='pico_3v3':
if compare_group == 'Normal-CRC':
class_select = ['Control','CRC']
print (compare_group)
norm_mx_delete = norm_mx
norm_mx_delete = norm_mx_delete.loc[:,np.array(sample_class.iloc[np.where(np.isin(sample_class['label'],class_select)==1)[0]].index)]
norm_z_mx = norm_mx_delete.apply(scipy.stats.zscore,1)
detable = pd.read_table('output/'+dataset+'/differential_expression/'+exp_mx_name+'/'+compare_group+'/deseq2.txt'
,index_col=0)
de_plot_mx = pd.DataFrame(data={'feature':[name.split('|')[0] for name in detable.index.values],
'log2FoldChange':detable['log2FoldChange'].tolist(),
'padj':detable['padj'].tolist()})
de_plot_mx.set_index('feature',inplace=True)
de_plot_mx['threshold'] = (abs(de_plot_mx['log2FoldChange'])>1) & (de_plot_mx['padj']<0.05)
de_plot_mx['-log10(q values)'] = [-math.log10(qvalue) for qvalue in de_plot_mx['padj'].tolist()]
detable['-log10(q values)']=-np.log10(detable['padj'])
detable['metrics']=detable['-log10(q values)']*np.abs(detable['log2FoldChange'])
de_mx = pd.DataFrame(norm_z_mx).loc[detable.sort_values('metrics',ascending=False).iloc[np.where(de_plot_mx.sort_values('-log10(q values)',ascending=False)['threshold']==True)].index]
type_select =pd.DataFrame(sample_class.loc[de_mx.columns])
column_colors = np.zeros(type_select.shape[0]).astype('str')
column_colors[np.where(type_select.label==class_select[0])] = np.array(["#DA706F"])
column_colors[np.where(type_select.label==class_select[1])] = np.array(["#5876B9"])
tmpind = de_mx.index
tmpind = np.array([tmpind[i].split('|')[0]+'|'+tmpind[i].split('|')[1]+'|'+tmpind[i].split('|')[2] for i in range(tmpind.shape[0])])
de_mx.index = tmpind
de_mx = de_mx.fillna(0)
g = sns.clustermap(de_mx.iloc[:20], row_cluster=False,cmap="vlag",
col_colors=column_colors,linewidths=.005,vmax=3,vmin=-3)#,z_score=0)
g.savefig(save_path+compare_group+'_DE_heatmap'+saveformat)
#with embed_pdf_data() as data:
# g.savefig(data, format='pdf', metadata={'Title': compare_group})
heamap_plot(original_mx)
cpm_table_origin_ = pd.read_table('output/'+dataset+'/count_matrix/'+exp_mx_name+'.txt',index_col=0)
cpm_table_origin = cpm_table_origin_/cpm_table_origin_.sum(axis=0)*10e6
length_tmp = np.array([cpm_table_origin.index[i].split('|')[-1] for i in range(cpm_table_origin.index.shape[0])]).astype('int')-\
np.array([cpm_table_origin.index[i].split('|')[-2] for i in range(cpm_table_origin.index.shape[0])]).astype('int')
rpkm_table_origin = (cpm_table_origin.T/length_tmp*1000).T
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
viridisBig = cm.get_cmap('BuGn', 512)
newcmp = ListedColormap(viridisBig(np.linspace(0.4, 1, 256)))
def DE_scatter(area_=(6.0,8.0),nameshort=True,savefig=True,DE_method='deseq2',up_regulated=1):
for compare_group in compare_group_list:
if np.isin( compare_group,compare_list_use):
detable = pd.read_table('output/'+dataset+'/differential_expression/'+exp_mx_name+'/'+compare_group+'/'+DE_method+'.txt'
,index_col=0)
de_plot_mx = pd.DataFrame(data={'feature':detable.index,
'log2FoldChange':detable['log2FoldChange'].tolist(),
'padj':detable['padj'].tolist()})
de_plot_mx.set_index('feature',inplace=True)
if up_regulated:
de_plot_mx['threshold'] = (de_plot_mx['log2FoldChange']>1) & (de_plot_mx['padj']<0.05)
else:
de_plot_mx['threshold'] = (abs(de_plot_mx['log2FoldChange'])>1) & (de_plot_mx['padj']<0.05)
de_plot_mx['-log10(q values)'] = [-math.log10(qvalue) for qvalue in de_plot_mx['padj'].tolist()]
de_plot_mx['color'] = de_plot_mx['threshold']
for i in np.where(de_plot_mx['threshold']==True):
de_plot_mx['color'][i]='#DA706F'
for i in np.where(de_plot_mx['threshold']==False):
de_plot_mx['color'][i]='#5876B9'
if dataset=='scirep':
rpkmtable = cpm_table_origin
matrix_type='cpm'
elif dataset=='lulab_hcc':
rpkmtable = cpm_table_origin
matrix_type='cpm'
elif dataset=='exorbase':
rpkmtable = rpkm_table_origin
matrix_type='rpkm'
elif dataset=='pico_3v3':
rpkmtable = rpkm_table_origin
matrix_type='rpkm'
de_plot_mx['metrics']=de_plot_mx['-log10(q values)']*np.abs(de_plot_mx['log2FoldChange'])
de_plot_mx = de_plot_mx.sort_values(['metrics'],ascending=0).iloc[:10]
de_plot_mx = de_plot_mx.sort_values(['log2FoldChange'],ascending=0)
rpkmtable = rpkmtable.loc[de_plot_mx.index]
de_plot_mx['log2RPKM'] = np.mean(log_transfrom(rpkmtable),axis=1)
de_plot_mx.index = np.array([name.split('|')[2]+'|'+name.split('|')[1]
#+'|'+name.split('|')[3]+'|'+name.split('|')[4]
for name in de_plot_mx.index.values])
fig, (ax) = plt.subplots(1, figsize=(6,3))
im = ax.scatter(de_plot_mx['log2FoldChange'],de_plot_mx.index,s=((de_plot_mx['log2RPKM']/area_[0]-0.5)*area_[1])**2,c=de_plot_mx['-log10(q values)'],cmap=newcmp)
cbar =fig.colorbar(im, ax=ax)
cbar.outline.set_visible(False)
interval = np.max(de_plot_mx.log2RPKM) - np.min(de_plot_mx.log2RPKM)
ratiointer = interval/2
pws = set(np.round(np.arange(np.min(de_plot_mx.log2RPKM),np.max(de_plot_mx.log2RPKM),ratiointer),0).astype(int))
for pw in pws:
ax.scatter([], [], s=((pw/area_[0]-0.5)*area_[1])**2, c="k",label=str(pw))
ax = std_plot(ax,'Fold Change','Feature Name','DE bar plot of '+compare_group+' '+DE_method,'log('+matrix_type+')',
borderpad=0.1,labelspacing=0.2,handletextpad=1,cbar=cbar,cbarlabel='-log10(q values)',xlim=[np.min(de_plot_mx['log2FoldChange'])-0.1,np.max(de_plot_mx['log2FoldChange'])+0.1])
fig.tight_layout()
fig.savefig(save_path+'DE_bar_plot_of_'+compare_group+'.'+DE_method+saveformat)
#embed_pdf_figure()
de_plot_mx.to_csv(save_path+'DE_selected_features.'+compare_group+'.'+DE_method+'.txt',sep='\t')
DE_scatter()
filter_mx=pd.read_table('output/'+dataset+'/matrix_processing/filter.'+exp_mx_name+'.txt')
def div_abu_plot(expression_mx,savefig=True):
total_counts = expression_mx.shape[0]
type_counts_sample = pd.DataFrame()
for samplename in expression_mx.columns.values:
filter_zero_samplename = expression_mx.iloc[np.where(expression_mx[samplename]>0)[0],:]
names = filter_zero_samplename.index
names_type = np.array([names[i].split('|')[1] for i in range(names.shape[0])])
type_counts = np.unique(names_type, return_counts = True)
new = pd.DataFrame({'type' : type_counts[0],
samplename : type_counts[1],
})
new = new.set_index('type')
type_counts_sample = pd.concat([type_counts_sample, new], axis=1)#, join_axes=[df1.index]
typelist = np.unique([expression_mx.index[i].split('|')[1] for i in range(expression_mx.shape[0])])
type_mx = pd.DataFrame(index=typelist,columns=expression_mx.columns)
for sample in expression_mx.columns:
sample_feature = pd.DataFrame(data=expression_mx.loc[:,sample],index=expression_mx.index)
sample_feature['type']=[expression_mx.index[i].split('|')[1] for i in range(expression_mx.shape[0])]
for i in typelist:
type_mx.loc[i,sample] = sample_feature.iloc[np.where(sample_feature['type']==i)].iloc[:,0].sum()
table_ratio = (type_mx/type_mx.sum()).T
xticks = type_counts_sample.index.tolist()
Means = type_counts_sample.mean(axis=1).values.tolist()
Std=type_counts_sample.std(axis=1).values.tolist()
mean_sd = pd.DataFrame(data = {'type':xticks,'mean':Means,'std':Std})
mean_sd = mean_sd.sort_values(by='mean',ascending=False)
mean_sd = mean_sd.set_index('type')
Std = [[0]*len(mean_sd['std'].tolist()),mean_sd['std'].tolist()]
ab = table_ratio*100
xticks_ab = ab.columns.tolist()
Means_ab = ab.mean(axis=0).values.tolist()
Std_ab = ab.std(axis=0).values.tolist()
mean_sd_ab = pd.DataFrame(data = {'type':xticks_ab,'mean_ab':Means_ab,'std_ab':Std_ab})
mean_sd_ab = mean_sd_ab.set_index('type')
N = type_counts_sample.shape[0]
ind = np.arange(N)
merge = pd.concat([mean_sd,mean_sd_ab],axis=1,join_axes=[mean_sd.index])
Std_ab = [[0]*len(merge['std_ab'].tolist()),merge['std_ab'].tolist()]
merge = merge.sort_values(by='mean_ab',ascending=False)
xticks = np.array(merge.index.tolist())
tmpname,tmpcount = np.unique(np.array([i.split('|')[1] for i in expression_mx.index]),return_counts=1)
tmpdataframe = pd.DataFrame(np.zeros(tmpname.shape[0]))
tmpdataframe.index = tmpname
tmpdataframe.iloc[:,0] = tmpcount
fig,(ax,ax1) = plt.subplots(1,2,figsize = (5,2.6))
ax.barh(ind,tmpdataframe.loc[merge.index].iloc[:,0],0.7,xerr=None,color=np.array(['#cc3399','#3300ff','#006699','#339999','#66ffcc','#00ff00','#006600','#FFFF00','#FF6600','#FF0000','#FF9999']))
ax.invert_xaxis()
ax.set_yticks(ind)
ax.set_yticks([])
ax = std_plot(ax,'Number of RNA domains/miRNAs','','Number of RNA domains/miRNAs (cfRNA)',left=False,rotation=0,legendscale=False,legend_adj=False)#,ylim=[np.min(ind),np.max(ind)]
ax1.barh(ind,merge['mean_ab'],0.7,xerr=Std_ab,color=np.array(['#cc3399','#3300ff','#006699','#339999','#66ffcc','#00ff00','#006600','#FFFF00','#FF6600','#FF0000','#FF9999']))
ax1.set_yticks(ind)
xticks[np.where(xticks=='Y_RNA')] = 'Y RNA'
ax1.set_yticklabels(xticks)
ax1 = std_plot(ax1,'Percentage of mapped reads (%)', '','Abundance',rotation=0,legendscale=False,legend_adj=False)
ax1.tick_params(direction='out', pad=4,length=0)
fig.tight_layout(w_pad=0.2)
if savefig is True:
fig.savefig(save_path+'diversity_abundance_cfRNA.eps')
#embed_pdf_figure()
return tmpdataframe.loc[merge.index],_
diversity,ratio =div_abu_plot(expression_mx=filter_mx,
savefig=True)
de_table=pd.read_table('output/'+dataset+'/differential_expression/'+exp_mx_name+'/Normal-HCC/deseq2.txt')
rnaname_tmp,count_tmp = np.unique(np.array([i.split('|')[1] for i in de_table[de_table.padj<=0.05].index]),return_counts=1)
div_df = pd.DataFrame([count_tmp]).T
div_df.index = rnaname_tmp
display_dataframe(div_df.loc[pd.DataFrame(diversity).index])
def plot_pie_de(df):
'''
data: table_ratio
rnanames: rna type names
adjustment: merge RNA with small percent together
'''
x = np.array(df.index)
y = (np.array(df.values)+10e-8).ravel()
y = y/y.sum()
z_ = np.array([x[i] + str(' {:.2f}'.format(y[i]*100)+'%') for i in range(y.shape[0])])
z = np.array([float('{:.10f}'.format(y[i]*100)) for i in range(y.shape[0])])
dataframe = pd.DataFrame(np.concatenate((x.reshape(-1,1),z.reshape(-1,1),z_.reshape(-1,1)),axis=1))
dataframe.columns=['rna','percent','label']
dataframe["percent"] = pd.to_numeric(dataframe["percent"])
dataframe['angle'] = dataframe['percent']/dataframe['percent'].sum() * 2*pi
dataframe['color'] = Category20c[len(x)]
p = figure(height=500,width=750, title="Pie Chart", toolbar_location=None,
tools="hover", tooltips="@label", x_range=(-0.5, 1.0))
p.wedge(x=0.14, y=1, radius=0.45,
start_angle=cumsum('angle', include_zero=True), end_angle=cumsum('angle'),
line_color="black", fill_color='color', legend="label", source=dataframe)
p.axis.axis_label=None
p.axis.visible=False
p.grid.grid_line_color = None
show(p)
domain_mx = filter_mx.iloc[np.where(np.array([i.split("|")[1] for i in filter_mx.index])!='genomic')]
domain_size = np.array([int(i.split('|')[-1]) -int(i.split('|')[-2]) for i in domain_mx.index])