diff --git a/mrdna/readers/segmentmodel_from_cadnano.py b/mrdna/readers/segmentmodel_from_cadnano.py index 1d4f7cdf2c3bf1f5b795e96d4c3c0b192186d3d2..610be1702e26e7f32c9c2b6ec82fa6b2ae5d5c0a 100644 --- a/mrdna/readers/segmentmodel_from_cadnano.py +++ b/mrdna/readers/segmentmodel_from_cadnano.py @@ -13,34 +13,17 @@ from ..segmentmodel import SegmentModel, SingleStrandedSegment, DoubleStrandedSe from ..model.dna_sequence import m13 as m13seq import json import re -import cadnano -from cadnano.document import Document -## Only testing on cadnano2.5 + ## TODO: separate SegmentModel from ArbdModel so multiple parts can be combined ## TODO: catch circular strands in "get_5prime" cadnano calls ## TODO: handle special motifs ## - doubly-nicked helices ## - helices that should be stacked across an empty region (crossovers from and end in the helix to another end in the helix) ## - circular constructs -def get_lattice(part): - lattice_type = None - _gt = part.getGridType() - try: - lattice_type = _gt.name.lower() - except: - if _gt == 1: - lattice_type = 'square' - elif _gt == 2: - lattice_type = 'honeycomb' - else: - print("WARNING: unable to determine cadnano part lattice type") - return lattice_type - def read_json_file(filename): - try: with open(filename) as ch: json_data = json.load(ch) @@ -55,7 +38,11 @@ def read_json_file(filename): # l = re.sub(r"(\w):", r'\1":', l) content += l+"\n" json_data = json.loads(content) + return json_data +def cadnano_parts(json_data): + import cadnano + from cadnano.document import Document try: doc = Document() cadnano.fileio.v3decode.decode(doc, json_data) @@ -77,9 +64,23 @@ def read_json_file(filename): for id_num in part.getIdNums(): if part.vh_properties.loc[id_num,'eulerZ'] == 0: part.vh_properties.loc[id_num,'eulerZ'] = 360*(6/10.5) - df=pd.DataFrame(json_data["vstrands"]) - n_df=df.set_index("num") - return part,n_df + df=pd.DataFrame(json_data["vstrands"]) + n_df=df.set_index("num") + return part + +def get_lattice(part): + lattice_type = None + _gt = part.getGridType() + try: + lattice_type = _gt.name.lower() + except: + if _gt == 1: + lattice_type = 'square' + elif _gt == 2: + lattice_type = 'honeycomb' + else: + print("WARNING: unable to determine cadnano part lattice type") + return lattice_type def get_helix_angle(part, helix_id, indices): """ Get "start_orientation" for helix """ @@ -106,81 +107,117 @@ def get_helix_angle(part, helix_id, indices): angle = eulerZ + twist_per_base*indices - 0.5*mgroove return rotationAboutAxis(np.array((0,0,1)),angle) -def nttype(scafs): - def judge(i): - if i ==[-1,-1,-1,-1]: - return 0 - else: return 1 - n=np.array([judge(i) for i in scafs]) - return n - - -def gen_prop_table(json_file): - part,vslist=read_json_file(json_file) - props = part.getModelProperties().copy() - try: - if props.get('point_type') == PointType.ARBITRARY: - # TODO add code to encode Parts with ARBITRARY point configurations - raise NotImplementedError("Not implemented") - except: - try: - vh_props, origins = part.helixPropertiesAndOrigins() - except: - origins = {hid:part.getVirtualHelixOrigin(hid)[:2] for hid in part.getidNums()} - scaf_id=[nttype(vslist['scaf'][i]) for i in vslist.index] - stap_id=[nttype(vslist['stap'][i]) for i in vslist.index] - cad_bps=part.getIndices(0) - vslist["scafnt"]=np.sum(np.array(scaf_id),axis=1) - vslist["stapnt"]=np.sum(np.array(stap_id),axis=1) - totnt=np.sum(vslist["scafnt"])+np.sum(vslist["stapnt"]) - is_scaf=np.zeros(totnt,dtype=bool) - is_scaf[0:np.sum(vslist["scafnt"])]=1 - nt_prop=pd.DataFrame(index=range(totnt),columns=["vh","zid","is_scaf","r","bp","stack","threeprime","seq","orientation"]) - nt_prop["is_scaf"]=is_scaf - tot_id=scaf_id+stap_id - vhi,zidi=np.where(np.array(scaf_id)==1) - vhj,zidj=np.where(np.array(stap_id)==1) - vhi=vslist.index[vhi] - vhj=vslist.index[vhj] - nt_prop["vh"]=list(vhi)+list(vhj) - nt_prop["zid"]=list(zidi)+list(zidj) +def gen_id_series(strand,part): + df=pd.DataFrame(columns=["vh","zid","fwd","stack_tuple","threeprime_tuple","x","y","z"],index=range(strand.totalLength()),dtype=object) + df["vh"]=strand._id_num + df["fwd"]=strand.isForward() + df["x"]=part.getVirtualHelixOrigin(strand._id_num)[0]*10 + df["y"]=part.getVirtualHelixOrigin(strand._id_num)[1]*10 + id_lo,id_hi=strand.idxs() + zids=[str(i) for i in range(id_lo,id_hi+1)] + insert_dict={} + insert_dict=dict([(j.idx(),j.length()) for j in strand.insertionsOnStrand()]) + z=np.arange(id_lo,id_hi+1) + zids=[str(i) for i in range(id_lo,id_hi+1)] + z=list(np.arange(id_lo,id_hi+1)) + zids=[str(i) for i in range(id_lo,id_hi+1)] + for insert_base in insert_dict: + z_ind=zids.index(str(insert_base)) + z_val=insert_dict[insert_base] + z_pos_ind=z.index(insert_base) + zids.pop(z_ind) + z.pop(z_pos_ind) + if z_val!=-1: + #l=[str(insert_base)+"."+str(i) for i in range(z_val+1)] + l=list(range(z_val+1)) + l.reverse() + for k in l: + zids.insert(z_ind,str(insert_base)+"."+str(k)) + z.insert(z_pos_ind,insert_base+k/(z_val+1)) + df["zid"]=zids + df["z"]=np.array(z)*3.4 + + + L=[(df["vh"][i],df["zid"][i],df["fwd"][i]) for i in df.index] + if strand.isForward()==True: + df["stack_tuple"]=L[1:]+[-1] + if strand.connection3p() is None: + df["threeprime_tuple"]=L[1:]+[-1] + else: + df["threeprime_tuple"]=L[1:]+[(strand.connection3p().idNum(),str(strand.connection3p().idx5Prime()),strand.connection3p().isForward())] + + + else: + df["stack_tuple"]=[-1]+L[0:-1] + if strand.connection3p() is None: + df["threeprime_tuple"]=[-1]+L[0:-1] + else: + df["threeprime_tuple"]=[(strand.connection3p().idNum(),str(strand.connection3p().idx5Prime()),strand.connection3p().isForward())]+L[0:-1] + ## cadnano 3.1 sequence assign is wrong if there is insertion or deletion. + df["r"]=[np.array([df["x"][i],df["y"][i],df["z"][i]]) for i in df.index] + + return [pd.Series(df.loc[i]) for i in df.index] + +def gen_prop_table(part,seq_file=None,fill_seq="T"): + strand_set=[] + for i in part.getidNums(): + fwd,rev=part.getStrandSets(i) + [strand_set.append(i) for i in fwd.strands()] + [strand_set.append(i) for i in rev.strands()] + id_series=[] + for i in strand_set: + id_series=id_series+gen_id_series(i,part) + + nt_prop=pd.DataFrame(id_series) + nt_prop.reset_index(names=list(range(len(nt_prop.index))),inplace=True) + + ind_tuple=[(nt_prop["vh"][i],nt_prop["zid"][i],nt_prop["fwd"][i]) for i in nt_prop.index] + stacks=[] + for i in list(nt_prop["stack_tuple"]): + if i ==-1: + stacks.append(i) + else: + stacks.append(ind_tuple.index(i)) + nt_prop["stack"]=stacks + tprime=[] + for i in list(nt_prop["threeprime_tuple"]): + if i ==-1: + tprime.append(i) + else: + tprime.append(ind_tuple.index(i)) + nt_prop["threeprime"]=tprime vhzid=list(zip(nt_prop["vh"],nt_prop["zid"])) - nt_prop["r"]=[part.getCoordinate(helix_id, indices) for helix_id, indices in zip(nt_prop["vh"],nt_prop["zid"])] - nt_prop["orientation"]=[get_helix_angle(part, helix_id, indices) for helix_id,indices in zip(nt_prop["vh"],nt_prop["zid"])] + + nt_prop["orientation"]=[get_helix_angle(part, helix_id, int(float(indices))) for helix_id,indices in vhzid] nt_prop=nt_prop.fillna(-1) - for i in range(int(len(vhzid)/2)): + for i in range(int(len(nt_prop.index)/2)): try: bp1,bp2=(i,1+i+vhzid[i+1:].index(vhzid[i])) nt_prop["bp"][bp1]=bp2 nt_prop["bp"][bp2]=bp1 except: pass - tprime_list=-np.ones(len(nt_prop.index),dtype=int) - index2=list(zip(vhzid,nt_prop["is_scaf"])) - for i in range(len(nt_prop.index)): - ((m,n),p)=list(zip(vhzid,nt_prop["is_scaf"]))[i] - if p==True: - k,l=(vslist["scaf"][m])[n][2:] - if k!=-1 and l!=-1: - n=index2.index(((k,l),True)) - tprime_list[i]=int(n) - - else: - k,l=(vslist["stap"][m])[n][2:] - if k!=-1 and l!=-1: - n=index2.index(((k,l),False)) - tprime_list[i]=int(n) - nt_prop["threeprime"]=tprime_list - (n,)=np.where(nt_prop["threeprime"]==-1) - stackid=nt_prop["bp"][[list(nt_prop["threeprime"]).index(i) for i in n]] - nt_prop["stack"][stackid.index[np.where(np.array(stackid)!=-1)]]=nt_prop["threeprime"][stackid.index[np.where(np.array(stackid)!=-1)]] - ## Todo: sequence - + if seq_file is not None: + #under construction + seq=open(seq_file,"r") + for seqline in seq: + try: + int(seqline[0]) + except: + continue + return nt_prop + def mrdna_model_from_cadnano(json_file,seq=None,**model_parameters): - nt_prop=gen_prop_table(json_file) + json_data=read_json_file(json_file) + part,decoder=cadnano_parts(json_data) + if decoder==2: + nt_prop=gen_prop_table_cad2(json_data,part) + elif decoder==3: + nt_prop=gen_prop_table_cad3(json_data,part) + if seq is None: if nt_prop["seq"][0]==-1: seq=None diff --git a/mrdna/readers/segmentmodel_from_cadnano_old.py b/mrdna/readers/segmentmodel_from_cadnano_old.py new file mode 100644 index 0000000000000000000000000000000000000000..55df7ea08b600b3115ea77dce9a1425b033fce94 --- /dev/null +++ b/mrdna/readers/segmentmodel_from_cadnano_old.py @@ -0,0 +1,207 @@ +# -*- coding: utf-8 -*- +import pdb +import numpy as np +import os,sys +from glob import glob +import re +import pandas as pd +pd.options.mode.chained_assignment = None # default='warn' +from .segmentmodel_from_lists import model_from_basepair_stack_3prime + +from ..arbdmodel.coords import readArbdCoords, readAvgArbdCoords, rotationAboutAxis +from ..segmentmodel import SegmentModel, SingleStrandedSegment, DoubleStrandedSegment +from ..model.dna_sequence import m13 as m13seq +import json +import re + + + +## TODO: separate SegmentModel from ArbdModel so multiple parts can be combined +## TODO: catch circular strands in "get_5prime" cadnano calls +## TODO: handle special motifs +## - doubly-nicked helices +## - helices that should be stacked across an empty region (crossovers from and end in the helix to another end in the helix) +## - circular constructs + +def read_json_file(filename): + try: + with open(filename) as ch: + json_data = json.load(ch) + except: + with open(filename) as ch: + content = "" + for l in ch: + l = re.sub(r"'", r'"', l) + # https://stackoverflow.com/questions/4033633/handling-lazy-json-in-python-expecting-property-name + # l = re.sub(r"{\s*(\w)", r'{"\1', l) + # l = re.sub(r",\s*(\w)", r',"\1', l) + # l = re.sub(r"(\w):", r'\1":', l) + content += l+"\n" + json_data = json.loads(content) + return json_data + +def get_lattice(part): + lattice_type = None + _gt = part.getGridType() + try: + lattice_type = _gt.name.lower() + except: + if _gt == 1: + lattice_type = 'square' + elif _gt == 2: + lattice_type = 'honeycomb' + else: + print("WARNING: unable to determine cadnano part lattice type") + return lattice_type + +def cadnano_parts(json_data): + import cadnano + from cadnano.document import Document + try: + doc = Document() + cadnano.fileio.v3decode.decode(doc, json_data) + decoder = 3 + except: + doc = Document() + cadnano.fileio.v2decode.decode(doc, json_data) + decoder = 2 + + parts = [p for p in doc.getParts()] + if len(parts) != 1: + raise Exception("Only documents containing a single cadnano part are implemented at this time.") + part = parts[0] + + if decoder == 2: + """ It seems cadnano2.5 (as of ce6ff019) does not set the EulerZ for square lattice structures correctly, doing so here """ + l = get_lattice(part) + if l == 'square': + for id_num in part.getIdNums(): + if part.vh_properties.loc[id_num,'eulerZ'] == 0: + part.vh_properties.loc[id_num,'eulerZ'] = 360*(6/10.5) + + return part,decoder + + +def get_helix_angle(part, helix_id, indices): + """ Get "start_orientation" for helix """ + # import ipdb + # ipdb.set_trace() + + """ FROM CADNANO2.5 + + angle is CCW + - angle is CW + Right handed DNA rotates clockwise from 5' to 3' + we use the convention the 5' end starts at 0 degrees + and it's pair is minor_groove_angle degrees away + direction, hence the minus signs. eulerZ + """ + + hp, bpr, tpr, eulerZ, mgroove = part.vh_properties.loc[helix_id, + ['helical_pitch', + 'bases_per_repeat', + 'turns_per_repeat', + 'eulerZ', + 'minor_groove_angle']] + twist_per_base = tpr*360./bpr + # angle = eulerZ - twist_per_base*indices + 0.5*mgroove + 180 + angle = eulerZ + twist_per_base*indices - 0.5*mgroove + return rotationAboutAxis(np.array((0,0,1)),angle) + +def nttype_v2(scafs): + def judge(i): + if i ==[-1,-1,-1,-1]: + return 0 + else: return 1 + n=np.array([judge(i) for i in scafs]) + return n + + +def gen_prop_table_cad2(json_data,part): + + df=pd.DataFrame(json_data["vstrands"]) + vslist=df.set_index("num") + props = part.getModelProperties().copy() + + try: + if props.get('point_type') == PointType.ARBITRARY: + # TODO add code to encode Parts with ARBITRARY point configurations + raise NotImplementedError("Not implemented") + except: + try: + vh_props, origins = part.helixPropertiesAndOrigins() + except: + origins = {hid:part.getVirtualHelixOrigin(hid)[:2] for hid in part.getidNums()} + scaf_id=[nttype_v2(vslist['scaf'][i]) for i in vslist.index] + stap_id=[nttype_v2(vslist['stap'][i]) for i in vslist.index] + cad_bps=part.getIndices(0) + vslist["scafnt"]=np.sum(np.array(scaf_id),axis=1) + vslist["stapnt"]=np.sum(np.array(stap_id),axis=1) + totnt=np.sum(vslist["scafnt"])+np.sum(vslist["stapnt"]) + is_scaf=np.zeros(totnt,dtype=bool) + is_scaf[0:np.sum(vslist["scafnt"])]=1 + nt_prop=pd.DataFrame(index=range(totnt),columns=["vh","zid","is_scaf","r","bp","stack","threeprime","seq","orientation"]) + nt_prop["is_scaf"]=is_scaf + tot_id=scaf_id+stap_id + vhi,zidi=np.where(np.array(scaf_id)==1) + vhj,zidj=np.where(np.array(stap_id)==1) + vhi=vslist.index[vhi] + vhj=vslist.index[vhj] + nt_prop["vh"]=list(vhi)+list(vhj) + nt_prop["zid"]=list(zidi)+list(zidj) + vhzid=list(zip(nt_prop["vh"],nt_prop["zid"])) + nt_prop["r"]=[part.getCoordinate(helix_id, indices) for helix_id, indices in zip(nt_prop["vh"],nt_prop["zid"])] + nt_prop["orientation"]=[get_helix_angle(part, helix_id, indices) for helix_id,indices in zip(nt_prop["vh"],nt_prop["zid"])] + nt_prop=nt_prop.fillna(-1) + for i in range(int(len(vhzid)/2)): + try: + bp1,bp2=(i,1+i+vhzid[i+1:].index(vhzid[i])) + nt_prop["bp"][bp1]=bp2 + nt_prop["bp"][bp2]=bp1 + except: + pass + tprime_list=-np.ones(len(nt_prop.index),dtype=int) + index2=list(zip(vhzid,nt_prop["is_scaf"])) + for i in range(len(nt_prop.index)): + ((m,n),p)=list(zip(vhzid,nt_prop["is_scaf"]))[i] + if p==True: + k,l=(vslist["scaf"][m])[n][2:] + if k!=-1 and l!=-1: + n=index2.index(((k,l),True)) + tprime_list[i]=int(n) + + else: + k,l=(vslist["stap"][m])[n][2:] + if k!=-1 and l!=-1: + n=index2.index(((k,l),False)) + tprime_list[i]=int(n) + nt_prop["threeprime"]=tprime_list + #(n,)=np.where(nt_prop["threeprime"]==-1) + #stackid=nt_prop["bp"][[list(nt_prop["threeprime"]).index(i) for i in n]] + index3=dict(zip(nt_prop.index,zip(nt_prop["vh"],nt_prop["is_scaf"]))) + index3[-1]=(-1,-1) + nt_prop["stack"]=np.where([index3[i]==index3[nt_prop["threeprime"][i]] for i in (nt_prop.index)],nt_prop["threeprime"],-1) + + return nt_prop + + +def mrdna_model_from_cadnano(json_file,seq=None,**model_parameters): + json_data=read_json_file(json_file) + part,decoder=cadnano_parts(json_data) + if decoder==2: + nt_prop=gen_prop_table_cad2(json_data,part) + elif decoder==3: + nt_prop=gen_prop_table_cad3(json_data,part) + + if seq is None: + if nt_prop["seq"][0]==-1: + seq=None + else: + seq=nt_prop["seq"] + + r=np.array(list(nt_prop['r'])) + bp=np.array(list(nt_prop['bp'])) + three_prime=np.array((list(nt_prop["threeprime"]))) + stack=np.array((list(nt_prop["stack"]))) + orientation=np.array(list(nt_prop["orientation"])) + model = model_from_basepair_stack_3prime( r, bp, stack, three_prime, seq, orientation=None, **model_parameters ) + return model