update

219c598c · pinyili2 · 534a875f · 219c598c · 219c598c
Commit 219c598c authored 7 months ago by pinyili2
--- a/mrdna/readers/segmentmodel_from_cadnano.py
+++ b/mrdna/readers/segmentmodel_from_cadnano.py
@@ -13,34 +13,17 @@ from ..segmentmodel import SegmentModel, SingleStrandedSegment, DoubleStrandedSe
 from ..model.dna_sequence import m13 as m13seq
 import json
 import re
-import cadnano
-from cadnano.document import Document
-## Only testing on cadnano2.5
 ## TODO: separate SegmentModel from ArbdModel so multiple parts can be combined
 ## TODO: catch circular strands in "get_5prime" cadnano calls
 ## TODO: handle special motifs
 ##   - doubly-nicked helices
 ##   - helices that should be stacked across an empty region (crossovers from and end in the helix to another end in the helix)
 ##   - circular constructs
-def get_lattice(part):
-    lattice_type = None
-    _gt = part.getGridType()
-    try:
-        lattice_type = _gt.name.lower()
-    except:
-        if _gt == 1:
-            lattice_type = 'square'
-        elif _gt == 2:
-            lattice_type = 'honeycomb'
-        else:
-            print("WARNING: unable to determine cadnano part lattice type")
-    return lattice_type
 def read_json_file(filename):
    try:
        with open(filename) as ch:
            json_data = json.load(ch)
@@ -55,7 +38,11 @@ def read_json_file(filename):
                # l = re.sub(r"(\w):", r'\1":', l)
                content += l+"\n"
            json_data = json.loads(content)
+    return json_data
+def cadnano_parts(json_data):
+    import cadnano
+    from cadnano.document import Document
    try:
        doc = Document()
        cadnano.fileio.v3decode.decode(doc, json_data)
@@ -77,9 +64,23 @@ def read_json_file(filename):
            for id_num in part.getIdNums():
                if part.vh_properties.loc[id_num,'eulerZ'] == 0:
                    part.vh_properties.loc[id_num,'eulerZ'] = 360*(6/10.5)
-    df=pd.DataFrame(json_data["vstrands"])
+        df=pd.DataFrame(json_data["vstrands"])
-    n_df=df.set_index("num")
+        n_df=df.set_index("num")
-    return part,n_df
+    return part
+def get_lattice(part):
+    lattice_type = None
+    _gt = part.getGridType()
+    try:
+        lattice_type = _gt.name.lower()
+    except:
+        if _gt == 1:
+            lattice_type = 'square'
+        elif _gt == 2:
+            lattice_type = 'honeycomb'
+        else:
+            print("WARNING: unable to determine cadnano part lattice type")
+    return lattice_type
 def get_helix_angle(part, helix_id, indices):
    """ Get "start_orientation" for helix """
@@ -106,81 +107,117 @@ def get_helix_angle(part, helix_id, indices):
    angle = eulerZ + twist_per_base*indices - 0.5*mgroove
    return rotationAboutAxis(np.array((0,0,1)),angle)
-def nttype(scafs):
+def gen_id_series(strand,part):
-    def judge(i):
+    df=pd.DataFrame(columns=["vh","zid","fwd","stack_tuple","threeprime_tuple","x","y","z"],index=range(strand.totalLength()),dtype=object)
-        if i ==[-1,-1,-1,-1]:
+    df["vh"]=strand._id_num
-            return 0
+    df["fwd"]=strand.isForward()
-        else: return 1
+    df["x"]=part.getVirtualHelixOrigin(strand._id_num)[0]*10
-    n=np.array([judge(i) for i in scafs])
+    df["y"]=part.getVirtualHelixOrigin(strand._id_num)[1]*10
-    return n
+    id_lo,id_hi=strand.idxs()
+    zids=[str(i) for i in range(id_lo,id_hi+1)]
+    insert_dict={}
-def gen_prop_table(json_file):
+    insert_dict=dict([(j.idx(),j.length()) for j in strand.insertionsOnStrand()])
-    part,vslist=read_json_file(json_file)
+    z=np.arange(id_lo,id_hi+1)
-    props = part.getModelProperties().copy()
+    zids=[str(i) for i in range(id_lo,id_hi+1)]
-    try:
+    z=list(np.arange(id_lo,id_hi+1))
-        if props.get('point_type') == PointType.ARBITRARY:
+    zids=[str(i) for i in range(id_lo,id_hi+1)]
-            # TODO add code to encode Parts with ARBITRARY point configurations
+    for insert_base in insert_dict:
-            raise NotImplementedError("Not implemented")
+        z_ind=zids.index(str(insert_base))
-    except:
+        z_val=insert_dict[insert_base]
-        try:
+        z_pos_ind=z.index(insert_base)
-            vh_props, origins = part.helixPropertiesAndOrigins()
+        zids.pop(z_ind)
-        except:
+        z.pop(z_pos_ind)
-            origins = {hid:part.getVirtualHelixOrigin(hid)[:2] for hid in part.getidNums()}
+        if z_val!=-1:
-    scaf_id=[nttype(vslist['scaf'][i]) for i in vslist.index]
+            #l=[str(insert_base)+"."+str(i) for i in range(z_val+1)]
-    stap_id=[nttype(vslist['stap'][i]) for i in vslist.index]
+            l=list(range(z_val+1))
-    cad_bps=part.getIndices(0)
+            l.reverse()
-    vslist["scafnt"]=np.sum(np.array(scaf_id),axis=1)
+            for k in l:                
-    vslist["stapnt"]=np.sum(np.array(stap_id),axis=1)
+                zids.insert(z_ind,str(insert_base)+"."+str(k))
-    totnt=np.sum(vslist["scafnt"])+np.sum(vslist["stapnt"])
+                z.insert(z_pos_ind,insert_base+k/(z_val+1))
-    is_scaf=np.zeros(totnt,dtype=bool)
+    df["zid"]=zids
-    is_scaf[0:np.sum(vslist["scafnt"])]=1
+    df["z"]=np.array(z)*3.4
-    nt_prop=pd.DataFrame(index=range(totnt),columns=["vh","zid","is_scaf","r","bp","stack","threeprime","seq","orientation"])
-    nt_prop["is_scaf"]=is_scaf
-    tot_id=scaf_id+stap_id
+    L=[(df["vh"][i],df["zid"][i],df["fwd"][i]) for i in df.index]
-    vhi,zidi=np.where(np.array(scaf_id)==1)
+    if strand.isForward()==True:
-    vhj,zidj=np.where(np.array(stap_id)==1)
+        df["stack_tuple"]=L[1:]+[-1]
-    vhi=vslist.index[vhi]
+        if strand.connection3p() is None:
-    vhj=vslist.index[vhj]
+            df["threeprime_tuple"]=L[1:]+[-1]
-    nt_prop["vh"]=list(vhi)+list(vhj)
+        else:
-    nt_prop["zid"]=list(zidi)+list(zidj)
+            df["threeprime_tuple"]=L[1:]+[(strand.connection3p().idNum(),str(strand.connection3p().idx5Prime()),strand.connection3p().isForward())]
+    else:
+        df["stack_tuple"]=[-1]+L[0:-1]
+        if strand.connection3p() is None:
+            df["threeprime_tuple"]=[-1]+L[0:-1]
+        else:
+            df["threeprime_tuple"]=[(strand.connection3p().idNum(),str(strand.connection3p().idx5Prime()),strand.connection3p().isForward())]+L[0:-1]
+    ## cadnano 3.1 sequence assign is wrong if there is insertion or deletion. 
+    df["r"]=[np.array([df["x"][i],df["y"][i],df["z"][i]]) for i in df.index]
+    return [pd.Series(df.loc[i]) for i in df.index]
+def gen_prop_table(part,seq_file=None,fill_seq="T"):
+    strand_set=[]
+    for i in part.getidNums():
+        fwd,rev=part.getStrandSets(i)
+        [strand_set.append(i) for i in fwd.strands()]
+        [strand_set.append(i) for i in rev.strands()]
+    id_series=[]
+    for i in strand_set:
+        id_series=id_series+gen_id_series(i,part)
+    nt_prop=pd.DataFrame(id_series)
+    nt_prop.reset_index(names=list(range(len(nt_prop.index))),inplace=True)
+    ind_tuple=[(nt_prop["vh"][i],nt_prop["zid"][i],nt_prop["fwd"][i]) for i in nt_prop.index]
+    stacks=[]
+    for i in list(nt_prop["stack_tuple"]):
+        if i ==-1:
+            stacks.append(i)
+        else:
+            stacks.append(ind_tuple.index(i))
+    nt_prop["stack"]=stacks
+    tprime=[]
+    for i in list(nt_prop["threeprime_tuple"]):
+        if i ==-1:
+            tprime.append(i)
+        else:
+            tprime.append(ind_tuple.index(i))
+    nt_prop["threeprime"]=tprime
    vhzid=list(zip(nt_prop["vh"],nt_prop["zid"]))
-    nt_prop["r"]=[part.getCoordinate(helix_id, indices) for helix_id, indices in zip(nt_prop["vh"],nt_prop["zid"])]
-    nt_prop["orientation"]=[get_helix_angle(part, helix_id, indices) for helix_id,indices in zip(nt_prop["vh"],nt_prop["zid"])]
+    nt_prop["orientation"]=[get_helix_angle(part, helix_id, int(float(indices))) for helix_id,indices in vhzid]
    nt_prop=nt_prop.fillna(-1)
-    for i in range(int(len(vhzid)/2)):
+    for i in range(int(len(nt_prop.index)/2)):
        try:
            bp1,bp2=(i,1+i+vhzid[i+1:].index(vhzid[i]))
            nt_prop["bp"][bp1]=bp2
            nt_prop["bp"][bp2]=bp1
        except:
            pass
-    tprime_list=-np.ones(len(nt_prop.index),dtype=int)
+    if seq_file is not None:
-    index2=list(zip(vhzid,nt_prop["is_scaf"]))
+        #under construction
-    for i in range(len(nt_prop.index)):
+        seq=open(seq_file,"r")
-        ((m,n),p)=list(zip(vhzid,nt_prop["is_scaf"]))[i]
+        for seqline in seq:
-        if p==True:
+            try:
-            k,l=(vslist["scaf"][m])[n][2:]
+                int(seqline[0])
-            if k!=-1 and l!=-1:
+            except:
-                n=index2.index(((k,l),True))
+                continue
-                tprime_list[i]=int(n)
-        else:
-            k,l=(vslist["stap"][m])[n][2:]
-            if k!=-1 and l!=-1:
-                n=index2.index(((k,l),False))
-                tprime_list[i]=int(n)
-    nt_prop["threeprime"]=tprime_list
-    (n,)=np.where(nt_prop["threeprime"]==-1)
-    stackid=nt_prop["bp"][[list(nt_prop["threeprime"]).index(i) for i in n]]
-    nt_prop["stack"][stackid.index[np.where(np.array(stackid)!=-1)]]=nt_prop["threeprime"][stackid.index[np.where(np.array(stackid)!=-1)]]
-    ## Todo: sequence    
    return nt_prop
 def mrdna_model_from_cadnano(json_file,seq=None,**model_parameters):
-    nt_prop=gen_prop_table(json_file)
+    json_data=read_json_file(json_file)
+    part,decoder=cadnano_parts(json_data)
+    if decoder==2:
+        nt_prop=gen_prop_table_cad2(json_data,part)
+    elif decoder==3:
+        nt_prop=gen_prop_table_cad3(json_data,part)
    if seq is None:
        if nt_prop["seq"][0]==-1:
            seq=None

--- a/mrdna/readers/segmentmodel_from_cadnano_old.py
+++ b/mrdna/readers/segmentmodel_from_cadnano_old.py
+# -*- coding: utf-8 -*-
+import pdb
+import numpy as np
+import os,sys
+from glob import glob
+import re
+import pandas as pd
+pd.options.mode.chained_assignment = None  # default='warn'
+from .segmentmodel_from_lists import model_from_basepair_stack_3prime
+from ..arbdmodel.coords import readArbdCoords, readAvgArbdCoords, rotationAboutAxis
+from ..segmentmodel import SegmentModel, SingleStrandedSegment, DoubleStrandedSegment
+from ..model.dna_sequence import m13 as m13seq
+import json
+import re
+## TODO: separate SegmentModel from ArbdModel so multiple parts can be combined
+## TODO: catch circular strands in "get_5prime" cadnano calls
+## TODO: handle special motifs
+##   - doubly-nicked helices
+##   - helices that should be stacked across an empty region (crossovers from and end in the helix to another end in the helix)
+##   - circular constructs
+def read_json_file(filename):
+    try:
+        with open(filename) as ch:
+            json_data = json.load(ch)
+    except:
+        with open(filename) as ch:
+            content = ""
+            for l in ch:
+                l = re.sub(r"'", r'"', l)
+                # https://stackoverflow.com/questions/4033633/handling-lazy-json-in-python-expecting-property-name
+                # l = re.sub(r"{\s*(\w)", r'{"\1', l)
+                # l = re.sub(r",\s*(\w)", r',"\1', l)
+                # l = re.sub(r"(\w):", r'\1":', l)
+                content += l+"\n"
+            json_data = json.loads(content)
+    return json_data
+def get_lattice(part):
+    lattice_type = None
+    _gt = part.getGridType()
+    try:
+        lattice_type = _gt.name.lower()
+    except:
+        if _gt == 1:
+            lattice_type = 'square'
+        elif _gt == 2:
+            lattice_type = 'honeycomb'
+        else:
+            print("WARNING: unable to determine cadnano part lattice type")
+    return lattice_type
+def cadnano_parts(json_data):
+    import cadnano
+    from cadnano.document import Document
+    try:
+        doc = Document()
+        cadnano.fileio.v3decode.decode(doc, json_data)
+        decoder = 3
+    except:
+        doc = Document()
+        cadnano.fileio.v2decode.decode(doc, json_data)
+        decoder = 2
+    parts = [p for p in doc.getParts()]
+    if len(parts) != 1:
+        raise Exception("Only documents containing a single cadnano part are implemented at this time.")
+    part = parts[0]
+    if decoder == 2:
+        """ It seems cadnano2.5 (as of ce6ff019) does not set the EulerZ for square lattice structures correctly, doing so here """
+        l = get_lattice(part)
+        if l == 'square':
+            for id_num in part.getIdNums():
+                if part.vh_properties.loc[id_num,'eulerZ'] == 0:
+                    part.vh_properties.loc[id_num,'eulerZ'] = 360*(6/10.5)
+    return part,decoder
+def get_helix_angle(part, helix_id, indices):
+    """ Get "start_orientation" for helix """
+        # import ipdb
+        # ipdb.set_trace()
+    """ FROM CADNANO2.5
+    + angle is CCW
+    - angle is CW
+    Right handed DNA rotates clockwise from 5' to 3'
+    we use the convention the 5' end starts at 0 degrees
+    and it's pair is minor_groove_angle degrees away
+    direction, hence the minus signs.  eulerZ
+    """
+    hp, bpr, tpr, eulerZ, mgroove = part.vh_properties.loc[helix_id,
+                                                                    ['helical_pitch',
+                                                                     'bases_per_repeat',
+                                                                     'turns_per_repeat',
+                                                                     'eulerZ',
+                                                                     'minor_groove_angle']]
+    twist_per_base = tpr*360./bpr
+        # angle = eulerZ - twist_per_base*indices + 0.5*mgroove + 180
+    angle = eulerZ + twist_per_base*indices - 0.5*mgroove
+    return rotationAboutAxis(np.array((0,0,1)),angle)
+def nttype_v2(scafs):
+    def judge(i):
+        if i ==[-1,-1,-1,-1]:
+            return 0
+        else: return 1
+    n=np.array([judge(i) for i in scafs])
+    return n
+def gen_prop_table_cad2(json_data,part):
+    df=pd.DataFrame(json_data["vstrands"])
+    vslist=df.set_index("num")
+    props = part.getModelProperties().copy()
+    try:
+        if props.get('point_type') == PointType.ARBITRARY:
+            # TODO add code to encode Parts with ARBITRARY point configurations
+            raise NotImplementedError("Not implemented")
+    except:
+        try:
+            vh_props, origins = part.helixPropertiesAndOrigins()
+        except:
+            origins = {hid:part.getVirtualHelixOrigin(hid)[:2] for hid in part.getidNums()}
+    scaf_id=[nttype_v2(vslist['scaf'][i]) for i in vslist.index]
+    stap_id=[nttype_v2(vslist['stap'][i]) for i in vslist.index]
+    cad_bps=part.getIndices(0)
+    vslist["scafnt"]=np.sum(np.array(scaf_id),axis=1)
+    vslist["stapnt"]=np.sum(np.array(stap_id),axis=1)
+    totnt=np.sum(vslist["scafnt"])+np.sum(vslist["stapnt"])
+    is_scaf=np.zeros(totnt,dtype=bool)
+    is_scaf[0:np.sum(vslist["scafnt"])]=1
+    nt_prop=pd.DataFrame(index=range(totnt),columns=["vh","zid","is_scaf","r","bp","stack","threeprime","seq","orientation"])
+    nt_prop["is_scaf"]=is_scaf
+    tot_id=scaf_id+stap_id
+    vhi,zidi=np.where(np.array(scaf_id)==1)
+    vhj,zidj=np.where(np.array(stap_id)==1)
+    vhi=vslist.index[vhi]
+    vhj=vslist.index[vhj]
+    nt_prop["vh"]=list(vhi)+list(vhj)
+    nt_prop["zid"]=list(zidi)+list(zidj)
+    vhzid=list(zip(nt_prop["vh"],nt_prop["zid"]))
+    nt_prop["r"]=[part.getCoordinate(helix_id, indices) for helix_id, indices in zip(nt_prop["vh"],nt_prop["zid"])]
+    nt_prop["orientation"]=[get_helix_angle(part, helix_id, indices) for helix_id,indices in zip(nt_prop["vh"],nt_prop["zid"])]
+    nt_prop=nt_prop.fillna(-1)
+    for i in range(int(len(vhzid)/2)):
+        try:
+            bp1,bp2=(i,1+i+vhzid[i+1:].index(vhzid[i]))
+            nt_prop["bp"][bp1]=bp2
+            nt_prop["bp"][bp2]=bp1
+        except:
+            pass
+    tprime_list=-np.ones(len(nt_prop.index),dtype=int)
+    index2=list(zip(vhzid,nt_prop["is_scaf"]))
+    for i in range(len(nt_prop.index)):
+        ((m,n),p)=list(zip(vhzid,nt_prop["is_scaf"]))[i]
+        if p==True:
+            k,l=(vslist["scaf"][m])[n][2:]
+            if k!=-1 and l!=-1:
+                n=index2.index(((k,l),True))
+                tprime_list[i]=int(n)
+        else:
+            k,l=(vslist["stap"][m])[n][2:]
+            if k!=-1 and l!=-1:
+                n=index2.index(((k,l),False))
+                tprime_list[i]=int(n)
+    nt_prop["threeprime"]=tprime_list
+    #(n,)=np.where(nt_prop["threeprime"]==-1)
+    #stackid=nt_prop["bp"][[list(nt_prop["threeprime"]).index(i) for i in n]]
+    index3=dict(zip(nt_prop.index,zip(nt_prop["vh"],nt_prop["is_scaf"])))
+    index3[-1]=(-1,-1)
+    nt_prop["stack"]=np.where([index3[i]==index3[nt_prop["threeprime"][i]] for i in (nt_prop.index)],nt_prop["threeprime"],-1)  
+    return nt_prop
+def mrdna_model_from_cadnano(json_file,seq=None,**model_parameters):
+    json_data=read_json_file(json_file)
+    part,decoder=cadnano_parts(json_data)
+    if decoder==2:
+        nt_prop=gen_prop_table_cad2(json_data,part)
+    elif decoder==3:
+        nt_prop=gen_prop_table_cad3(json_data,part)
+    if seq is None:
+        if nt_prop["seq"][0]==-1:
+            seq=None
+        else:
+            seq=nt_prop["seq"]
+    r=np.array(list(nt_prop['r']))
+    bp=np.array(list(nt_prop['bp']))
+    three_prime=np.array((list(nt_prop["threeprime"])))
+    stack=np.array((list(nt_prop["stack"])))
+    orientation=np.array(list(nt_prop["orientation"]))
+    model = model_from_basepair_stack_3prime( r, bp, stack, three_prime, seq, orientation=None, **model_parameters )
+    return model