pinyili2
--- a/mrdna/readers/segmentmodel_from_cadnano.py 0 → 100644

+ 338

− 0
+++ b/mrdna/readers/segmentmodel_from_cadnano.py 0 → 100644

+ 338

− 0
+# -*- coding: utf-8 -*-
+import numpy as np
+import os,sys
+from glob import glob
+import re
+import pandas as pd
+pd.options.mode.chained_assignment = None  # default='warn'
+from .segmentmodel_from_lists import model_from_basepair_stack_3prime
+
+from ..arbdmodel.coords import readArbdCoords, readAvgArbdCoords, rotationAboutAxis
+from ..segmentmodel import SegmentModel, SingleStrandedSegment, DoubleStrandedSegment
+from ..model.dna_sequence import m13 as m13seq
+from .. import logger, devlogger
+import json
+import re
+import pdb
+
+
+
+## TODO: separate SegmentModel from ArbdModel so multiple parts can be combined
+## TODO: catch circular strands in "get_5prime" cadnano calls
+## TODO: handle special motifs
+##   - doubly-nicked helices
+##   - helices that should be stacked across an empty region (crossovers from and end in the helix to another end in the helix)
+##   - circular constructs
+
+
+def get_lattice(part):
+    lattice_type = None
+    _gt = part.getGridType()
+    try:
+        lattice_type = _gt.name.lower()
+    except:
+        if _gt == 1:
+            lattice_type = 'square'
+        elif _gt == 2:
+            lattice_type = 'honeycomb'
+        else:
+            print(lattice_type)
+    return lattice_type
+
+
+def read_json_file(filename):
+    import cadnano
+    from cadnano.document import Document
+
+    try:
+        with open(filename) as ch:
+            json_data = json.load(ch)
+    except:
+        with open(filename) as ch:
+            content = ""
+            for l in ch:
+                l = re.sub(r"'", r'"', l)
+                # https://stackoverflow.com/questions/4033633/handling-lazy-json-in-python-expecting-property-name
+                # l = re.sub(r"{\s*(\w)", r'{"\1', l)
+                # l = re.sub(r",\s*(\w)", r',"\1', l)
+                # l = re.sub(r"(\w):", r'\1":', l)
+                content += l+"\n"
+            json_data = json.loads(content)
+
+    try:
+        doc = Document()
+        cadnano.fileio.v3decode.decode(doc, json_data)
+        decoder = 3
+    except:
+        doc = Document()
+        cadnano.fileio.v2decode.decode(doc, json_data)
+        decoder = 2
+
+    parts = [p for p in doc.getParts()]
+    if len(parts) != 1:
+        raise Exception("Only documents containing a single cadnano part are implemented at this time.")
+    part = parts[0]
+
+    if decoder == 2:
+        """ It seems cadnano2.5 (as of ce6ff019) does not set the EulerZ for square lattice structures correctly, doing so here """
+        l = get_lattice(part)
+        if l == 'square':
+            for id_num in part.getIdNums():
+                if part.vh_properties.loc[id_num,'eulerZ'] == 0:
+                    part.vh_properties.loc[id_num,'eulerZ'] = 360*(6/10.5)
+        df=pd.DataFrame(json_data["vstrands"])
+        n_df=df.set_index("num")
+    return part
+
+def get_helix_angle(part, helix_id, indices, fwd=True):
+    """ Get "start_orientation" for helix """
+        # import ipdb
+        # ipdb.set_trace()
+
+    """ FROM CADNANO2.5
+    + angle is CCW
+    - angle is CW
+    Right handed DNA rotates clockwise from 5' to 3'
+    we use the convention the 5' end starts at 0 degrees
+    and it's pair is minor_groove_angle degrees away
+    direction, hence the minus signs.  eulerZ
+    """
+
+    hp, bpr, tpr, eulerZ, mgroove = part.vh_properties.loc[helix_id,
+                                                                    ['helical_pitch',
+                                                                     'bases_per_repeat',
+                                                                     'turns_per_repeat',
+                                                                     'eulerZ',
+                                                                     'minor_groove_angle']]
+    twist_per_base = tpr*360./bpr
+        # angle = eulerZ - twist_per_base*indices + 0.5*mgroove + 180
+    angle = eulerZ - twist_per_base*indices - 0.5*mgroove
+    R = rotationAboutAxis(np.array((0,0,1)),angle)
+    if fwd: R = R.dot(rotationAboutAxis((1,0,0),180))
+    return R
+
+def gen_id_series(strand, part, insertion_dict):
+    vh = strand._id_num
+    df=pd.DataFrame(columns=["vh","zid","fwd","stack_tuple","threeprime_tuple","x","y","z"],index=range(strand.totalLength()),dtype=object)
+    df["vh"]=vh
+    df["fwd"]=strand.isForward()
+    df["x"]=part.getVirtualHelixOrigin(strand._id_num)[0]*10
+    df["y"]=part.getVirtualHelixOrigin(strand._id_num)[1]*10
+    id_lo,id_hi=strand.idxs()
+    z=list(np.arange(id_lo,id_hi+1))
+    zids=[str(i) for i in range(id_lo,id_hi+1)]
+
+    insert_dict=dict([(j.idx(),j.length()) for j in strand.insertionsOnStrand()])
+    for insert_base in insert_dict:
+        z_ind=zids.index(str(insert_base))
+        z_val=insert_dict[insert_base]
+        z_pos_ind=z.index(insert_base)
+
+        if vh not in insertion_dict:
+            insertion_dict[vh] = dict()
+        insertion_dict[vh][insert_base] = z_val # record insertions for later
+
+        zids.pop(z_ind)
+        z.pop(z_pos_ind)
+        if z_val!=-1:
+            #l=[str(insert_base)+"."+str(i) for i in range(z_val+1)]
+            l=list(range(z_val+1))
+            l.reverse()
+            for k in l:                
+                zids.insert(z_ind,str(insert_base)+"."+str(k))
+                z.insert(z_pos_ind,insert_base+k/(z_val+1))
+    df["zid"] = zids
+    df["z"]   = -3.4 * np.linspace( id_lo+0.25, id_hi-0.25, len(df) ) # Note: this spreads
+                                                                      # insertions uniformly through
+                                                                      # the strand, not through
+                                                                      # helical region like original did
+
+    L=[(df["vh"][i],df["zid"][i],df["fwd"][i]) for i in df.index]
+    if strand.isForward()==True:
+        df["stack_tuple"]=L[1:]+[-1]
+        if strand.connection3p() is None:
+            df["threeprime_tuple"]=L[1:]+[-1]
+        else:
+            df["threeprime_tuple"]=L[1:]+[(strand.connection3p().idNum(),str(strand.connection3p().idx5Prime()),strand.connection3p().isForward())]
+        
+    
+    else:
+        df["stack_tuple"]=[-1]+L[0:-1]
+        if strand.connection3p() is None:
+            df["threeprime_tuple"]=[-1]+L[0:-1]
+        else:
+            df["threeprime_tuple"]=[(strand.connection3p().idNum(),str(strand.connection3p().idx5Prime()),strand.connection3p().isForward())]+L[0:-1]
+    ## cadnano 3.1 sequence assign is wrong if there is insertion or deletion. 
+    df["r"]=[np.array([df["x"][i],df["y"][i],df["z"][i]],dtype=np.float32) for i in df.index]
+    
+    return [pd.Series(df.loc[i]) for i in df.index]
+
+
+def gen_prop_table(part):
+    devlogger.info(f'  Getting strand sets')
+    strand_set=[]
+    id_series=[]
+    insertion_dict = dict()
+    for i in part.getidNums():
+        fwd,rev=part.getStrandSets(i)
+        [strand_set.append(i) for i in fwd.strands()]
+        [strand_set.append(i) for i in rev.strands()]
+    for i in strand_set:
+        id_series=id_series+gen_id_series(i,part, insertion_dict)
+
+    devlogger.info(f'  Getting sequences')
+    nt_prop=pd.DataFrame(id_series)
+    nt_prop.reset_index(inplace=True)
+    nt_prop["seq"]=-1
+    ind_tuple=list(zip(nt_prop["vh"],nt_prop["zid"],nt_prop["fwd"]))
+
+    devlogger.info(f'  Stacking strand ends')
+    not_stacked,=np.where(nt_prop["stack_tuple"]==-1)
+
+    def _get_next(vh, zid, fwd):
+        nzid = zid + fwd*2-1
+        try:
+            insertions = insertion_dict[vh][nzid]
+        except:
+            insertions = 0
+        if insertions < 0:   return _get_next(vh, nzid, fwd)
+        elif insertions > 0: return f'{nzid}.0'
+        return str(nzid)
+
+    for i in not_stacked:
+        vh,zid,fwd = [nt_prop.loc[i][key] for key in ('vh','zid','fwd')]
+        devlogger.debug(f' Checking missing stack at {(vh,zid,fwd)}')
+        zid= _get_next(vh, int(zid.split('.')[0]), fwd)
+        try:
+            _tup = (vh,zid,fwd)
+            ind_tuple.index(_tup) # continue if _tup not in ind_tuple
+            nt_prop["stack_tuple"][i] = _tup
+            devlogger.debug(f'  Found missing stack {_tup}')
+        except:
+            continue
+
+    devlogger.info(f'  Building stacks')
+    stacks=[]
+    for i in list(nt_prop["stack_tuple"]):
+        if i ==-1:
+            stacks.append(i)
+        else:
+            stacks.append(ind_tuple.index(i))
+    nt_prop["stack"]=stacks
+    devlogger.info(f'  Building 3prime')
+    tprime=[]
+    for i in list(nt_prop["threeprime_tuple"]):
+        if i ==-1:
+            tprime.append(i)
+        else:
+            tprime.append(ind_tuple.index(i))
+    nt_prop["threeprime"]=tprime
+
+    devlogger.info(f'  Finding orientations')
+    # nt_prop["orientation"]=[get_helix_angle(part, helix_id, int(float(indices)), _fwd)
+    #  for helix_id,indices,_fwd in zip(nt_prop["vh"],nt_prop["zid"],nt_prop['fwd'])]
+    nt_prop["orientation"]=[get_helix_angle(part, helix_id, int(float(indices)), _fwd)
+                            for helix_id,indices,_fwd in zip(nt_prop["vh"],nt_prop["z"]/3.4,nt_prop['fwd'])]
+    nt_prop=nt_prop.fillna(-1)
+    counter=-1
+    devlogger.info(f'  Building bp and bp_map orientations')
+    bp=-np.ones(len(nt_prop.index),dtype=int)
+    bp_map=dict(zip(ind_tuple,nt_prop.index))
+    for i,j,k in ind_tuple:
+        counter+=1
+        try:
+            bp[counter]=bp_map[(i,j,not(k))]
+        except:
+            pass
+    nt_prop["bp"]=bp
+
+    return nt_prop
+
+def mrdna_model_from_cadnano(json_file,seq=None,**model_parameters):
+    devlogger.info(f'Reading {json_file}')
+    part=read_json_file(json_file)
+    
+    devlogger.info(f'Generating property table')
+    nt_prop=gen_prop_table(part)
+
+    devlogger.info(f'Building arrays')
+    if seq is None:
+        if nt_prop["seq"][0]==-1:
+            seq=None
+        else:
+            seq=nt_prop["seq"]
+    
+    r=np.array(list(nt_prop['r']))
+    bp=np.array(list(nt_prop['bp']))
+    three_prime=np.array((list(nt_prop["threeprime"])))
+    stack=np.array((list(nt_prop["stack"])))
+    orientation=np.array(list(nt_prop["orientation"]))
+    rank = -np.array(list(nt_prop["z"])) # arrange segments in usual way
+    _vh = np.array(list(nt_prop["vh"]))
+
+    devlogger.info(f'Building model from lists')
+    model = model_from_basepair_stack_3prime( r, bp, stack, three_prime, seq, orientation,
+                                              rank = rank, explicitly_unstacked = np.argwhere(stack == -1),
+                                              **model_parameters )
+
+    devlogger.info(f'Fixing segment names')
+    hmap = model._reader_list_hmap
+    hrank = model._reader_list_hrank
+    fwd = model._reader_list_fwd
+    _starts = dict()
+    for seg in model.segments:
+        hid = int(seg._helix_idx)
+        _tmp = fwd & (hmap == hid) # select forward nts in helix
+        _list_ids = np.argwhere(_tmp).flatten()
+        _list_ids = _list_ids[np.argsort( nt_prop['zid'][_list_ids] )]
+
+        start_idx, end_idx = (_list_ids[0],_list_ids[-1])
+        for _a in (start_idx, end_idx): assert( _a.size == 1 )
+        start = max(map(float, [nt_prop['zid'][i] for i in (start_idx,end_idx)] ))
+
+        cadnano_helix = int(nt_prop['vh'][start_idx])
+        if cadnano_helix not in _starts: _starts[cadnano_helix] = []
+        _starts[cadnano_helix].append( (seg,start) )
+
+        ## Assign beta and occupancy
+        seg._cadnano_bp_to_zidx = np.array( np.floor(nt_prop['zid'][_list_ids].astype(float)) )
+
+        seg.occupancy = cadnano_helix
+        def callback(segment):
+            for b in segment.beads:
+                bp = int(round(b.get_nt_position(segment)))
+                if bp < 0: bp = 0
+                if bp >= segment.num_nt: bp = segment.num_nt-1
+                try:
+                    b.beta = segment._cadnano_bp_to_zidx[bp]
+                    if 'orientation_bead' in b.__dict__:
+                        b.orientation_bead.beta = segment._cadnano_bp_to_zidx[bp]
+                except:
+                    pass
+        seg._generate_bead_callbacks.append(callback)
+
+        def atomic_callback(nucleotide, bp_to_zidx=seg._cadnano_bp_to_zidx):
+            nt = nucleotide
+            segment = nucleotide.parent.segment
+            bp = int(round(segment.contour_to_nt_pos( nt.contour_position )))
+            if bp < 0: bp = 0
+            if bp >= segment.num_nt: bp = segment.num_nt-1
+            try:
+                nt.beta = segment.bp_to_zidx[bp]
+                nt.parent.occupancy = segment.occupancy
+            except:
+                pass
+        seg._generate_nucleotide_callbacks.append(atomic_callback)
+
+
+    for vh,ss in _starts.items():
+        devlogger.debug(f'Renaming segs in {vh}')
+        for i,(seg,s) in enumerate(sorted(ss, key=lambda x: x[-1])):
+            devlogger.debug(f'  seg {i}: {seg} {s} {seg.end3.connection}')
+            _newname = f'{vh:d}-{i:d}'
+            devlogger.debug(f'  {seg} -> {_newname}')
+            seg.name = _newname
+            seg.segname = seg.name
+
+    model._dataframe=nt_prop
+    return model