pinyili2 · 45445532 · eab065f0 · bf407268 · ee4032a6 · 6f01c8d2
--- a/mrdna/readers/segmentmodel_from_oxdna.py 0 → 100644

+ 225

− 0
+++ b/mrdna/readers/segmentmodel_from_oxdna.py 0 → 100644

+ 225

− 0
+from mrdna import logger, devlogger
+from .segmentmodel_from_lists import model_from_basepair_stack_3prime
+from ..arbdmodel.coords import rotationAboutAxis
+import pickle
+import pandas as pd
+import sys
+import mrdna.readers.libs as libs
+import numpy as np
+from scipy.spatial import distance_matrix
+pd.options.mode.chained_assignment = None  # default='warn'
+
+_seq_to_int_dict = dict(A=0,T=1,C=2,G=3)
+_seq_to_int_dict = {k:str(v) for k,v in _seq_to_int_dict.items()}
+
+_yrot = rotationAboutAxis(axis=(0,1,0), angle=180).dot(rotationAboutAxis(axis=(0,0,1),angle=-40))
+
+def mrdna_model_from_oxdna(coordinate_file, topology_file,virt2nuc=None,get_nt_prop=False, **model_parameters):
+    """ Construct an mrdna model from oxDNA coordinate and topology files """
+    top_data = np.loadtxt(topology_file, skiprows=1,
+                          unpack=True,
+                          dtype=np.dtype('i4,U1,i4,i4')
+                          )
+    conf_data = np.loadtxt(coordinate_file, skiprows=3)
+    def _get_bp(sequence=None):
+        dists = distance_matrix(r,basepair_pos) + np.eye(len(r))*1000
+        dists = 0.5*(dists + dists.T)
+        
+        bp = np.array([np.argmin(da) for da in dists])
+
+        for i,j in enumerate(bp):
+            if j == -1: continue
+            # devlogger.info(f'bp {i} {j} {dists[i,j]}')
+            if dists[i,j] > 2:
+                bp[i] = -1
+            elif bp[j] != i:
+                bpj = bp[j]
+                logger.warning( " ".join([str(_x) for _x in ["Bad pair", i, j, bp[i], bp[j], dists[i,j], dists[j,i], dists[bpj,j], dists[j,bpj]]]) )
+
+        for i,j in enumerate(bp):
+            if j == -1: continue
+            if bp[j] != i:
+                bpj = bp[j]
+                logger.warning( " ".join([str(_x) for _x in ["Bad pair2", i, j, bp[i], bp[j], dists[i,j], dists[j,i], dists[bpj,j], dists[j,bpj]]]) )
+                raise Exception
+
+        if sequence is not None:
+            seq = sequence
+            bp_seq = sequence[bp]
+            bp_seq[bp==-1] = 'X'
+            bad_bps = np.where( (bp >= 0) &
+                                (((seq == 'C') & (bp_seq != 'G')) |
+                                 ((seq == 'G') & (bp_seq != 'C')) |
+                                 ((seq == 'T') & (bp_seq != 'A')) | 
+                                 ((seq == 'U') & (bp_seq != 'A')) |
+                                 ((seq == 'A') & ((bp_seq != 'T') | (bp_seq != 'U')))
+                                 ) )[0]
+            bp[bp[bad_bps]] = -1
+            bp[bad_bps] = -1
+            
+        return bp
+
+    def _get_stack():
+        dists = distance_matrix( r + 3.5*normal_dir + 2.1*perp_dir -1*base_dir, r ) + np.eye(len(r))*1000
+        stack = np.array([np.argmin(da) for da in dists])
+        for i,j in enumerate(stack):
+            if dists[i,j] > 8:
+                stack[i] = -1
+            elif i < 10:
+                ## development info
+                # devlogger.info([i,j,dists[i,j]])
+                # dr = r[j] - (r[i] - normal_dir[i]*3.4 + perp_dir[i]*1 + base_dir[i]*1)
+                dr = r[j] - r[i]
+                # devlogger.info([normal_dir[i].dot(dr), perp_dir[i].dot(dr), base_dir[i].dot(dr)])
+        return np.array(stack)
+    
+    def _find_vh_vb_table(s,is_scaf):
+        L=[]
+        for i in list(s.keys()):
+            vh,zid=i
+            strand,indices=s[i]
+            if len(indices)==0:
+                continue
+            else:
+                if len(indices)==1:
+                    zids=[str(zid)]
+                else:
+                    zids=[str(zid)+"."+str(j) for j in range(len(indices))]
+                for index,z in zip(indices,zids):
+                    L.append(pd.Series({"index":index,"vh":vh,"zid":z,"strand":strand,"is_scaf":bool(is_scaf)}))
+        return L
+    def get_virt2nuc(virt2nuc,top_data):
+        sys.modules["libs"]=libs
+        logger.info("""You are using a feature that depends on the tacoxdna library: A. Suma et al., 2019, "TacoxDNA: A user‐friendly web server for simulations of complex DNA structures, from single strands to origami", J. Comput. Chem. 40, 2586""")
+        virt_pickle=open(virt2nuc,"rb")
+        vh_vb,pattern=pickle.load(virt_pickle)
+        L1=_find_vh_vb_table(vh_vb._scaf,1)
+        L2=_find_vh_vb_table(vh_vb._stap,0)
+        nt_prop=pd.DataFrame(L1+L2,dtype=object)
+        nt_prop["index"]=nt_prop["index"].astype(int)
+        nt_prop.set_index("index",inplace=True)
+        nt_prop.sort_index(inplace=True)
+        nt_prop["threeprime"]=top_data[2]
+        nt_prop["seq"]=top_data[1]
+        nt_prop["stack"]=top_data[2]
+        vh_bool=1-(nt_prop["vh"]%2)*2
+        is_scaf_bool=nt_prop["is_scaf"]*2-1
+        nt_prop["fwd"]=np.array((is_scaf_bool.T*vh_bool+1)/2,dtype=bool)
+        bp_map=dict(zip(zip(nt_prop["vh"],nt_prop["zid"],nt_prop["fwd"]),nt_prop.index))
+
+        for i in nt_prop.index:
+            if nt_prop.loc[i]["threeprime"] in nt_prop.index:
+                if nt_prop.loc[nt_prop.loc[i]["threeprime"]]["vh"]!=nt_prop.loc[i]["vh"]:   
+                    nt_prop["stack"][i]=-1
+        
+        bp=-np.ones(len(nt_prop.index),dtype=int)
+        counter=0
+        for i,j,k in zip(nt_prop["vh"],nt_prop["zid"],nt_prop["fwd"]):
+            try:
+                bp[counter]=bp_map[(i,j,not(k))]
+            except:
+                pass
+            counter+=1
+        nt_prop["bp"]=bp
+        
+        non_stack_ind,=np.where(nt_prop["stack"]==-1)
+        for i in non_stack_ind:
+            zid=int(nt_prop.loc[i]["zid"])+int(nt_prop.loc[i]["fwd"])*2-1
+            try:
+                nt_prop["stack"][i]=bp_map[(nt_prop.loc[i]["vh"],str(zid),nt_prop.loc[i]["fwd"])]
+            except:
+                continue
+
+        return nt_prop
+    try:
+        nt_prop=get_virt2nuc(virt2nuc,top_data)
+
+        r=conf_data[:,:3] * 8.518
+        base_dir = conf_data[:,3:6]
+        # basepair_pos = r + base_dir*6.0
+        basepair_pos = r + base_dir*10.0
+        normal_dir = -conf_data[:,6:9]
+        perp_dir = np.cross(base_dir, normal_dir)
+        orientation = np.array([np.array(o).T.dot(_yrot) for o in zip(perp_dir,-base_dir,-normal_dir)])
+        seq=nt_prop["seq"]
+        bp=nt_prop["bp"]
+        stack=nt_prop["stack"]
+        three_prime=nt_prop["threeprime"]
+        nt_prop["orientation"]=orientation
+        nt_prop["r"]=list(r)
+    except:
+        if virt2nuc is not None:
+            logger.info("warning: virt2nuc not read. guessing structure...")
+        ## Reverse direction so indices run 5'-to-3'
+        top_data = [a[::-1] for a in top_data]
+        conf_data = conf_data[::-1,:]
+    
+        r = conf_data[:,:3] * 8.518
+        base_dir = conf_data[:,3:6]
+        # basepair_pos = r + base_dir*6.0
+        basepair_pos = r + base_dir*10.0
+        normal_dir = -conf_data[:,6:9]
+        perp_dir = np.cross(base_dir, normal_dir)
+        orientation = np.array([np.array(o).T.dot(_yrot) for o in zip(perp_dir,-base_dir,-normal_dir)])
+        seq = top_data[1]
+        bp = _get_bp(seq)
+        stack = _get_stack()
+      
+        three_prime = len(r) - top_data[2] -1
+        five_prime = len(r) - top_data[3] -1
+        three_prime[three_prime >= len(r)] = -1
+        five_prime[five_prime >= len(r)] = -1
+        nt_prop=pd.DataFrame({"r":list(r),"bp":list(bp),"stack":list(stack),"threeprime":list(three_prime), "seq":list(seq),"orientation":list(orientation)})
+
+    def _debug_write_bonds():
+        from ..arbdmodel import ParticleType, PointParticle, ArbdModel, Group
+        bond = tuple()
+        b_t = ParticleType('BASE')
+        p_t = ParticleType('PHOS')
+
+        parts = []
+        for i,(r0,r_bp,three_prime0,bp0,stack0,seq0) in enumerate(zip(r,basepair_pos, three_prime, bp, stack, seq)):
+            p = PointParticle(p_t, name='PHOS', position = r0, resid=i)
+            b = PointParticle(b_t, name=seq0, position = 0.5*(r0+r_bp), resid=i)
+            parts.extend((p,b))
+
+        model = ArbdModel(parts)
+        model.writePdb('test.pdb')
+
+        for i,(r0,r_bp,three_prime0,bp0,stack0) in enumerate(zip(r,basepair_pos, three_prime, bp, stack)):
+            model.add_bond(parts[2*i],parts[2*i+1],bond)
+            j = three_prime0
+            if j >= 0:
+                model.add_bond(parts[2*i],parts[2*j],bond)
+            j = bp0
+            if j >= 0:
+                model.add_bond(parts[2*i+1],parts[2*j+1],bond)
+        model.writePsf('test.psf')
+
+        model.bonds = []
+        for i,(r0,r_bp,three_prime0,bp0,stack0) in enumerate(zip(r,basepair_pos, three_prime, bp, stack)):
+            j = stack0
+            if j >= 0:
+                model.add_bond(parts[2*i],parts[2*j],bond)
+        model.writePsf('test.stack.psf')
+    ## _debug_write_bonds()
+
+    logger.info(f'mrdna_model_from_oxdna: num_bp, num_ss_nt, num_stacked: {np.sum(bp>=0)//2} {np.sum(bp<0)} {np.sum(stack >= 0)}')
+
+    model = model_from_basepair_stack_3prime( r, bp, stack, three_prime, seq, orientation, **model_parameters )
+     
+    """
+    model.DEBUG = True
+    model.generate_bead_model(1,1,False,True,one_bead_per_monomer=True)
+    for seg in model.segments:
+        for bead in seg.beads:
+            bead.position = bead.position + np.random.standard_normal(3)
+    
+    simulate( model, output_name='test', directory='test4' )
+    """
+    model._dataframe=nt_prop
+    return model
+
+if __name__ == "__main__":
+    mrdna_model_from_oxdna("0-from-collab/nanopore.oxdna","0-from-collab/nanopore.top")
+    # mrdna_model_from_oxdna("2-oxdna.manual/output/from_mrdna-oxdna-min.last.conf","0-from-collab/nanopore.top")