Added script for building model from pdb

56790814 · cmaffeo2 · e138ec52 · 56790814
Commit 56790814 authored 6 years ago by cmaffeo2
--- a/segmentmodel_from_pdb.py
+++ b/segmentmodel_from_pdb.py
+# -*- coding: utf-8 -*-
+
+import pdb
+import numpy as np
+import os,sys
+from glob import glob
+import re
+
+import MDAnalysis as mda
+import MDAnalysis.analysis.align as align
+from MDAnalysis.analysis.distances import distance_array
+
+from segmentmodel import SegmentModel, SingleStrandedSegment, DoubleStrandedSegment
+
+arbd="/home/cmaffeo2/development/cuda/arbd.dbg/src/arbd" # reduced the mem footprint cause vmd
+namd="/home/cmaffeo2/development/namd-bin/NAMD_Git-2017-07-06_Linux-x86_64-multicore-CUDA/namd2"
+
+def readSequenceFile(sequenceFile='resources/cadnano2pdb.seq.m13mp18.dat'):
+    seq = []
+    with open(sequenceFile) as ch:
+        for l in ch:
+            l = l.strip().replace(" ", "")
+            if l[0] in (";","#"): continue
+            seq.extend([c.upper() for c in l])
+    return seq
+
+m13seq = readSequenceFile(sequenceFile='resources/cadnano2pdb.seq.m13mp18.dat')
+
+resnames = dict(A='ADE DA DA5 DA3', T='THY DT DT5 DT3',
+                C='CYT DC DC5 DC3', G='GUA DG DG5 DG3')
+bp_bond_atoms = dict( A='N1', T='H3',
+                      C='N3', G='H1' )
+bases = resnames.keys()
+resname_to_key = {n:k for k,v in resnames.items() for n in v.split()}
+complement = dict(A='T', C='G', T='A', G='C')
+
+refUnis = {b:mda.Universe("resources/generate-nts/1-w3dna/{}{}.pdb".format(b,complement[b]))
+           for b in bases}
+
+stack_text = "not name O5' P O1P O2P OP1 OP2 O3' H*"
+ref_name_ids_inv = dict()
+for key,val in refUnis.items():
+    ref_res = val.residues[0]
+    names = ref_res.atoms.select_atoms( stack_text ).atoms.names
+    ref_name_ids_inv[key] = np.argsort(np.argsort(names))
+
+    
+
+## TODO: possibly improve ref_stack_position
+# avg_stack_position = [ru.residues[0].atoms.select_atoms(stack_text).center_of_mass()
+#                        for k,ru in refUnis.items()]
+# avg_stack_position = np.mean(avg_stack_position,axis=0)
+# print(avg_stack_position)
+
+## Obtained by apply stack transform to base
+ref_stack_position = np.array((-2.41851735, -0.259761333, 3.39999978))
+ref_bp_position = np.array((0, -8.7434375, -0.326354))
+
+
+def residues_within( cutoff, coords1, coords2, sel1, sel2, universe, selection_text ):
+    distance_matrix = distance_array( coords1, coords2 )
+    if sel1 == sel2:
+        ## ignore diagonal
+        distance_matrix = distance_matrix + np.eye(len(distance_matrix))*10*cutoff
+    # arr_idxs = np.where((distance_matrix > 0) * (distance_matrix < cutoff))
+    arr_idxs = np.where((distance_matrix < cutoff))
+    arr_idx1,arr_idx2 = arr_idxs
+
+    atoms = universe.select_atoms(selection_text)
+    within = -np.ones( atoms.resindices[-1]+1, dtype=np.int )
+
+    if len(arr_idx1) == len(np.unique(arr_idx1)) and len(arr_idx2) == len(np.unique(arr_idx2)):
+        within[sel1.resindices[arr_idx1]] = sel2.resindices[arr_idx2]
+    else:
+        unique,ids,counts = np.unique(arr_idx1, return_index=True, return_counts=True)
+        res1 = sel1.resindices[unique[counts == 1]]
+        res2 = sel2.resindices[arr_idx2[ids[counts == 1]]]
+        within[res1] = res2
+        for i in np.where(counts > 1)[0]:
+            ids2 = arr_idx1 == unique[i]  
+            ij = np.argmin(distance_matrix[arr_idx1[ids2],arr_idx2[ids2]])
+            res1 = sel1.resindices[arr_idx1[ids2][ij]]
+            res2 = sel2.resindices[arr_idx2[ids2][ij]]
+            within[res1] = res2
+
+
+
+        # within[sel1.resindices[arr_idx1]] = sel2.resindices[arr_idx2]
+    return within
+
+def find_base_position_orientation( u, selection_text='all' ):
+    ## Find orientation and center of each nucleotide
+    transforms,centers = [[],[]]
+    for r in u.select_atoms(selection_text).residues:
+        key = resname_to_key[r.resname]
+        ref_res = refUnis[key].residues[0]
+        ref,sel = [res.atoms.select_atoms( stack_text ) for res in (ref_res,r)]
+        assert(len(ref) == len(sel))
+        
+        ## Find order for sel 
+        inv = ref_name_ids_inv[key]
+        # sel_order = inv[np.argsort(sel.atoms.names)]
+        sel_order = np.argsort(sel.atoms.names)[inv]
+        assert( (ref.atoms.names == sel.atoms.names[sel_order]).all() )
+
+        ## Reorder coordinates in sel so atom names match 
+        c = sel.atoms.center_of_mass()
+        R,rmsd = align.rotation_matrix(sel.positions[sel_order] - c,
+                                       ref.positions - ref.atoms.center_of_mass())                                
+        R = np.array(R)         # convert from mda matrix to np array
+        transforms.append(R)
+        centers.append(c)
+
+    centers = np.array(centers, dtype=np.float32)
+    transforms = np.array(transforms)
+    return centers,transforms
+
+def find_basepairs( u, centers, transforms, selection_text='all' ):
+    bonds = { b:"resname {} and name {}".format(resnames[b],bp_bond_atoms[b])
+              for b in bases }
+
+    sel1 = u.select_atoms("({}) and (({}) or ({}))".format(selection_text,
+                                                           bonds['A'],bonds['C'])) 
+    sel2 = u.select_atoms("({}) and (({}) or ({}))".format(selection_text,
+                                                           bonds['T'],bonds['G']))
+    
+    possible_basepairs = residues_within( cutoff = 2.2, 
+                                          coords1=sel1.positions,
+                                          coords2=sel2.positions,
+                                          sel1 = sel1, sel2 = sel2,
+                                          universe = u,
+                                          selection_text = selection_text )
+    ## Filter by expected position
+    ids = possible_basepairs >= 0
+    possible_resI = np.where( ids )[0]
+    possible_resJ = possible_basepairs[ ids ].astype(int) 
+    resI,resJ = [[],[]]
+    for i,j,R1,c1,c2 in zip(possible_resI,possible_resJ,
+                            transforms[possible_resI],
+                            centers[possible_resI],
+                            centers[possible_resJ]):
+        c2_expected = c1 + ref_bp_position.dot(R1)
+        if np.linalg.norm(c2_expected-c2) < 1:
+            resI.append(i)
+            resJ.append(j)        
+        
+    resI = np.array(resI, dtype=np.int)
+    resJ = np.array(resJ, dtype=np.int)
+
+    ## Add reciprocal basepairs
+    assert( (possible_basepairs[resJ]  == -1).all() )
+    basepairs = -np.ones(possible_basepairs.shape, dtype=np.int)
+    basepairs[resI] = resJ
+    basepairs[resJ] = resI
+
+    return basepairs
+
+def find_stacks( u, centers, transforms, selection_text='all' ):
+
+    ## Find orientation and center of each nucleotide
+    expected_stack_positions = []
+    for R,c in zip(transforms,centers):
+        expected_stack_positions.append( c + ref_stack_position.dot(R) )
+
+    expected_stack_positions = np.array(expected_stack_positions, dtype=np.float32)
+       
+    sel = u.select_atoms("({}) and name C1'".format( selection_text ))
+    stacks_above = residues_within( cutoff = 3.5,
+                                    coords1=centers,
+                                    coords2=expected_stack_positions,
+                                    sel1 = sel, sel2 = sel,
+                              universe = u, selection_text = selection_text )
+    return stacks_above
+
+def basepairs_and_stacks_to_helixmap(basepairs,stacks_above):
+
+    helixmap = -np.ones(basepairs.shape, dtype=np.int)
+    helixrank = -np.ones(basepairs.shape)
+    is_fwd = np.ones(basepairs.shape, dtype=np.int)
+    
+    ## Remove stacks with nts lacking a basepairs
+    nobp = np.where(basepairs < 0)[0]
+    stacks_above[nobp] = -1
+    stacks_with_nobp = np.in1d(stacks_above, nobp)
+    stacks_above[stacks_with_nobp] = -1
+
+    end_ids = np.where( (stacks_above < 0)*(basepairs >= 0) )[0]
+
+    hid = 0
+    for end in end_ids:
+        if helixmap[end] >= 0:
+            continue
+        rank = 0
+        nt = basepairs[end]
+        bp = basepairs[nt]
+        assert(helixmap[nt] == -1)
+        assert(helixmap[bp] == -1)
+        helixmap[nt] = helixmap[bp] = hid
+        helixrank[nt] = helixrank[bp] = rank
+        is_fwd[bp] = 0
+        rank +=1
+
+        while stacks_above[nt] >= 0:
+            nt = stacks_above[nt]
+            if basepairs[nt] < 0: break
+            bp = basepairs[nt]
+            assert(helixmap[nt] == -1)
+            assert(helixmap[bp] == -1)
+            helixmap[nt] = helixmap[bp] = hid
+            helixrank[nt] = helixrank[bp] = rank
+            is_fwd[bp] = 0
+            rank +=1
+        hid += 1
+    return helixmap, helixrank, is_fwd
+
+def get_crossovers(helixmap,helixrank,stacks_above):
+    ...
+    
+def update_helixmap_with_ssdna(helixmap,helixrank,stacks_above):
+    ...
+    ss_residues = hmap < 0
+    
+    helixmap
+
+def break_into_contiguous_groups(sequence):
+    from operator import itemgetter
+    from itertools import groupby
+    
+    return [map(itemgetter(1),g) 
+            for i,g in groupby(enumerate(sequence), lambda i,x:i-x)]
+
+
+def SegmentModelFromPdb(*args,**kwargs):
+    u = mda.Universe(*args,**kwargs)
+    for a in u.select_atoms("resname DT* and name C7").atoms:
+        a.name = "C5M"
+
+    ## Find basepairs and base stacks
+    centers,transforms = find_base_position_orientation(u)
+    bps = find_basepairs(u, centers, transforms)
+    stacks = find_stacks(u, centers, transforms)
+
+    ## Build map from residue index to helix index
+    hmap,hrank,fwd = basepairs_and_stacks_to_helixmap(bps,stacks)
+    double_stranded_helices = np.unique(hmap[hmap >= 0])
+
+    ## Add ssDNA to hmap
+    hid = double_stranded_helices[-1]+1
+    ss_residues = hmap < 0
+    assert( np.all(bps[ss_residues] == -1) )
+    for s in u.segments:
+        r = s.atoms.select_atoms("name C1'").resindices
+        ss_residues = r[np.where(hmap[r] < 0)[0]]
+        if len(ss_residues) == 0: continue
+        resid_diff = np.diff(ss_residues)
+        
+        tmp = np.where( resid_diff != 1 )[0]
+        first_res = ss_residues[0]
+        for i in tmp:
+            ids = np.arange(first_res, ss_residues[i]+1)
+            assert( np.all(hmap[ids] == -1) )
+            hmap[ids] = hid
+            hrank[ids] = ids-first_res
+            first_res = ss_residues[i+1]
+            hid += 1
+        ids = np.arange(first_res, ss_residues[-1]+1)
+        assert( np.all(hmap[ids] == -1) )
+        hmap[ids] = hid
+        hrank[ids] = ids-first_res
+        hid+=1
+    print(hmap)
+    single_stranded_helices = np.arange(double_stranded_helices[-1]+1,hid)
+
+    ## Create double-stranded segments
+    doubleSegments = []
+    for hid in double_stranded_helices:
+        maxrank = np.max( hrank[hmap==hid] )
+        start = np.where((hmap == hid) * (hrank == 0))[0]
+        end = np.where((hmap == hid) * (hrank == maxrank))[0]
+
+        start_pos = np.mean( [u.atoms.residues[r].atoms.center_of_mass() for r in start], axis=0 )
+        end_pos = np.mean( [u.atoms.residues[r].atoms.center_of_mass() for r in end], axis=0 )
+
+        seg = DoubleStrandedSegment(name=str(hid),
+                                    num_bp = np.sum(hmap==hid)//2,
+                                    start_position = start_pos,
+                                    end_position = end_pos)
+        assert(hid == len(doubleSegments))
+        doubleSegments.append(seg)
+
+    ## Create single-stranded segments
+    singleSegments = []
+    for hid in single_stranded_helices:
+        maxrank = np.max( hrank[hmap==hid] )
+        start = np.where((hmap == hid) * (hrank == 0))[0]
+        end = np.where((hmap == hid) * (hrank == maxrank))[0]
+
+        start_pos = np.mean( [u.atoms.residues[r].atoms.center_of_mass() for r in start], axis=0 )
+        end_pos = np.mean( [u.atoms.residues[r].atoms.center_of_mass() for r in end], axis=0 )
+
+        seg = SingleStrandedSegment(name=str(hid),
+                                    num_nt = np.sum(hmap==hid),
+                                    start_position = start_pos,
+                                    end_position = end_pos)
+        assert(hid == len(doubleSegments) + len(singleSegments))
+        singleSegments.append(seg)
+
+    ## Find crossovers and 5prime/3prime ends
+    crossovers,prime5,prime3 = [[],[],[]]
+    for s in u.segments:
+        r = s.atoms.select_atoms("name C1'").resindices
+        tmp = np.where(np.diff(hmap[r]) != 0)[0]
+        crossovers.extend( r[tmp] )
+        prime5.append(r[0])
+        prime3.append(r[-1])
+
+    ## Add connections
+    allSegments = doubleSegments+singleSegments
+    for r in crossovers:
+        seg1,seg2 = [allSegments[hmap[i]] for i in (r,r+1)]
+        nt1,nt2 = [hrank[i] for i in (r,r+1)]
+        f1,f2 = [fwd[i] for i in (r,r+1)]
+        if nt1 in (0,seg1.num_nt) or nt2 in (0,seg2.num_nt):
+            seg1.add_crossover(nt1,seg2,nt2,[f1,f2],type_="terminal_crossover")
+        else:
+            seg1.add_crossover(nt1,seg2,nt2,[f1,f2])
+
+    ## Add 5prime/3prime ends
+    for r in prime5:
+        seg = allSegments[hmap[r]]
+        seg.add_5prime(hrank[r],fwd[r])
+    for r in prime3:
+        seg = allSegments[hmap[r]]
+        seg.add_3prime(hrank[r],fwd[r])
+
+    ## Assign sequence
+    for hid in range(len(allSegments)):
+        resids = np.sort(np.where( (hmap==hid)*(fwd==1) )[0])
+        s = allSegments[hid]
+        s.sequence = [resname_to_key[u.residues[r].resname]
+                      for r in resids]
+
+    ## Build model
+    model = SegmentModel( allSegments,
+                          max_basepairs_per_bead = 5, 
+                          max_nucleotides_per_bead = 5, 
+                          local_twist = False,
+                          dimensions=(5000,5000,5000))
+    return model
+
+if __name__ == "__main__":
+    # u = mda.Universe("test-ssdna.psf", "test-ssdna.pdb")
+    # u = mda.Universe("test-ssdna.pdb")
+
+    # model = SegmentModelFromPdb("test-ssdna.pdb")
+    # model = SegmentModelFromPdb("nanoengineer2pdb/nanoengineer2pdb.pdb")
+    model = SegmentModelFromPdb("test-twist.pdb")
+    # model = SegmentModelFromPdb("/data/server7/6hb.pdb")
+
+    model._clear_beads()
+    model._generate_atomic_model(scale=1.0)
+    model._write_atomic_ENM( 'test-from-pdb', lattice_type="honeycomb" )
+
+    model.atomic_simulate( outputPrefix = 'test-from-pdb' )