Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Cif reader #4681

Open
wants to merge 12 commits into
base: develop
Choose a base branch
from
89 changes: 89 additions & 0 deletions package/MDAnalysis/coordinates/CIF.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@

# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*-
# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
#
# MDAnalysis --- https://www.mdanalysis.org
# Copyright (c) 2006-2017 The MDAnalysis Development Team and contributors
# (see the file AUTHORS for the full list of names)
#
# Released under the GNU Public Licence, v2 or any higher version
#
# Please cite your use of MDAnalysis in published work:
#
# R. J. Gowers, M. Linke, J. Barnoud, T. J. E. Reddy, M. N. Melo, S. L. Seyler,
# D. L. Dotson, J. Domanski, S. Buchoux, I. M. Kenney, and O. Beckstein.
# MDAnalysis: A Python package for the rapid analysis of molecular dynamics
# simulations. In S. Benthall and S. Rostrup editors, Proceedings of the 15th
# Python in Science Conference, pages 102-109, Austin, TX, 2016. SciPy.
# doi: 10.25080/majora-629e541a-00e
#
# N. Michaud-Agrawal, E. J. Denning, T. B. Woolf, and O. Beckstein.
# MDAnalysis: A Toolkit for the Analysis of Molecular Dynamics Simulations.
# J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787
#

"""
PDBx (mmcif) files in MDAnalysis --- :mod:`MDAnalysis.coordinates.PDBx`
=======================================================================

Reads coordinates from a PDBx_ (mmcif) format file. Will populate the Universe positions from the
``_atom_site.Cartn_x`` field in the PDBx file. Will populate the unitcell dimensions from the ``_cell`` section.


.. _PDBx:
https://pdb101.rcsb.org/learn/guide-to-understanding-pdb-data/beginner’s-guide-to-pdb-structures-and-the-pdbx-mmcif-format
"""
import gemmi
import numpy as np

from . import base




class PDBxReader(base.SingleFrameReaderBase):
format = ['cif', 'pdbx']
units = {'time': None, 'length': 'Angstrom'}

def _read_first_frame(self):
doc = gemmi.cif.read(self.filename)

block = doc.sole_block()

coords = block.find('_atom_site.', ['Cartn_x', 'Cartn_y', 'Cartn_z'])
self.natoms = len(coords)

xyz = np.zeros((self.natoms, 3), dtype=np.float32)

for i, (x, y, z) in enumerate(coords):
xyz[i, :] = x, y, z

ts = self.ts = base.Timestep.from_coordinates(xyz, **self._ts_kwargs)
ts.frame = 0

box = block.find('_cell.', ['length_a', 'length_b', 'length_c',
'angle_alpha', 'angle_beta', 'angle_gamma'])
if box:
unitcell = np.zeros(6, dtype=np.float64)
unitcell[:] = box[0]

ts.dimensions = unitcell

if self.convert_units:
# in-place !
self.convert_pos_from_native(self.ts._pos)
if self.ts.dimensions is not None:
self.convert_pos_from_native(self.ts.dimensions[:3])



@staticmethod
def parse_n_atoms(filename, **kwargs):
doc = gemmi.cif.read(self.filename)
block = doc.sole_block()
coords = block.find('_atom_site.', ['Cartn_x', 'Cartn_y', 'Cartn_z'])
natoms = len(coords)
del doc
return n_atoms


101 changes: 101 additions & 0 deletions package/MDAnalysis/coordinates/PDBx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*-
# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
#
# MDAnalysis --- https://www.mdanalysis.org
# Copyright (c) 2006-2017 The MDAnalysis Development Team and contributors
# (see the file AUTHORS for the full list of names)
#
# Released under the GNU Public Licence, v2 or any higher version
#
# Please cite your use of MDAnalysis in published work:
#
# R. J. Gowers, M. Linke, J. Barnoud, T. J. E. Reddy, M. N. Melo, S. L. Seyler,
# D. L. Dotson, J. Domanski, S. Buchoux, I. M. Kenney, and O. Beckstein.
# MDAnalysis: A Python package for the rapid analysis of molecular dynamics
# simulations. In S. Benthall and S. Rostrup editors, Proceedings of the 15th
# Python in Science Conference, pages 102-109, Austin, TX, 2016. SciPy.
# doi: 10.25080/majora-629e541a-00e
#
# N. Michaud-Agrawal, E. J. Denning, T. B. Woolf, and O. Beckstein.
# MDAnalysis: A Toolkit for the Analysis of Molecular Dynamics Simulations.
# J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787
#

"""
PDBx (mmcif) files in MDAnalysis --- :mod:`MDAnalysis.coordinates.PDBx`
=======================================================================

Reads coordinates from a PDBx_ (mmcif) format file. Will populate the Universe positions from the
``_atom_site.Cartn_x`` field in the PDBx file. Will populate the unitcell dimensions from the ``_cell`` section.


.. _PDBx:
https://pdb101.rcsb.org/learn/guide-to-understanding-pdb-data/beginner’s-guide-to-pdb-structures-and-the-pdbx-mmcif-format
"""
import gemmi
import numpy as np
from gemmi import FractionalBox

from . import base


class PDBxReader(base.SingleFrameReaderBase):
format = ['cif', 'pdbx']
units = {'time': None, 'length': 'Angstrom'}


def __init__(self, filename, convert_units=True, **kwargs):
super().__init__(filename, convert_units=convert_units, **kwargs)

def _read_first_frame(self):
doc = gemmi.cif.read(self.filename)

block = doc.sole_block()

# PDBx/mmCIF with _cell. sections
box = block.find('_cell.', ['length_a', 'length_b', 'length_c',
'angle_alpha', 'angle_beta', 'angle_gamma'])
# CIF file with _cell_ sections
if not box:
box = block.find('_cell_', ['length_a', 'length_b', 'length_c',
'angle_alpha', 'angle_beta', 'angle_gamma'])

if box:
unitcell = np.zeros(6, dtype=np.float64)
unitcell[:] = box[0]

# PDBx/mmCIF with _cell. sections
coords = block.find('_atom_site.', ['Cartn_x', 'Cartn_y', 'Cartn_z'])
fractional = lambda xyz: xyz

# CIF file with _cell_ sections
if not coords:
coords = block.find('_atom_site_', ['fract_x', 'fract_y', 'fract_z'])
fractional = lambda xyz: np.multiply(xyz, unitcell[:3])

if not coords:
raise ValueError("No coordinates found in the file")

self.n_atoms = len(coords)

xyz = np.zeros((self.n_atoms, 3), dtype=np.float32)

for i, (x, y, z) in enumerate(coords):
xyz[i, :] = x, y, z

# for CIF: multiply fractional coordinates by unitcell lengths
xyz = fractional(xyz)

ts = self.ts = base.Timestep.from_coordinates(xyz, **self._ts_kwargs)
ts.frame = 0
ts.dimensions = unitcell

if self.convert_units:
# in-place !
self.convert_pos_from_native(self.ts._pos)
if self.ts.dimensions is not None:
self.convert_pos_from_native(self.ts.dimensions[:3])

return ts


1 change: 1 addition & 0 deletions package/MDAnalysis/coordinates/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -791,3 +791,4 @@ class can choose an appropriate reader automatically.
from . import NAMDBIN
from . import FHIAIMS
from . import TNG
from . import PDBx
124 changes: 124 additions & 0 deletions package/MDAnalysis/topology/PDBxParser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*-
# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
#
# MDAnalysis --- https://www.mdanalysis.org
# Copyright (c) 2006-2017 The MDAnalysis Development Team and contributors
# (see the file AUTHORS for the full list of names)
#
# Released under the GNU Public Licence, v2 or any higher version
#
# Please cite your use of MDAnalysis in published work:
#
# R. J. Gowers, M. Linke, J. Barnoud, T. J. E. Reddy, M. N. Melo, S. L. Seyler,
# D. L. Dotson, J. Domanski, S. Buchoux, I. M. Kenney, and O. Beckstein.
# MDAnalysis: A Python package for the rapid analysis of molecular dynamics
# simulations. In S. Benthall and S. Rostrup editors, Proceedings of the 15th
# Python in Science Conference, pages 102-109, Austin, TX, 2016. SciPy.
# doi: 10.25080/majora-629e541a-00e
#
# N. Michaud-Agrawal, E. J. Denning, T. B. Woolf, and O. Beckstein.
# MDAnalysis: A Toolkit for the Analysis of Molecular Dynamics Simulations.
# J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787
#
"""
PDBx topology parser
====================


See Also
--------
:class:`MDAnalysis.coordinates.PDBx`

"""
import gemmi
import numpy as np

from .base import TopologyReaderBase, change_squash
from ..core.topology import Topology
from ..core.topologyattrs import (
Atomnames,
Atomids,
AltLocs,
Elements,
ICodes,
RecordTypes,
Resids,
Resnames,
Segids,
)


class PDBxParser(TopologyReaderBase):
"""Read a Topology from a PDBx file

Creates the following attributes from these "_atom_site" PDBx loop entries
- "group_PDB" RecordType
- "id" AtomId
- "label_alt_id" AltLoc
- "label_type_symbol" Element
- "label_atom_id" AtomName
- "auth_seq_id" Resid
- "auth_comp_id" Resname
- "pdbx_PDB_ins_code" ICode
- "auth_asym_id" ChainID
"""
format = ['PDBx', 'cif']

def parse(self, **kwargs) -> Topology:
doc = gemmi.cif.read(self.filename)
block = doc.sole_block()

attrs = []

def objarr(x):
return np.array(x, dtype=object)

# hierarchy correspondence:
# seq_id -> residues
# entity_id -> chains
if recordtypes := block.find('_atom_site.', ['group_PDB']):
attrs.append(RecordTypes(recordtypes))
ids = block.find_loop('_atom_site.id')
n_atoms = len(ids)
attrs.append(Atomids(ids))
if altlocs := block.find_loop('_atom_site.label_alt_id'):
altlocs = np.array(altlocs, dtype=object)
altlocs[altlocs == '.'] = ''
attrs.append(AltLocs(altlocs))
if elements_loop := block.find_loop('_atom_site.type_symbol'):
attrs.append(Elements(objarr(elements_loop)))
if names_loop := block.find_loop('_atom_site.label_atom_id'):
attrs.append(Atomnames(objarr(names_loop)))

# sort out residues/segments
# label_seq_id seems to not cover entire model unlike author versions
resids = np.array(block.find_loop('_entity_poly_seq.num'))
resnames = np.array(block.find_loop('_entity_poly_seq.mon_id'))
icodes = np.array(block.find_loop('_atom_site.pdbx_PDB_ins_code'))
chainids = np.array(block.find_loop('_atom_site.auth_asym_id'))

try:
residx, (resids, icodes, resnames, chainids) = change_squash(
(resids, icodes), (resids, icodes, resnames, chainids)
)
segidx, (chainids,) = change_squash((chainids,), (chainids,))
except IndexError:
...
attrs.extend((
Resids(resids),
Resnames(objarr(resnames)),
ICodes(objarr(icodes)),
Segids(chainids),
))

n_residues = len(resids)
n_segments = len(chainids)

return Topology(
n_atoms=n_atoms,
n_res=n_residues,
n_seg=n_segments,
attrs=attrs,
atom_resindex=residx,
residue_segindex=segidx,
)
1 change: 1 addition & 0 deletions package/MDAnalysis/topology/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,3 +333,4 @@
from . import MinimalParser
from . import ITPParser
from . import FHIAIMSParser
from . import PDBxParser
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
.. automodule:: MDAnalysis.coordinates.PDBx
:members:
8 changes: 8 additions & 0 deletions testsuite/MDAnalysisTests/coordinates/test_cif.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import pytest
from MDAnalysisTests.datafiles import PDBX, CIF, MMCIF



def test_pdbx()

def test_
Loading