01D. Exercises

Mingyang Lu

12/17/2023

How to access PDB data in Python?

In [1]:
# A Python function that retrieves the PDB file for the provided PDB ID
# and returns its coordinate data as a pandas DataFrame
def pdb_to_dataframe(pdb_id):
    """Download a PDB entry and return its atom records as a pandas DataFrame.

    The entry is fetched with Biopython's ``PDBList`` into the current
    working directory (``file_format="pdb"``), parsed with ``PDBParser``,
    and flattened into one row per atom.

    Parameters
    ----------
    pdb_id : str
        PDB identifier, e.g. ``"7l3u"``.

    Returns
    -------
    pandas.DataFrame
        One row per atom with the columns ``atom_name``, ``residue_name``,
        ``chain_id``, ``residue_number``, ``x``, ``y``, ``z`` and ``b``
        (the atom's ``bfactor``), in that order. The columns are present
        even when the parsed structure contains no atoms.

    Raises
    ------
    FileNotFoundError
        If no structure file exists on disk after the download step.

    Notes
    -----
    Every atom of every residue, chain and model is visited. Rows are not
    tagged with a model identifier, so a multi-model entry contributes one
    set of rows per model. Non-polymer residues (e.g. ``HOH``) are included.
    """
    import os

    from Bio import PDB
    import pandas as pd

    # Fixed column order, so an empty result still has the expected schema
    columns = [
        'atom_name', 'residue_name', 'chain_id', 'residue_number',
        'x', 'y', 'z', 'b',
    ]

    # Download PDB file into the current directory
    pdbl = PDB.PDBList()
    pdb_file_path = pdbl.retrieve_pdb_file(pdb_id, pdir='.', file_format="pdb")

    # NOTE(review): PDBList appears to only report a failed download on stdout
    # (the returned path may be missing or None), so fail here with a message
    # that names the requested ID instead of letting the parser fail later.
    if not pdb_file_path or not os.path.isfile(pdb_file_path):
        raise FileNotFoundError(
            f"Could not retrieve a PDB-format file for PDB ID {pdb_id!r}"
        )

    # Read PDB file
    pdb_parser = PDB.PDBParser(QUIET=True)
    structure = pdb_parser.get_structure(pdb_id, pdb_file_path)

    # Flatten the model -> chain -> residue -> atom hierarchy into records
    atom_data = [
        {
            'atom_name': atom.get_name(),
            'residue_name': residue.resname,
            'chain_id': chain.id,
            'residue_number': residue.id[1],
            'x': atom.coord[0],
            'y': atom.coord[1],
            'z': atom.coord[2],
            'b': atom.bfactor,
        }
        for model in structure
        for chain in model
        for residue in chain
        for atom in residue
    ]

    # Create a DataFrame with an explicit schema
    return pd.DataFrame(atom_data, columns=columns)
In [2]:
# Example: fetch the atomic coordinates of PDB entry "7l3u"
pdb_id = "7l3u"
df = pdb_to_dataframe(pdb_id=pdb_id)

# Show the resulting table (pandas abbreviates long frames when printing)
print(df)
Structure exists: './pdb7l3u.ent' 
     atom_name residue_name chain_id  residue_number       x       y       z  \
0            N          VAL        A               1  -8.469  -8.360  10.710   
1           CA          VAL        A               1  -7.840  -7.437  11.652   
2            C          VAL        A               1  -6.372  -7.809  11.847   
3            O          VAL        A               1  -6.051  -8.980  12.024   
4           CB          VAL        A               1  -8.591  -7.388  12.994   
...        ...          ...      ...             ...     ...     ...     ...   
1401         O          HOH        A             441   2.231   6.639  11.973   
1402         O          HOH        A             442   4.713   1.622  14.629   
1403         O          HOH        A             443  -8.433  15.713  -9.062   
1404         O          HOH        A             444  12.564  13.715   1.647   
1405         O          HOH        A             445   4.148   5.888  10.888   

          b  
0     50.87  
1     50.51  
2     49.62  
3     50.06  
4     51.92  
...     ...  
1401  44.39  
1402  42.52  
1403  38.80  
1404  31.93  
1405  38.42  

[1406 rows x 8 columns]
In [4]:
# Print only the x-coordinate column
print(df.loc[:, 'x'])
0       -8.469
1       -7.840
2       -6.372
3       -6.051
4       -8.591
         ...  
1401     2.231
1402     4.713
1403    -8.433
1404    12.564
1405     4.148
Name: x, Length: 1406, dtype: float32