This post originated from an RSS feed registered with Python Buzz
by Andrew Dalke.
Original Post: matplotlib
Feed Title: Andrew Dalke's writings
Feed URL: http://www.dalkescientific.com/writings/diary/diary-rss.xml
Feed Description: Writings from the software side of bioinformatics and chemical informatics, with a heaping of Python thrown in for good measure.
The most popular plotting package for Python is matplotlib. It has many
features, is flexible and easy to use, and perhaps most importantly it
makes high quality plots.
Here's a plot I made with it. The data source is the PubMed file
compounds_500001_510000.sdf.gz. It contains computed
molecular weights for every compound and XLogP values for many of the
compounds. I plotted MW vs. XLogP for the first 200 compounds with
both values. The first 100 are drawn as blue squares, the second 100
are drawn as red triangles.
Here's the code to make the image. The read_data() function
builds on the work I did in
my previous essay.
from openeye.oechem import *
from pylab import *
# Read up to 'limit' records that have XLogP values
# Return the lists of:
# identifiers, molecular weights, XLogP values
def read_data(ifs, limit = None):
cids = []
weights = []
xlogps = []
for i, mol in enumerate(ifs.GetOEGraphMols()):
# Some of the compounds don't have an XLOGP value
# Skip those molecules
if not OEHasSDData(mol, "PUBCHEM_CACTVS_XLOGP"):
continue
cid = OEGetSDData(mol, "PUBCHEM_COMPOUND_CID")
weight = OEGetSDData(mol, "PUBCHEM_OPENEYE_MW")
xlogp = OEGetSDData(mol, "PUBCHEM_CACTVS_XLOGP")
# double check
if (cid == "" or weight == "" or xlogp == ""):
raise AssertionError( (cid, weight, xlogp) )
cids.append(cid)
weights.append(float(weight))
xlogps.append(float(xlogp))
if limit is not None and len(cids) >= limit:
break
return cids, weights, xlogps
def main():
filename = "/Users/dalke/databases/compounds_500001_510000.sdf.gz"
ifs = oemolistream(filename)
ax = subplot(111)
cids, weights, xlogps = read_data(ifs, 100)
ax.scatter(weights, xlogps)
cids, weights, xlogps = read_data(ifs, 100)
ax.scatter(weights, xlogps, marker = "^", color="red")
ax.set_xlabel("Atomic weight")
ax.set_ylabel("CACTVS XLogP")
show()
if __name__ == "__main__":
OEThrow.SetLevel(OEErrorLevel_Error)
main()
I think the code is straight-forward enough that I don't need to
explain it. For details on the matplotlib calls, follow the links.
I'll modify the plot to have an ellipse for each of the two data sets.
An ellipse for a given data set will be centered at the average of the
molecular weight and XLogP values and use the standard deviation
values for radii. Here's the result:
From what I can tell there is no command to create an ellipse but it's
easy to make a function that creates an N-sided regular polygon that's
close enough. In matplotlib there are lines, patches, and
collections. A polygon is a type of patch so once created I use
add_patch() to add it to the axes.
from openeye.oechem import *
from pylab import *
import stats
# Fake an ellipse using an N-sided polygon
def Ellipse((x,y), (rx, ry), resolution=20, orientation=0, **kwargs):
theta = 2*pi/resolution*arange(resolution) + orientation
xs = x + rx * cos(theta)
ys = y + ry * sin(theta)
return Polygon(zip(xs, ys), **kwargs)
# Read up to 'limit' records that have XLogP values
# Return the lists of:
# identifiers, molecular weights, XLogP values
def read_data(ifs, limit = None):
cids = []
weights = []
xlogps = []
for i, mol in enumerate(ifs.GetOEGraphMols()):
# Some of the compounds don't have an XLOGP value
# Skip those molecules
if not OEHasSDData(mol, "PUBCHEM_CACTVS_XLOGP"):
continue
cid = OEGetSDData(mol, "PUBCHEM_COMPOUND_CID")
weight = OEGetSDData(mol, "PUBCHEM_OPENEYE_MW")
xlogp = OEGetSDData(mol, "PUBCHEM_CACTVS_XLOGP")
if (cid == "" or weight == "" or xlogp == ""):
raise AssertionError( (cid, weight, xlogp) )
cids.append(cid)
weights.append(float(weight))
xlogps.append(float(xlogp))
if limit is not None and len(cids) >= limit:
break
return cids, weights, xlogps
def calculate_ellipse_data(xdata, ydata):
xcenter = stats.lmean(xdata)
xradius = stats.lstdev(xdata)
ycenter = stats.lmean(ydata)
yradius = stats.lstdev(ydata)
return (xcenter, ycenter), (xradius, yradius)
def main():
filename = "/Users/dalke/databases/compounds_500001_510000.sdf.gz"
ifs = oemolistream(filename)
ax = subplot(111)
cids, weights, xlogps = read_data(ifs, 100)
ax.scatter(weights, xlogps)
center, radii = calculate_ellipse_data(weights, xlogps)
ax.add_patch(Ellipse(center, radii, fill=0, edgecolor="blue"))
cids, weights, xlogps = read_data(ifs, 100)
ax.scatter(weights, xlogps, marker = "^", color="red")
center, radii = calculate_ellipse_data(weights, xlogps)
ax.add_patch(Ellipse(center, radii, fill=0, edgecolor="red"))
ax.set_xlabel("Atomic weight")
ax.set_ylabel("CACTVS XLogP")
show()
if __name__ == "__main__":
OEThrow.SetLevel(OEErrorLevel_Error)
main()