diff --git a/docs/source/qml_examples/examples.ipynb b/docs/source/qml_examples/examples.ipynb index caad1725f..48181bb14 100644 --- a/docs/source/qml_examples/examples.ipynb +++ b/docs/source/qml_examples/examples.ipynb @@ -458,7 +458,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -471,6 +471,7 @@ ], "source": [ "from qml.aglaia.aglaia import ARMP\n", + "import matplotlib.pyplot as plt\n", "import glob\n", "import numpy as np\n", "import os\n", @@ -488,17 +489,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Then, we can create the estimator and specify all the hyper-parameters needed. For example, below we define an estimator that will do 100 training epochs, will use the Atom Centered Symmetry Functions (ACSF) as the representation, and will have a L1 regularisation parameter on the weights of 0.001." + "Then, we can create the estimator and specify all the hyper-parameters needed. For example, below we define an estimator that will do 5000 training epochs, will use the Atom Centered Symmetry Functions (ACSF) as the representation, and will have a L1 and L2 regularisation parameters on the weights of 0.0 and a learning rate of 0.075." ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "estimator = ARMP(iterations=100, representation='acsf', representation_params={\"radial_rs\": np.arange(0,10, 1), \"angular_rs\": np.arange(0.5, 10.5, 1),\n", - "\"theta_s\": np.arange(0, 3.14, 1)}, l1_reg=0.001, scoring_function=\"rmse\")" + "acsf_params = {\"nRs2\": 5, \"nRs3\": 5, \"nTs\": 5, \"rcut\": 5, \"acut\": 5, \"zeta\": 220.127, \"eta\": 30.8065}\n", + "estimator = ARMP(iterations=5000, representation_name='acsf', representation_params=acsf_params, learning_rate=0.075, l1_reg=0.0, l2_reg=0.0, scoring_function=\"rmse\")" ] }, { @@ -510,20 +511,20 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(100, 19, 270)\n" + "(100, 19, 165)\n" ] } ], "source": [ "estimator.generate_compounds(filenames)\n", - "estimator.generate_representation()\n", + "estimator.generate_representation(method=\"fortran\")\n", "print(estimator.representation.shape)" ] }, @@ -538,7 +539,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -549,16 +550,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "At this point, we have stored 100 data samples in the class. Let's say that we want to train on 80% of the data and test on 20%. To do this we need to pass a list of indices to the fit method specifying on which samples we would like to train. This can be done as follows:" + "At this point, we have stored 100 data samples in the class. To train the model, we need to specify the indices of the samples we want to train on. Here, since we have only loaded 100 samples, we are going to use all of them." ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ - "idx_train = np.arange(0,80)\n", + "idx_train = np.arange(0,100)\n", "\n", "estimator.fit(idx_train)" ] @@ -572,24 +573,37 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "The RMSE is 74 kJ/mol\n" + "The RMSE is 0.05667379826437678 kJ/mol\n" ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZAAAAEKCAYAAAA8QgPpAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3XucXVV99/HPl0mA4aIDEmiuTcA0NohcnEIQ64OiJKFIoi9pQ/UxYmoeKxSRNkAKT/GCF4wVpLXaKGhURCjGECgSI0JpbROZkEC4pYSAMAmSKCQoTCGXX//Y6yRnJmfOnDlzrjPf9+t1Xmfvtdc565cNM7/Za+29liICMzOz/tqn3gGYmVlzcgIxM7OyOIGYmVlZnEDMzKwsTiBmZlYWJxAzMyuLE4iZmZXFCcTMzMriBGJmZmUZVu8Aqumwww6L8ePH1zsMM7OmsmrVql9HxIi+6g3qBDJ+/Hg6OjrqHYaZWVOR9MtS6rkLy8zMyuIEYmZmZXECMTOzsjiBmJlZWZxAzMysLIP6Liwzs6FmyeqNLFi2jk1buxjV1sq8qZOYefzoqrTlBGJmNkgsWb2R+YvX0rV9JwAbt3Yxf/FagKokkap3YUm6XtJmSQ8VOPY3kkLSYWlfkq6VtF7Sg5JOyKs7W9Lj6TW72nGbmTWbBcvW7U4eOV3bd7Jg2bqqtFeLMZBvA9N6FkoaC7wLeDqveDowMb3mAl9LdQ8FrgBOAk4ErpB0SFWjNjNrMpu2dvWrfKCqnkAi4l7g+QKHrgYuBiKvbAbwncisANokjQSmAssj4vmIeAFYToGkZGY2lI1qa+1X+UDV5S4sSWcBGyPigR6HRgPP5O13prLeys3MLJk3dRKtw1u6lbUOb2He1ElVaa/mg+iSDgAuA04vdLhAWRQpL/T9c8m6vxg3blyZUZqZNZ/cQPlgvgvrKGAC8IAkgDHA/ZJOJLuyGJtXdwywKZWf2qP8nkJfHhELgYUA7e3tBZOMmdlgNfP40VVLGD3VvAsrItZGxOERMT4ixpMlhxMi4lfAUuCD6W6sKcC2iHgWWAacLumQNHh+eiozM7M6qcVtvDcC/wVMktQpaU6R6ncAG4D1wDeAjwFExPPAZ4D70uvTqczMzOpEEYO3l6e9vT28HoiZWf9IWhUR7X3V81xYZmZWFicQMzMrixOImZmVxQnEzMzK4gRiZmZlcQIxM7OyOIGYmVlZnEDMzKwsTiBmZlYWJxAzMyuLE4iZmZXFCcTMzMriBGJmZmVxAjEzs7I4gZiZWVmcQMzMrCy1WJHwekmbJT2UV7ZA0mOSHpT0I0ltecfmS1ovaZ2kqXnl01LZekmXVjtuMzMrrhZXIN8GpvUoWw68MSLeBPw3MB9A0mRgFnB0+sw/SWqR1AJ8FZgOTAbOSXXNzKxOqp5AIuJe4PkeZT+JiB1pdwUwJm3PAH4QEa9ExJNka6OfmF7rI2JDRLwK/CDVNTOzOmmEMZAPAz9O26OBZ/KOdaay3srNzKxO6ppAJF0G7ABuyBUVqBZFygt951xJHZI6tmzZUplAzcxsL3VLIJJmA2cC74+IXDLoBMbmVRsDbCpSvpeIWBgR7RHRPmLEiMoHbmZmQJ0SiKRpwCXAWRHxct6hpcAsSftJmgBMBH4B3AdMlDRB0r5kA+1Lax23mZntMazaDUi6ETgVOExSJ3AF2V1X+wHLJQGsiIiPRsTDkm4GHiHr2jovInam7zkfWAa0ANdHxMPVjt3MzHqnPb1Hg097e3t0dHTUOwwzs6YiaVVEtPdVr88rEEntwB8Do4Au4CHgpxHxfNEPmpnZoNbrGIikD0m6n6y7qRVYB2wG3krW9bRI0rjahGlmZo2m2BXIgcApEdFV6KCk48gGuZ+uRmBmZtbYek0gEfHVYh+MiDWVD8fMzJpFrwlE0rXFPhgRF1Q+HDMzaxbFurBW1SwKMzNrOsW6sBbl70s6OCuO31U9KjMza3h9Poku6Y2SVpPdvvuIpFWSjq5+aGZm1shKmcpkIXBRRPx+RIwD/hr4RnXDMjOzRldKAjkwIu7O7UTEPWS3+JqZ2RBWylxYGyT9f+C7af8DwJPVC8nMzJpBKVcgHwZGAIuBH6Xtc6sZlJmZNb4+r0Ai4gXAz3yYmVk3pU6m+LfA+Pz6EfGm6oVlZmaNrpQxkBuAecBaYFd1wzEzs2ZRSgLZEhFe/c/MzLopZRD9CknflHSOpPfmXqU2IOl6SZslPZRXdqik5ZIeT++HpHJJulbSekkPSjoh7zOzU/3H03rqZmZWR6UkkHOB44BpwLvT68x+tPHt9Nl8lwJ3RcRE4K60DzCdbIr4icBc4GuQJRyypXBPAk4kS2qH9CMGMzOrsFK6sI6NiGPKbSAi7pU0vkfxDLJ10gEWAfcAl6Ty70S2zu4KSW2SRqa6y3OrIEpaTpaUbiw3LjMzG5hSrkBWSJpc4XaPiIhnAdL74al8NPBMXr3OVNZbuZmZ1UkpVyBvBWZLehJ4BRDZrLzVuI1XBcqiSPneXyDNJev+Ytw4r7hrZlYtpSSQnuMXlfCcpJER8WzqotqcyjuBsXn1xgCbUvmpPcrvKfTFEbGQbAJI2tvbCyYZMzMbuF67sCR1SPoK8IfAcxHxy/zXANtdCuTupJoN3JpX/sF0N9YUYFvq4loGnC7pkDR4fnoqMzOzOil2BTKFrPtqGvApSb8h+6X944j471IbkHQj2dXDYZI6ye6m+gJws6Q5wNPA2an6HcAZwHrgZdKcWxHxvKTPAPelep/ODaibmVl9KLvhqYSKWVfTdLKEMhH4r4j4WBVjG7D29vbo6OiodxhmZk1F0qqIaO+rXiljIMDuu6WuB66XtA9w8gDiMzOzJtdrApF0G73c6UR2N9YTkp6OiGd6qWNmZoNYsSuQL/XxuaOBm/GViJnZkNRrAomIfwOQ9OaIWJV/TNK7I+JaSZ7S3cxsiCrlSfRvSNo9lYmkc4DLASLiL6oVmJmZNbZSBtHfB9wi6f1kt/V+kOw5DDMzG8JKWdJ2g6RZwBKy+ahOj4iuqkdmZmYNrdhdWGvpfhfWoUALsFKSl7Q1Mxviil2B9GfNDzMzG2KKJZDfRMTvin1Y0kF91TEzs8Gp2F1Yt0r6e0lvk3RgrlDSkZLmSFpGdWbqNTOzJlDsOZDTJJ0B/D/glDQL7g5gHfCvwOyI+FVtwjQzs0ZT9C6siLiDbIZcMzOzbkp5kNDMzGwvTiBmZlYWJxAzMytLnwlE0lGS9kvbp0q6QFJbJRqX9AlJD0t6SNKNkvaXNEHSSkmPS7pJ0r6p7n5pf306Pr4SMZiZWXlKuQL5IbBT0uuB64AJwPcH2rCk0cAFQHtEvJHsKfdZwFXA1RExEXgBmJM+Mgd4ISJeD1yd6pmZWZ2UkkB2RcQO4D3ANRHxCWBkhdofBrRKGgYcADwLvAO4JR1fBMxM2zPSPun4aZJUoTjMzKyfSkkg29MU7rOB21PZ8IE2HBEbyRateposcWwDVgFbU8IC6ARGp+3RZJM5ko5vA1430DjMzKw8pSSQc8lWHfxsRDwpaQLwvYE2nB5MnEHWJTYKOBCYXqBqbkLHQlcbey25K2mupA5JHVu2bBlomGZm1os+E0hEPAJcAtyf9p+MiC9UoO13Ak9GxJaI2A4sBt4CtKUuLYAxwKa03QmMBUjHXws8XyDehRHRHhHtI0aMqECYZmZWSCl3Yb0bWAPcmfaPk7S0Am0/DUyRdEAayzgNeAS4m2wRK8i6zW5N20vTPun4zyJirysQMzOrjVK6sD4JnAhsBYiINWTdTgMSESvJBsPvB9amWBaSXe1cJGk92RjHdekj1wGvS+UXAZcONAYzMytfKUva7oiIbT1ueKrIX/4RcQVwRY/iDWQJq2fd/wHOrkS7ZmY2cKUkkIck/TnQImki2bMb/1ndsMzMrNGV0oX1V8DRwCvAjcCLwIXVDMrMzBpfn1cgEfEycFl6mZmZAUUSiKRrIuJCSbdRYMwjIs6qamRmVtCS1RtZsGwdm7Z2MaqtlXlTJzHz+NF9f9CswopdgXw3vX+pFoGYWd+WrN7I/MVr6dq+E4CNW7uYv3gtgJOI1VyxJW1Xpfd/q104ZlbMgmXrdiePnK7tO1mwbJ0TiNVcn2MgktaydxfWNqADuDIiflONwMxsb5u2dvWr3KyaSrmN98fATvZM4T6LbF6qbcC3gXdXJTIz28uotlY2FkgWo9pa6xCNDXWl3MZ7SkTMj4i16XUZ8H8i4ipgfHXDM7N886ZOonV4S7ey1uEtzJs6qU4R2VBWSgI5SNJJuR1JJwIHpd0dhT9iZtUw8/jRfP69xzC6rRUBo9ta+fx7j/H4h9VFKV1Yc4BvScoljd8CcyQdCHy+apGZWUEzjx/thGENoWgCkbQPcGREHCPptYAiYmtelZurGp2ZmTWsol1YEbELOD9tb+uRPMzMbAgrZQxkuaS/kTRW0qG5V9UjMzOzhlbKGMiH0/t5eWUBHFn5cMzMrFmUMpnigBePMjOzwaeUJW0PkHS5pIVpf6KkMyvRuKQ2SbdIekzSo5JOTl1kyyU9nt4PSXUl6VpJ6yU9KOmESsRgZmblKWUM5FvAq8Bb0n4ncGWF2v8KcGdEvAE4FniUbKnauyJiInAXe5aunQ5MTK+5wNcqFIOZmZWhlARyVER8EdgOEBFdZFOZDIik1wBvI615HhGvpru8ZgCLUrVFwMy0PQP4TmRWAG2SRg40DjMzK08pCeRVSa2kCRUlHUW2OuFAHQlsIXtIcbWkb6aHE4+IiGcB0vvhqf5o4Jm8z3emMjMzq4NSEsgVwJ3AWEk3kHUrXVyBtocBJwBfi4jjgZfY011VSKGrnr0WupI0V1KHpI4tW7ZUIEwzMyukzwQSEcuB9wIfIlsTvT0i7qlA251AZ0SsTPu3kCWU53JdU+l9c179sXmfHwNsKhDvwohoj4j2ESNGVCBMMzMrpJQrEID9gReAF4HJkt420IYj4lfAM5Jy04ieBjwCLAVmp7LZwK1peynwwXQ31hRgW66ry8zMaq+UBaWuAv4MeBjYlYoDuLcC7f8VcIOkfYENwLlkSe1mSXOAp4GzU907gDOA9cDLqa6ZmdVJKU+izwQmRUQlBs67iYg1QHuBQ6cVqBt0fxrezMzqqJQurA3A8GoHYmZmzaWUK5CXgTWS7iLv9t2IuKBqUZmZWcMrJYEsTS8zM7Pdek0gkl4TES9GxKICx8ZVNywzM2t0xcZA7sltpO6rfEuqEo2ZmTWNYgkk/8nvngtIDXguLDMza27FEkj0sl1o38zMhphig+iHS7qI7Gojt03a9xwhZmZDXLEE8g3g4ALbAN+sWkRmZtYUek0gEfGpWgZiZmbNpdTJFM3MzLpxAjEzs7I4gZiZWVmKPYl+UW/HACLiy5UPx8zMmkWxu7Byd11NAv6IPfNhvZvKrAViZmZNrM+7sCT9BDghIn6b9j8J/EtNojMzs4ZVyhjIOODVvP1XgfGVCkBSi6TVkm5P+xMkrZT0uKSb0mqFSNov7a9PxysWg5mZ9V8pCeS7wC8kfVLSFcBK4DsVjOHjwKN5+1cBV0fERLJ12Oek8jnACxHxeuDqVM/MzOqkzwQSEZ8lW3/8BWArcG5EfK4SjUsaA/wJ6cl2SQLeAdySqiwiW1IXYEbaJx0/LdU3M7M6KPU23gOAFyPiK0CnpAkVav8a4GJgV9p/HbA1Inak/U5gdNoeDTwDkI5vS/XNzKwO+kwgqdvqEmB+KhoOfG+gDUs6E9gcEavyiwtUjRKO5X/vXEkdkjq2bNky0DDNzKwXpVyBvAc4C3gJICI20X1ixXKdApwl6SngB2RdV9cAbZJyd4eNATal7U5gLEA6/lrg+Z5fGhELI6I9ItpHjPCkwWZm1VJKAnk1IoL0176kAyvRcETMj4gxETEemAX8LCLeD9wNvC9Vmw3cmraXpn3S8Z+luMzMrA5KSSA3S/pnsiuDjwA/pbrTuV8CXCRpPdkYx3Wp/Drgdan8IuDSKsZgZmZ9UCl/xEt6F3A62TjEsohYXu3AKqG9vT06OjrqHYaZWVORtCoi2vuqV2wqk9wXXRURlwDLC5SZmdkQ1WcCAd5F1q2Ub3qBMrMBuXzJWm5c+Qw7I2iROOeksVw585h6h2VmvSg2G+9fAh8DjpL0YN6hg4H/rHZgNrRcvmQt31vx9O79nRG7951EzBpTsUH075PNvHtres+93pzuljKrmBtXPtOvcjOrv2Kz8W4Dtkn6CvB83my8B0s6KSJW1ipIG3yWrN7IgmXr2LS1i1Ftrezs5WaO3srNrP5KuY33a8Dv8vZfSmVmZVmyeiPzF69l49YuAti4tavXui2e7sysYZWSQJT/wF5E7KK0wXezghYsW0fX9p0l1T3npLFVjsbMylVKAtkg6QJJw9Pr48CGagdmg9emEq44WiQ+MGWcB9DNGlgpVxIfBa4FLiebzuQuYG41g7LBbVRba8Fuq9Ftrfz80nfUISIzK0cp64FsjohZEXF4RBwREX8eEZtrEZwNTvOmTqJ1eEu3stbhLcybOqlOEZlZOYo9B3JxRHxR0j9QYNr0iLigqpHZoDXz+GyJl/y7sOZNnbS73MyaQ7EurNwys55Myipu5vGjnTDMmlyx50BuS++LeqtjZmZDV7EurNso0HWVExFnVSUiMzNrCsW6sL6U3t8L/B57lrE9B3iqijGZmVkTKNaF9W8Akj4TEW/LO3SbpHurHpmZmTW0Uh4kHCHpyNyOpAnAgBcblzRW0t2SHpX0cHpAEUmHSlou6fH0fkgql6RrJa2X9KCkEwYag5mZla+UBPIJ4B5J90i6h2zN8gsr0PYO4K8j4g+BKcB5kiaTLVV7V0RMJHtoMbd07XRgYnrNxfNxmZnVVZ9PokfEnZImAm9IRY9FxCsDbTgingWeTdu/lfQoMBqYAZyaqi0C7iFbvGoG8J00L9cKSW2SRqbvMTOzGuvzCkTSAcA84PyIeAAYJ+nMSgYhaTxwPLASOCKXFNL74anaaCB/cYjOVGZmZnVQShfWt4BXgZPTfidwZaUCkHQQ8EPgwoh4sVjVAmV73WYsaa6kDkkdW7ZsqVSYZmbWQykJ5KiI+CKwHSAiuij8y7zfJA0nSx43RMTiVPycpJHp+EggN+9WJ5A/t/cYYFPP74yIhRHRHhHtI0YMeKzfzMx6UUoCeVVSK+mvfUlHAQMeA5Ek4Drg0Yj4ct6hpcDstD2bbEndXPkH091YU4BtHv8wM6ufUqZzvwK4Exgr6QbgFOBDFWj7FOD/AmslrUllfwt8AbhZ0hzgaeDsdOwO4AxgPfAycG4FYjAzszIVTSDpKuExsqfRp5B1XX08In490IYj4j/ovSvstAL1AzhvoO3aHj3XJfeMuGbWH0UTSESEpCUR8WbgX2sUk9VAbl3y3NKyG7d2MX/xWgAnETMrSSljICsk/VHVI7GaKrQuedf2nSxYtq5OEZlZsyllDOTtwEclPQW8RNbtFBHxpmoGZtXV27rkxdYrNzPLV0oCmV71KKzmeluXfFRbax2iMbNm1GsXlqT9JV1I9hT6NGBjRPwy96pZhFYVXpfczAaq2BXIIrKHB/+d7CpkMvDxWgRl1ed1yc1soIolkMkRcQyApOuAX9QmJKsVr0tuZgNRLIFsz21ExI7skRBrZH6uw8xqqVgCOVZSbnJDAa1pP3cX1muqHp2VZMnqjXzqtod54eXdOd/PdZhZ1RVb0ralt2PWON715Xt4fPNLBY/lnutwAjGzaijlNl5rQJcvWcv3VjzdZz0/12Fm1eIE0mSWrN7IRTetYVeJ9f1ch5lVixNIEyn1qiPHz3WYWTU5gTSBy5es5YaVTxN7rb/Yu7bW4XzyrKM9/mFmVeME0sCWrN7IZT9ay0uv7uy7cp6Jhx/I8otOrU5QZmaJE0iD6m93Vc4pRx3KDR85ue+KZmYD5ATSgN7/jf/i508836/P+KrDzGqt6RKIpGnAV4AW4JsR8YU6h1QxxZ7p6I2A908Zx5Uzj6lOUGZmvWiqBCKpBfgq8C6gE7hP0tKIeKS+kQ3MktUbufCmNX1XzHPA8H343Hvf5EFyM6ubpkogwInA+ojYACDpB8AMoGkTSDlXHR/wFYeZNYBmSyCjgWfy9juBk/IrSJoLzAUYN25c7SLrp3KuOvYRfPlPj/NVh5k1hGZLIIWmBO72dERELAQWArS3t/fjyYnaedMVd/LiK/27NXffFvHF9x3r5GFmDaPZEkgnMDZvfwywqU6x9Fs5d1eBb801s8bUbAnkPmCipAnARmAW8Of1Dak0J312Oc/99tV+fUbA1X/mLisza0xNlUDSwlbnA8vIbuO9PiIernNYRZV71eGBcjNrdE2VQAAi4g7gjnrHUYpyxjr2bxGPffaMKkVkZlY5TZdAmkG5Vx3DhJOHmTUNJ5AKKjdxgKciMbPm4wRSIeV0VwG8Zr8WHvzUtCpEZGZWXU4gFfCGy+7gf3b2/5ETX3WYWTNzAhmAcqdcd+Iws8HACaRM5cxhBXCNn+sws0HCCaQMly9Z2+/k4asOMxtsnEDKcOPKZ/qulAzbR3zpbM9hZWaDjxNICZas3siCZevYtLWLUW2t7IzSBsw9h5WZDWZOIH1Ysnoj8xevpWt7dovuxq1dfX7Gt+aa2VCwT70DaHQLlq3bnTxK8YEp45w8zGxI8BVIAZcvWcuNK5/ps6uqRWJnBC0S55w01pMfmtmQ4gTSQ6nPdoxua+Xnl76jBhGZmTUmd2H1UModVq3DW5g3dVINojEza1y+AumhWLeVgFFtrcybOsm35ZrZkOcE0kNuXKNQ+ROf91TrZmY5denCkrRA0mOSHpT0I0ltecfmS1ovaZ2kqXnl01LZekmXViu2c04a269yM7Ohql5jIMuBN0bEm4D/BuYDSJpMts750cA04J8ktUhqAb4KTAcmA+ekuhV35cxj+MCUcbRIQHbl4eVlzcz2VpcurIj4Sd7uCuB9aXsG8IOIeAV4UtJ64MR0bH1EbACQ9INU95FqxHflzGOcMMzM+tAId2F9GPhx2h4N5N8G1ZnKeivfi6S5kjokdWzZsqUK4ZqZGVTxCkTST4HfK3Dosoi4NdW5DNgB3JD7WIH6QeFEV/B2qYhYCCwEaG9v7/8qT2ZmVpKqJZCIeGex45JmA2cCp0Xsvu2pE8gfrR4DbErbvZWbmVkd1OsurGnAJcBZEfFy3qGlwCxJ+0maAEwEfgHcB0yUNEHSvmQD7UtrHbeZme1Rr+dA/hHYD1iu7G6nFRHx0Yh4WNLNZIPjO4DzImIngKTzgWVAC3B9RDxcn9DNzAxAUeLaFs1I0hbgl0WqHAb8ukbh9EejxgWNG1ujxgWNG1ujxgWOrRyVjOv3I2JEX5UGdQLpi6SOiGivdxw9NWpc0LixNWpc0LixNWpc4NjKUY+4GuE2XjMza0JOIGZmVpahnkAW1juAXjRqXNC4sTVqXNC4sTVqXODYylHzuIb0GIiZmZVvqF+BmJlZmYZEAmnk6eMLxFqXdlPbYyXdLelRSQ9L+ngqP1TSckmPp/dDUrkkXZtifVDSCVWOr0XSakm3p/0JklamuG5KD5mSHkS9KcW1UtL4KsfVJumW9P/Yo5JObqBz9on03/IhSTdK2r9e503S9ZI2S3oor6zf50nS7FT/8TSjRTXiaojfGYViyzv2N5JC0mFpv2bnbLeIGPQv4HRgWNq+CrgqbU8GHiB7qHEC8ATZg4otaftIYN9UZ3IN4qxLu3ntjwROSNsHk021Pxn4InBpKr807/ydQTYRpoApwMoqx3cR8H3g9rR/MzArbX8d+Mu0/THg62l7FnBTleNaBPxF2t4XaGuEc0Y24eiTQGve+fpQvc4b8DbgBOChvLJ+nSfgUGBDej8kbR9Shbga4ndGodhS+ViyB6t/CRxW63O2O45q/c/bqC/gPcANaXs+MD/v2DLg5PRallferV4VY6tLu0XiuRV4F7AOGJnKRgLr0vY/A+fk1d9drwqxjAHuAt4B3J5+SH6d90O++9zl/jum7WGpnqoU12vIfkmrR3kjnLPcLNaHpvNwOzC1nucNGE/3X9T9Ok/AOcA/55V3q1epuHocq+vvjEKxAbcAxwJPsSeB1PScRcTQ6MLqoaLTx1dYvdrdS+q+OB5YCRwREc8CpPfDU7VaxnsNcDGwK+2/DtgaETsKtL07rnR8W6pfDUcCW4Bvpe61b0o6kAY4ZxGxEfgS8DTwLNl5WEVjnLec/p6nevyMNNTvDElnARsj4oEeh2oe26BJIJJ+mvp5e75m5NUpdfr43sqrrV7tdg9COgj4IXBhRLxYrGqBsorHK+lMYHNErCqx7Vqex2FkXQxfi4jjgZfIumJ6U7PY0njCDLKullHAgWSrevbWfkP8/5c0xM9mo/3OkHQAcBnwd4UO9xJD1WKr12SKFReDY/r4YvHUhKThZMnjhohYnIqfkzQyIp6VNBLYnMprFe8pwFmSzgD2J+s2ugZokzQs/bWc33Yurk5Jw4DXAs9XIa5cW50RsTLt30KWQOp9zgDeCTwZEVsAJC0G3kJjnLec/p6nTuDUHuX3VCOwBv2dcRTZHwQPKJuIdgxwv6QTi8RWvXNWyf7NRn2Rra/+CDCiR/nRdB8Q20A2GDYsbU9gz4DY0TWIsy7t5rUv4DvANT3KF9B9oPOLaftP6D5o94saxHgqewbR/4Xug8EfS9vn0X0w+OYqx/TvwKS0/cl0vup+zoCTgIeBA1J7i4C/qud5Y+8xkH6dJ7LxnCfJBoMPSduHViGuhvmd0TO2HseeYs8YSE3PWcQQGUQH1pP1Aa5Jr6/nHbuM7O6JdcD0vPIzyO5CeoJsFcVaxVqXdlPbbyW7tH0w71ydQdYPfhfweHo/NNUX8NUU61qgvQYxnsqeBHIk2Xox69Mvxf1S+f5pf306fmSVYzoO6EjnbUn6IW2IcwZ8CngMeAj4bvrFV5fzBtxINhazneyv4jnlnCeyMYn16XVuleJqiN8ZhWLrcfwp9iSQmp2z3MtPopuZWVkGzSC6mZnVlhOImZmVxQnEzMzK4gRiZmZlcQIxM7OyOIFYw5L0Oklr0utXkjbm7e9bwXbeKWlb3nevkfT2Sn1/L222SPr3Cn3XP0qDQ68zAAAEUUlEQVR6S9ruzJ85NpUNk7S1R9lySb83wHZfL2lN2j5O0jcH8n3WfAbNk+g2+ETEb8iesUDSJ4HfRcSX8usoexxXEbFr72/ol7sjYuYAv6ObvKe99xIRO4E/rkAbI4DjI+L8fnzmQODgiPjVQNvPiYg1ko6SNDqyObhsCPAViDWd9JfvQ5K+DtwPjM3/C1vSrNxfw5KOkLRYUoekX0iaUkY71ylbU+PHkvZPxyZKWiZplaR7Jf1BKv+epL+XdDfwOUmHS7pL0v2S/ildRbX1vCqQdGmK70FJf5fKDk5tPpDieF+BMM9mz0R/+bEfIOknks4t8Jl3AD9L9TolfVbSCkn3STohfe4JSR9JdfaR9OUUw9pe4oBstt8/K/X8WvNzArFmNRm4LrIJDIv9xXst2fQY7cCfAr11s7y9RxfW+FQ+iWxql6OBLiB3lbKQbAqQN5NN3f2Ped91FNn8SRcDnwbujIgTgDvIJjXsJs3xNY5s6pHjgLekLqkzgKci4tiIeCOwvEDcp5DNsJvvYLJf5osi4lsFPjMduDNv/6mImAKsAK4jm778LcBn0vGzyc73sWTT+18t6XD21kEFrqqsebgLy5rVExFxXwn13glMShPPARwiqTUiunrU26sLS9LrgfURsTYVrQLGpzGGKcAP8743/2fpX/K61N4KfBYgIm6X9NsCMZ5O9kt9ddo/CPgDsqn0vyDpC8BtEfHzAp8dSTadfL7bgc9FxE0F6pNivyBvf2l6X0u2TshLwEuSdimbmfmtwPdTt9uvJP0H0E42bUe+zRRIkDZ4OYFYs3opb3sX3aes3j9vW8CJEfFqme28kre9k+xnRsCvI+K4EmIrNJV2TwKujIjr9jogtZNdiSyQdHtEfK5HlS66/3sBfg5Ml3Rz9JirSNIkshl688dmcv/GXXT/9+5iz7+3FPuneGyIcBeWNb301/4LaVxiH7IumJyfks0yC2R3C1WgvReAZyW9J33nPpKO7aX6f5B1neW6qg4uUGcZMCcNbiNpjKTDJI0mu3Hgu8CXydYd6elR4PU9yv6WLIldW6D+NAqMmfThXmBWunPsCLJus44C9f6AbNJGGyKcQGywuISsX/8usllLc84DTkmD048AH+nl8z3HQN7TS72cWcBHJT1ANmX6mb3UuwL4E0n3kw1eP0f3KxQi4g6ydURWSFpLtmb5QWRjDvelW2UvBnpefQD8K93Xesg5H3itpM+RXUXkriym0X38oxS3kM3o+wBZQr4oIjYXqPf2FI8NEZ6N16yK0l1bOyJih6S3kg3It1fw+0V2lTM9elk9UtKbgX8gS2D3RsSJlWo/r41W4G7glDRWYkOAE4hZFUl6A9maDi1kVwEfje5L81aijZOB30bEXt1Hks4juwq7ICJ+Wsl2e7QziWx983ur1YY1HicQMzMri8dAzMysLE4gZmZWFicQMzMrixOImZmVxQnEzMzK4gRiZmZl+V8IEgkW813TcQAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "idx_test = np.arange(80, 100)\n", + "score = estimator.score(idx_train)\n", + "print(\"The RMSE is %s kJ/mol\" % (str(score)) )\n", "\n", - "score = estimator.score(idx_test)\n", - "print(\"The RMSE is %d kJ/mol\" % (score) )\n", + "energies_predict = estimator.predict(idx_train)\n", "\n", - "energies_predict = estimator.predict(idx_test)" + "plt.scatter(energies, energies_predict)\n", + "plt.xlabel(\"True Energies (kJ/mol)\")\n", + "plt.ylabel(\"Predicted Energies (kJ/mol)\")\n", + "plt.show()" ] }, { @@ -603,7 +617,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -616,7 +630,7 @@ "zs = data[\"arr_1\"]\n", "energies = data[\"arr_2\"]\n", "\n", - "estimator = ARMP(iterations=100, l2_reg=0.0, scoring_function=\"rmse\")\n", + "estimator = ARMP(iterations=3000, learning_rate=0.075, l1_reg=0.0, l2_reg=0.0, scoring_function=\"rmse\")\n", "\n", "estimator.set_representations(representations=representation)\n", "estimator.set_classes(zs)\n", @@ -627,34 +641,47 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In the same way as we did before, now we can use indices to specify on which samples to train, predict and score the model." + "In the same way as we did before, now we can use indices to specify on which samples to train, predict and score the model. Again, since we only have loaded 100 samples, we will train and predict on the same sets." ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "The RMSE is 50 kJ/mol\n" + "The RMSE is 0.30888228285160746 kJ/mol\n" ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYoAAAEKCAYAAAAMzhLIAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3XuUXXV99/H3J8MgE0AnaKBkIE8gxriklESnEI3tEpFrBYKPFyhUij4iT6FIsREitMQKQomote1jGwqWakS5hDFQIAIVaalBEiZhCJiWm5AJQrgEkExhMvk+f+x9wpnJOXv2ZM515vNaa9ac/Tv7nP3NWTP5zv5dvj9FBGZmZuVMqHcAZmbW2JwozMwskxOFmZllcqIwM7NMThRmZpbJicLMzDI5UZiZWSYnCjMzy+REYWZmmXaqdwCV8I53vCOmTZtW7zDMzJrKqlWrno+IycOdNyYSxbRp01i5cmW9wzAzayqSfpXnPHc9mZlZJicKMzPL5ERhZmaZnCjMzCyTE4WZmWUaE7OezMzGm67uXhYtX8eGTX1MaW9j/pEzmTe7oyrXcqIwM2syXd29LFjaQ1//AAC9m/pYsLQHoCrJwl1PZmZNZtHydduSREFf/wCLlq+ryvWcKMzMmsyGTX0jah8tJwozsyYzpb1tRO2j5URhZtZk5h85k7bWlkFtba0tzD9yZlWu58FsM7MmUxiw9qwnMzMra97sjqolhqHc9WRmZpmcKMzMLJMThZmZZXKiMDOzTE4UZmaWyYnCzMwyOVGYmVkmJwozM8tUk0Qh6WpJz0l6qKhtD0l3SPrv9PuktF2Svi3pUUkPSnpvLWI0M7PSanVH8c/AUUPazgfuiogZwF3pMcDRwIz063TgOzWK0czMSqhJooiIe4AXhzQfD1yTPr4GmFfU/i+RWAG0S9q7FnGamdn26jlGsVdEPAOQft8zbe8Ani46b33aNoik0yWtlLRy48aNVQ/WzGy8asTBbJVoi+0aIhZHRGdEdE6ePLkGYZmZjU/1TBTPFrqU0u/Ppe3rgX2LztsH2FDj2MzMLFXPRLEMODV9fCrw46L2T6ezn+YALxe6qMzMrPZqsh+FpGuBDwHvkLQeuAi4DLhO0meBp4BPpKffChwDPApsBk6rRYxmZlZaTRJFRJxU5qnDSpwbwJnVjcjMzPJqxMFsMzNrIE4UZmaWyYnCzMwyOVGYmVkmJwozM8vkRGFmZpmcKMzMLJMThZmZZXKiMDOzTE4UZmaWyYnCzMwyDVvrSVIn8HvAFKAPeAi4MyKG7lhnZtZQurp7WbR8HRs29TGlvY35R85k3uzt9kGzYZRNFJL+GDgbeAJYBawDdgE+CJwn6SHgLyLiqRrEaWY2Ihd29bBkxVPbdj3r3dTHgqU9AE4WI5R1R7ErMDci+ko9KWkWMIOkRLiZWcPo6u4dlCQK+voHWLR8nRPFCJVNFBHx91kvjIjVlQ/HzGz0Fi1ft/3+yakNm0r+7WsZsrqevp31wog4u/LhmJmNTKlxiKxkMKW9rYbRjQ1ZXU+rahaFmdkO6OruZcHSHvr6B4A3xyHaJ7by0ub+7c4XMP/ImTWOsvlldT1dU3wsafekOX5T9ajMzIbR1d3LF69bw0AM7mTq6x/gLTtNoK21ZVsCgSRJnDxnqscndsCw6ygk/bakbpJpsQ9LWiXpgNFeWNJMSauLvl6RdI6khZJ6i9qPGe21zGxsKdxJDE0SBS/39XPpxw6ko70NAR3tbXzzU7O4eN6BtQ10jMizZ/Zi4NyI+CmApA8BVwIfGM2FI2IdMCt9zxagF7gJOA34ZkR8fTTvb2ZjT2E8oneYAekp7W3Mm93hu4cKyZModi0kCYCIuFvSrhWO4zDgsYj4laQKv7WZjQVDxyPKaWtt8ThEheVJFI9L+gvge+nxKSSL8CrpRODaouOzJH0aWAl8MSJeqvD1zKwJdHX3snDZWjb1bT8wXUqLxKUfO9B3EhWWp9bTZ4DJwFKSrqHJJN1DFSFpZ+A44Pq06TvAdJJuqWeAK8q87nRJKyWt3LhxY6XCMbMG0dXdy/zr1+ROEm2tLVzxyYOcJKpg2DuK9K/5aq6ZOBp4ICKeTa/3bOEJSVcCt5SJazHJ+AmdnZ3l1taYWZNatHwd/Vvz/Wp3uI5TVeUtCvhlYFrx+RHxOxWK4SSKup0k7R0Rz6SHJ5DMtjKzceDCrh5+cN9T5MwPtLW2uKupBvKMUSwB5gM9wNZKXlzSROBw4PNFzZendaQCeHLIc2Y2BnV19/KlG9bwxkD+zgHfRdROnkSxMSKWVePiEbEZePuQtj+qxrXMrDF1dfcy/4Y19OdMEq0tYtHHPRZRS3kSxUWS/gm4C3i90BgRS6sWlZmNaclU1wfp6x9ZJ8Wkia1cdOwBThI1lidRnAa8G2jlza6nIJkFZWY2Ihd29fD9FSPbnaCjvY17z/9wlSKy4eRJFAdFhNe9m9modXX3jjhJTMCF/OotzzqKFZLeU/VIzGzMW7R83YjOb50A3/jULHc11VmeO4oPAqdKeoJkjEIkVWQrNT3WzMaJvJsGtbe1svA4j0U0ijyJ4qiqR2FmY9LQTYXK7RNRcMqcqa7w2oCydrhbCdwL3AbcHRH/U7OozKzpldpUqHWCaJkgBkqsqJs7fQ8niQaVdUcxh6Tb6SjgK5JeAJYDt0XEf9UiODNrLsV3EBOk7faL6N8atLe1IrHtzsLdTI0va4e7LcDd6ReS9iapy3SxpBnAzyPiT2oQo5k1gQu7eliy4ikKqSFrU6EnLvuD2gVmo5ZnjAKAtP7S1cDVkiYA769aVGbWVE6+8ufc+9iLuc6d0t5W5Wis0rLGKG4Gyq2pfx14TNJTEfF0VSIzs4bW1d3LeTc+yOtb8q+u9qZCzSnrjiJrK9KdgAOA6/Cdhdm409Xdy7nXrc5V5bVFYmsEU1zEr2lljVH8DEDS+yJiVfFzko6NiG9L8loKs3FmJN1MAm8mNAbkGaO4UtKpEdEDIOkk4Bzg5oj4P1WNzswaxkgSxLbXzJnqJDEG5EkUHwdukHQyyXTZTwNHVDUqM2soO5IkvC5i7MizFerjkk4EuoCngSMiIt86fDNrahd29XDtfU+XnepajldYjy1Zs556GDzraQ+gBbhPUiW3QjWzBvTuC27lf0aw4xwkdxFLPuf5LWNN1h3FR2sWhZk1jK7uXv7sR6vLzo0vxQlibMtKFC9ExG+yXixpt+HOMbPmUajPNJIkMWPPXZ0kxrisRPFjSauBHwOrIuI1AEn7A4cCnwSuBG4YTQCSngReBQaALRHRKWkP4EfANOBJ4JMR8dJormNmpXV197Jw2Vo29ZWv6lrKzi3icu9dPS5kraM4TNIxwOeBuZImAVuAdcC/AqdGxK8rFMehEfF80fH5wF0RcZmk89Pj8yp0LTNLdXX3Mv/6NfTnWTmX8kD1+JM56ykibgVurVEsxY4HPpQ+voakMKEThVkF7cje1bu0yEliHMpdFLCKAviJpAD+MSIWA3ulRQiJiGck7VnXCM3GkB1ZEwGw1+47c98Fh1chImt0jZAo5kbEhjQZ3CHpl3leJOl04HSAqVOnVjM+szHj8G/czX8/91ru8ztcn8logEQRERvS789Jugk4GHhW0t7p3cTewHMlXrcYWAzQ2dk5ssneZuNMV3cvX7phDW/kXBfR2iIWeaDaUhOGO0HSdElvSR9/SNLZktorcXFJu0ravfCYpDTIQ8Ay4NT0tFNJZl6Z2Q64sKuHc360OneSmDSx1UnCBslzR3Ej0CnpncBVJP+J/wA4pgLX3wu4SVIhlh9ExO2S7geuk/RZ4CngExW4ltm4M9IBa89oslLyJIqtEbFF0gnAtyLibyV1V+LiEfE4cFCJ9heAwypxDbPxqqu7lyVOElYBeRJFf1pa/FTg2LSttXohmdmO2tEifh3tbU4SVlaeRHEacAZwSUQ8IWk/4PvVDcvMRqKru5dzf7Sa/JuSvsnbk9pw8pQZf1jSecDU9PgJ4LJqB2Zmw+vq7uUrN6/lpc0jK79R4OmvlsewiULSsST7Z+8M7CdpFvBXEXFctYMzs/Iu7OphyYqnRlTATyS7zrmbyUYiT9fTQpK1DXcDRMTqtPvJzOqkMFCdN0kImOK7B9tBeRLFloh4OZ3CWuAFbmZ10NXdy6Ll6+jdlH+TSc9mstHKkygekvSHQIukGcDZwH9WNywzG2pHuppm7Lmrk4SN2rArs4E/BQ4AXgeuBV4BzqlmUGY22Ei7miYouZO449wPVTMsGyfyzHraDFyQfplZjRS6mTZs6mOCNGyS8EC1VUvZRCHpWxFxjqSbKTEm4VlPZtUztBT4cAvoPM3VqinrjuJ76fev1yIQM0tc2NWTe78IAd/81CwnCKuqrK1QV6Xff1a7cMzs2vueznVeoavJScKqLc+Cux6273p6GVgJXJwW8DOzCsnqZmqR2BrhNRFWU3mmx94GDJCUFgc4keSPmZeBf+bNQoFmVgEtUtlkccUnvU+E1V6eRDE3IuYWHfdIujci5ko6pVqBmY1lxTOaht4dnHTIviX3kJg7fQ8nCauLPOsodpN0SOFA0sHAbunhlqpEZTaGdXX3smBpD72b+gigd1MfC5b20NXdC8DF8w7klDlTaUmrIbRInDJnKks+9/46Rm3jmWKYaXeSOoHv8mZyeBX4LPAw8AcRcV1VI8yhs7MzVq5cWe8wzHKZe9m/lSzB0dHexr3nf7gOEdl4JWlVRHQOd15m15OkCcD+EXGgpLeRJJZNRafUPUmYNZsNZeo0lWs3q7fMrqeI2AqclT5+eUiSMLMdMKW9bUTtZvWWZ4ziDkl/LmlfSXsUvkZ74fT9firpEUlrJX0hbV8oqVfS6vTrmNFey6yRzD9yJm2tLYPavMucNbI8s54+k34/s6gtgP1Hee0twBcj4gFJuwOrJN2RPvfNiPCKcBuTCjOXys16Mms0eYoCVmWTooh4BngmffyqpEcA/6ZY08qa8jrUvNkdTgzWNIbtepI0UdKFkhanxzMkfbSSQUiaBswG7kubzpL0oKSrJU2q5LXMqmG4Ka9mzSzPGMV3gTeAD6TH64GLKxWApN2AG4FzIuIV4DvAdGAWyR3HFWVed7qklZJWbty4sVLhmO2QRcvX0dc/MKitr3+ARcvX1Skis8rJkyimR8TlQD9ARPSRlPAYNUmtJEliSUQsTd//2YgYSGdcXUmyX/d2ImJxRHRGROfkyZMrEY7ZDvOUVxvL8gxmvyGpjbQwoKTpJLvdjYqSTbivAh6JiG8Ute+djl8AnAA8NNprmVVSV3cvC5etZVNfPwCTJrbytrbWbcfFPOXVxoI8ieIi4HZgX0lLgLnAH1fg2nOBPyKpHbU6bfsycJKkWSSJ6Ung8xW4lllFdHX3Mv/6NfRvfbOiwUub+2mZIFonaFC7p7zaWJFn1tMdkh4A5pB0OX0hIp4f7YUj4j8o3YV162jf26xaFi1fNygZFAxsDd46sZWJO+/kKa825uS5owDYBXgpPf89koiIe6oXllljyhpz2LS5n+6/PKKG0ZjVRp6Ni/4a+BSwFtiaNgfgRGHjzpT2tpIF/QrPmY1Fee4o5gEzI2LUA9hmzW7+kTO3G6MAaG2RxyNszMozPfZxoLXagZg1g3mzO1j0iYNob3vzV2LSxFYWfdw7z9nYleeOYjOwWtJdFE2LjYizqxaVWQNz+Q0bb/IkimXpl5mZjUNlE4Wkt0bEKxFxTYnnplY3LDMzaxRZYxR3Fx6k3U7FuqoSjZmZNZysRFG8GG7oRkUVqfVkZmaNLytRRJnHpY7NzGyMyhrM3lPSuSR3D4XHpMcu12pmNk5kJYorgd1LPAb4p6pFZGZmDaVsooiIr9QyEDMza0x5Vmabmdk45kRhZmaZnCjMzCxT1srsc8s9B1C8falZPXR197Jo+TpvFGRWZVmzngqznGYCv8ub9Z6OxXtRWJ0N3ZK0d1Mf869fA+BkYVZhw856kvQT4L0R8Wp6vBC4vibRmZWxcNna7faE6N8aLFy21onCrMLyjFFMBd4oOn4DmFaVaIpIOkrSOkmPSjq/2tez5rKpr39E7Wa24/KUGf8e8AtJN5GU7jgB+JdqBiWpBfh74HBgPXC/pGUR8XA1r2tmZtsb9o4iIi4BTgNeAjYBp0XE16oc18HAoxHxeES8AfwQOL7K17QmMmli6U0Xy7Wb2Y7LOz12IvBKRPwNsF7SflWMCaADeLroeH3aZgbARcceQGvL4CLGrS3iomMPqFNEZmPXsF1Pki4COklmP32XZP/s7wNzqxhXqTLmg0YuJZ0OnA4wdar3URpvCgPWnh5rVn15xihOAGYDDwBExAZJu2e/ZNTWA/sWHe8DbCg+ISIWA4sBOjs7XfZ8jBjJ2gjvXW1WG3kSxRsREZICQNKuVY4J4H5gRtrF1QucCPxhDa5rddTV3cuCpT309Q8AydqIBUt7AK+NMKunPGMU10n6R6Bd0ueAO6lymfGI2AKcBSwHHgGui4i11bym1d+i5eu2JYmCvv4BFi1fV6eIzAxy3FFExNclHQ68QjJO8ZcRcUe1A4uIW4Fbq30daxwbNvWNqN3MaiPPYPZfR8R5wB0l2swqZkp7G70lksKU9rY6RGNmBXm6ng4v0XZ0pQMxm3/kTNpaWwa1tbW2MP/ImXWKyMwgu3rs/wX+BJgu6cGip3YH/rPagdnYkmc2k6e8mjUmRZSeWSrpbcAk4FKguNbSqxHxYg1iy62zszNWrlxZ7zCsjAu7eliy4qlBC2HaWlu49GMHOgmY1ZGkVRHROdx5ZbueIuLliHgS+BvgxYj4VUT8CuiXdEjlQrWxrKu7d7skAZ7NZNZM8qyj+A7w3qLj10q0mQ1S6GoqNThd4NlMZs0hT6JQFPVPRcRWSXleZ+PU0IVz5Xg2k1lzyDPr6XFJZ0tqTb++ADxe7cCseZVaODeUwLOZzJpEnkRxBvABklIa64FDSIvxmZUyXJeSgJPnTPVAtlmTyLMy+zmSWktmuZRbOAfQ4SmvZk0nax3FlyLickl/C9tNWiEizq5qZNa05h85c7sxCk+HNWteWXcUj6TfvUDBBhlu8ZwXzpmNLWUX3DUTL7irnVIzmny3YNac8i64y+p6upkSXU4FEXHcDsZmTSyrFLgThdnYlNX19PX0+8eA3yLZ/hTgJODJKsZkdVbctdQ+sZUIeLmvP3OQ2ovnzMausokiIn4GIOmrEfH7RU/dLOmeqkdmdTG0a+mlzf3bnuvd1IcofZvpxXNmY1eedRSTJe1fOEi3J51cvZCsnoZbLBck6yCKuRS42diWpxTHnwF3Syqsxp4GfL5qEVld5elCCpL1EJ7RZDY+5Flwd7ukGcC706ZfRsTr1Q3L6iVrHKKgo72Ne8//cI0iMrN6G7brSdJEYD5wVkSsAaZK+uhoLippkaRfSnpQ0k2S2tP2aZL6JK1Ov/5hNNexkSu1y1wxdzOZjT95xii+C7wBvD89Xg9cPMrr3gH8dkT8DvBfwIKi5x6LiFnp1xmjvI6N0LzZHVz6sQPpaG9DwKSJrbS3tSKSOwmvlzAbf/KMUUyPiE9JOgkgIvokDR3PHJGI+EnR4Qrg46N5P6usebM7nAzMbJs8dxRvSGojnRUpaTpQyTGKzwC3FR3vJ6lb0s8k/V4Fr2NmZjsgzx3FRcDtwL6SlgBzgT8e7kWS7iRZqDfUBRHx4/ScC4AtwJL0uWeAqRHxgqT3AV2SDoiIV0q8/+mk5c6nTp2a459hw9VoMjMrJbPWU9rFtA+wGZhDMoV+RUQ8P+oLS6eS7HVxWERsLnPO3cCfR0RmISfXehqeazSZ2VB5az1ldj2lW6B2RcQLEfGvEXFLhZLEUcB5wHHFSULSZEkt6eP9gRl4N72KyKrRZGaWJc8YxQpJv1vh6/4dsDtwx5BpsL8PPChpDXADcEZEvFjha49L5RbSuUaTmQ0nzxjFocAZkp4EXiPpfop0ausOiYh3lmm/EbhxR9/Xyiu3kM41msxsOHkSxdFVj8Iq4sKuHq6972kGImiROOmQfbl43oFA+V3nvHjOzIaTtR/FLiSDze8EeoCrImJLrQKzkTn5yp9z72Nv9tINRPD9FU8BcPG8A73rnJntsLKzniT9COgH/p3kruJXEfGFGsaW23ie9dTV3cvCZWvZ1Ndf8vkWiccuPabGUZlZMxj1DnfAeyLiwPTNrgJ+UangrDJKTXkdamAMbHVrZvWVNetp25+o7nJqTMPtHQHJHYWZ2Whk3VEcJKmwIlpAW3pcmPX01qpHZ4MMXVk9XDlwgJMO2bcGkZnZWJa1FWr5WtNWc0O7mbK2JS2YO32PbbOezMx2VJ7psVZHhbuIUncPhW1JhyaLSRNbuejYAzyjycwqwomigeUZrPa2pGZWbU4UDSzPYLW3JTWzastT68nqZLg6TF5ZbWa14DuKBlBun4ismU0d7mYysxpxoqizoaU3ejf18cXr1wDl6zN5DwkzqyV3PdXRhV09g5JEwcDW4IKbepg3u4NLP3YgHe1tiOQuwknCzGrNdxR1dO19T5d97rU3kruIebM7nBjMrK58R1FHrsNkZs3AdxQ1UmrAukUqmyxcocnMGoXvKGqgsHCud1MfQTJgvWBpD3P2n1T2NSfPmVq7AM3MMjhR1ECphXN9/QM8+UIfp8yZut3dwylzprpGk5k1jLp0PUlaCHwO2Jg2fTkibk2fWwB8FhgAzo6I5fWIsZLKLZzbsKmPi+cd6KRgZg2tnmMU34yIrxc3SHoPcCJwADAFuFPSuyIiu45FAym1b3W5hXNT2tvqEKGZ2cg0WtfT8cAPI+L1iHgCeBQ4uM4x5XbylT/n+yue2jZAXdi3etrb22hrHVy13eU3zKxZ1DNRnCXpQUlXSyqM6nYAxYsL1qdt25F0uqSVklZu3Lix1Ck11dXdW3LxHMCKx1/ywjkza1pV63qSdCfwWyWeugD4DvBVkirZXwWuAD5D6VmhJeePRsRiYDFAZ2dn3RckLFq+ruxzAxFeOGdmTatqiSIiPpLnPElXArekh+uB4r079wE2VDi0qsiq9Op9q82smdWl60nS3kWHJwAPpY+XASdKeouk/YAZwC9qHd+OyBqY9r7VZtbM6jVGcbmkHkkPAocCfwYQEWuB64CHgduBM5tlxtP8I2duN2AN3rfazJpfXabHRsQfZTx3CXBJDcOpiML4Q6l9JczMmplrPVWQB6zNbCxyoiij3K5zZmbjjRPFEF3dvXzl5rW8tLl/W1uhiB/gZGFm406jrcyuq0KV1+IkUdDXP5C5VsLMbKxyoihSqsprsay1EmZmY5UTRZHhEoGL+JnZeDSuxyiGDli3T2wt2e0ELuJnZuPXuE0UhfGIQldT76Y+WieI1hbRPzC4dFR7WysLjzvAA9lmNi6N20RRajyif2vQ3tbKrm/ZydNizcxS4zZRlBuPeLmvn9UXHVHjaMzMGte4HcwuNzDtAWszs8HGbaIoVcTPA9ZmZtsbt11PLuJnZpbPuE0U4CJ+ZmZ5jNuuJzMzy8eJwszMMjlRmJlZJicKMzPL5ERhZmaZFBHDn9XgJG0EflXvOEp4B/B8vYPIybFWR7PE2ixxgmOtpP8VEZOHO2lMJIpGJWllRHTWO448HGt1NEuszRInONZ6cNeTmZllcqIwM7NMThTVtbjeAYyAY62OZom1WeIEx1pzHqMwM7NMvqMwM7NMThQVJmmhpF5Jq9OvY4qeWyDpUUnrJB1ZzzjTeBZJ+qWkByXdJKk9bZ8mqa/o3/AP9Y4VQNJR6Wf3qKTz6x1PMUn7SvqppEckrZX0hbS97M9DPUl6UlJPGtPKtG0PSXdI+u/0+6QGiHNm0We3WtIrks5plM9V0tWSnpP0UFFbyc9RiW+nP78PSnpvPWLeEe56qjBJC4HfRMTXh7S/B7gWOBiYAtwJvCsiBrZ7kxqRdATwbxGxRdJfA0TEeZKmAbdExG/XK7ahJLUA/wUcDqwH7gdOioiH6xpYStLewN4R8YCk3YFVwDzgk5T4eag3SU8CnRHxfFHb5cCLEXFZmognRcR59YpxqPRnoBc4BDiNBvhcJf0+8BvgXwq/L+U+xzSZ/SlwDMm/4W8i4pB6xT4SvqOoneOBH0bE6xHxBPAoSdKom4j4SURsSQ9XAPvUM55hHAw8GhGPR8QbwA9JPtOGEBHPRMQD6eNXgUeAZqthfzxwTfr4GpJE10gOAx6LiIZZXBsR9wAvDmku9zkeT5JQIiJWAO3pHxgNz4miOs5Kby2vLrp97wCeLjpnPY31H8lngNuKjveT1C3pZ5J+r15BFWn0z2+b9I5sNnBf2lTq56HeAviJpFWSTk/b9oqIZyBJfMCedYuutBNJ7soLGvFzhfKfY9P8DA/lRLEDJN0p6aESX8cD3wGmA7OAZ4ArCi8r8VZV7/cbJtbCORcAW4AladMzwNSImA2cC/xA0lurHesw6vL5jZSk3YAbgXMi4hXK/zzU29yIeC9wNHBm2oXSsCTtDBwHXJ82NernmqUpfoZLGdc73O2oiPhInvMkXQnckh6uB/YtenofYEOFQ9vOcLFKOhX4KHBYpANWEfE68Hr6eJWkx4B3ASurHG6Wunx+IyGplSRJLImIpQAR8WzR88U/D3UVERvS789Juomka+9ZSXtHxDNpl8hzdQ1ysKOBBwqfZ6N+rqlyn2PD/wyX4zuKChvS53gCUJgNsQw4UdJbJO0HzAB+Uev4ikk6CjgPOC4iNhe1T04HDpG0P0msj9cnym3uB2ZI2i/96/JEks+0IUgScBXwSER8o6i93M9D3UjaNR1wR9KuwBEkcS0DTk1POxX4cX0iLOkkirqdGvFzLVLuc1wGfDqd/TQHeLnQRdXoPOupwiR9j+R2OIAngc8XfhjSLp7PkHTznBMRt5V7n1qQ9CjwFuCFtGlFRJwh6X8Df0US5wBwUUTcXKcwt0lnjXwLaAGujohL6hzSNpI+CPw70ANsTZu/TPIfXMmfh3pJk/9N6eFOwA8i4hJJbweuA6YCTwGfiIihA7U1J2kiSd/+/hHxctpW9vesxrFdC3yIpErss8BFQBclPsf0j4m/A44CNgOnRUQ979Jzc6IwM7NM7noyM7NMThRmZpbJicJ9FqwqAAAEhUlEQVTMzDI5UZiZWSYnCjMzy+REYXUn6e1FVUB/PaQq6M4VvM5HJL2swdVID63U+5e5Zoukf6/Qe/2dpA+kj9crrfZb9PxOkjYNabtD0m+N8rrvlLQ6fTxL0j+N5v2s+XhlttVdRLxAMic+q/quSKZzb93+HUbkpxFR0WJ3knYqKq44SFodeNS1siRNBmZHxFkjeM2uwO4R8evRXr8gIlZLmi6pIyJ6K/W+1th8R2ENK/1L9iEl+2E8AOxb/BezpBMLf91K2kvSUkkrJf0iXfk60utcpWQvidsk7ZI+N0PS8rR43j2S3pW2f1/SFZJ+CnxN0p6S7pL0gKT/l94VtQ/9K1/S+Wl8D0r6y7Rt9/Saa9I4Pl4izE8wuGhj4f0mSvqJpNNKvObDwL+l562XdImkFZLul/Te9HWPSfpces4ESd9IY+gpEwck5TI+lffztebnRGGN7j3AVWmBwqy/YL8NXB4RnSR7QJTrHjl0SNfTtLR9JvCtiDgA6OPN0tCLgT+JiPcBC0hW1hZMJ6mR9SWSley3p4X2biXZc2SQdGX5VJK9CGYBH0i7ko4BnoyIg9I9De4oEfdckj0uiu1O8p/2NRHx3RKvORq4vej4yYiYQ1JS/iqS0hcfAL6aPv8Jks/7IJJ9P74pqVQF2ZVU4C7Jmoe7nqzRPRYR9+c47yPAzKSHCoBJktoiom/Iedt1PUl6J8leFz1p0ypgWjoGMAe4seh9i39nri/qCvsgcAlARNwi6dUSMR5B8p93d3q8G0mxxfuAyyRdBtwcEfeWeO3ewMYhbbcAX4uIH5U4nzT2s4uOC7WxeoCdIuI14DVJW5VUvf0gSTmPAeDXkv4D6CTZMKrYc5RIhDZ2OVFYo3ut6PFWBpdq3qXosYCD002NdsTrRY8HSH43BDwfEbNyxFaqhPRQAi6OiKu2e0LqJLmzWCTploj42pBT+hj87wW4Fzha0nWFyr9F7zcTeGLI2Enh37iVwf/erbz5781jlzQeGyfc9WRNI/3r/aV03GACSddJwZ3AmYUDSeX+cx/J9V4CnpF0QvqeEyQdVOb0/yDp8ip0Me1e4pzlwGfTQWYk7SPpHZI6SAbwvwd8Ayi1l/IjwDuHtH2ZJFl9u8T5R1FiTGMY95BUOG6RtBdJd1eponXvorGqtVqVOVFYszmPpN/9LpL6/gVnAnPTQeKHgc+Vef3QMYoTypxXcCJwhqQ1wFqSvTtKuQj4A0kPkAwiP8vgOw4i4lbgBmCFpB6SCqO7kYwJ3J9OQf0SMPRuAuBfSaqUDnUW8DZJXyO5KyjcKRzF4PGJPG4AfgmsIUm850ZEqT0pDk3jsXHC1WPNKiCdJbUlIrYoKTn+rXRgvVLvL5K7lqPTnfNKnfM+4G9JEtU9EVHxPdkltQE/Jdkhb6DS72+NyYnCrAIkvZtkY50Wkr/qz4iIobOURnuN9wOvRsR23T6SziS5qzo7Iu6s5HWHXGcmyZ7Q91TrGtZ4nCjMzCyTxyjMzCyTE4WZmWVyojAzs0xOFGZmlsmJwszMMjlRmJlZpv8PHve8+xg2OpEAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "idx_train = np.arange(0,80)\n", + "idx_train = np.arange(0,100)\n", "\n", "estimator.fit(idx_train)\n", "\n", - "idx_test = np.arange(80, 100)\n", + "score = estimator.score(idx_train)\n", "\n", - "score = estimator.score(idx_test)\n", + "print(\"The RMSE is %s kJ/mol\" % (str(score)) )\n", "\n", - "print(\"The RMSE is %d kJ/mol\" % (score) )\n", + "energies_predict = estimator.predict(idx_train)\n", "\n", - "energies_predict = estimator.predict(idx_test)" + "plt.scatter(energies, energies_predict)\n", + "plt.xlabel(\"True Energies (kJ/mol)\")\n", + "plt.ylabel(\"Predicted Energies (kJ/mol)\")\n", + "plt.show()" ] }, { @@ -668,27 +695,42 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "The RMSE is 43 kJ/mol\n" + "The RMSE is 0.06417642021013359 kJ/mol\n" ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYoAAAEKCAYAAAAMzhLIAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3X2UXXV97/H3J8MgE4gOaKQkkBuIMS4xJdEppEa7RIQAVQheUShUil6RWygiNkKEFqyglIhY215bKFgqiPIQRqBADFSkpQaZMCHDg2l5iJAJQkACCFOYTL73j71PODM5Z8+emfM483mtNWvO/p19zv7OWZN8Z/8evj9FBGZmZuVMqncAZmbW2JwozMwskxOFmZllcqIwM7NMThRmZpbJicLMzDI5UZiZWSYnCjMzy+REYWZmmXaodwCV8La3vS1mzpxZ7zDMzJrK6tWrn4uIqcOdNy4SxcyZM+nq6qp3GGZmTUXSr/Kc564nMzPL5ERhZmaZnCjMzCyTE4WZmWVyojAzs0zjYtaTmdlE09ndy7IV69i4uY9p7W0sWTSHxfOnV+VaThRmZk2ms7uXpct76OsfAKB3cx9Ll/cAVCVZuOvJzKzJLFuxbluSKOjrH2DZinVVuZ4ThZlZk9m4uW9E7WPlRGFm1mSmtbeNqH2snCjMzJrMkkVzaGttGdTW1trCkkVzqnI9D2abmTWZwoC1Zz2ZmVlZi+dPr1piGMpdT2ZmlsmJwszMMjlRmJlZJicKMzPL5ERhZmaZnCjMzCyTE4WZmWVyojAzs0w1SRSSrpD0rKQHi9p2k7RS0n+n33dN2yXpO5IelbRW0ntrEaOZmZVWqzuKfwYOHdJ2FnBnRMwG7kyPAQ4DZqdfJwHfrVGMZmZWQk0SRUTcDfxmSPORwJXp4yuBxUXt/xKJVUC7pD1qEaeZmW2vnmMUu0fE0wDp97en7dOBp4rO25C2DSLpJEldkro2bdpU9WDNzCaqRhzMVom22K4h4tKI6IiIjqlTp9YgLDOziameieKZQpdS+v3ZtH0DsFfReXsCG2scm5mZpeqZKG4CTkgfnwD8uKj90+nspwXAi4UuKjMzq72a7Ech6RrgQ8DbJG0AzgUuBK6V9FngSeDo9PRbgcOBR4FXgRNrEaOZmZVWk0QREceWeeqgEucGcEp1IzIzs7wacTDbzMwaiBOFmZllcqIwM7NMThRmZpbJicLMzDI5UZiZWSYnCjMzy+REYWZmmZwozMwskxOFmZllcqIwM7NMw9Z6ktQBfBCYBvQBDwJ3RMTQHevMzBpKZ3cvy1asY+PmPqa1t7Fk0RwWz99uHzQbRtlEIelPgNOAJ4DVwDpgJ+ADwJmSHgT+IiKerEGcZmYjck5nD1evenLbrme9m/tYurwHwMlihLLuKHYGFkZEX6knJc0DZpOUCDczaxid3b2DkkRBX/8Ay1asc6IYobKJIiL+PuuFEbGm8uGYmY3dshXrtt8/ObVxc8m/fS1DVtfTd7JeGBGnVT4cM7ORKTUOkZUMprW31TC68SGr62l1zaIwMxuFzu5eli7voa9/AHhjHKJ9cisvvNq/3fkCliyaU+Mom19W19OVxceSpiTN8duqR2VmNozO7l6+dO0DDMTgTqa+/gHetMMk2lpbtiUQSJLEcQtmeHxiFIZdRyHpPZK6SabFPixptaR9x3phSXMkrSn6eknS6ZLOk9Rb1H74WK9lZuNL4U5iaJIoeLGvn298fC7T29sQML29jUs+NY/zF8+tbaDjRJ49sy8FzoiInwJI+hBwGfD+sVw4ItYB89L3bAF6gRuBE4FLIuKbY3l/Mxt/CuMRvcMMSE9rb2Px/Om+e6iQPIli50KSAIiIuyTtXOE4DgIei4hfSarwW5vZeDB0PKKcttYWj0NUWJ5E8bikvwC+nx4fT7IIr5KOAa4pOj5V0qeBLuBLEfFCha9nZk0i710EQIvENz4+13cSFZan1tNngKnAcpKuoakk3UMVIWlH4AjgurTpu8Askm6pp4GLy7zuJEldkro2bdpUqXDMrIGc09nDF3+0JleSaGtt4eJP7uckUQWKMoNBNQtAOhI4JSIOKfHcTOCWiHhP1nt0dHREV1dXdQI0s5oayR1EwXTXcRoVSasjomO48/IWBfwKMLP4/Ij43bEEWORYirqdJO0REU+nh0eRzLYyswnguMt+zj2P5a832tba4q6mGsgzRnE1sAToAbZW8uKSJgMHA58var4orSMVwPohz5nZONTZ3cuXr3+A1wfy93D4LqJ28iSKTRFxUzUuHhGvAm8d0vbH1biWmTWmzu5ellz/AP05k4SASz41zwmihvIkinMl/RNwJ/BaoTEillctKjMb15Kprmvp6x9ZJ4VXV9dHnkRxIvAuoJU3up6CZBaUmdmInNPZw1WrRr47gbua6idPotgvIrzu3czGrLO7d8RJYuGs3bj6c79fpYgsjzzrKFZJenfVIzGzcW/ZinUjOt9JojHkuaP4AHCCpCdIxihEUkW2UtNjzWyCyLtpUHtbK+cdsa+7mRpEnkRxaNWjMLNxaeimQuX2iSg4fsEMV3htQFk73HUB9wC3AXdFxP/ULCoza3qlNhVqnSRaJomBrdtPhV04azcniQaVdUexgKTb6VDgq5KeB1YAt0XEf9UiODNrLsV3EJOk7faL6N8atLe1IrHtzsLdTI0va4e7LcBd6ReS9gAOA86XNBv4eUT8aQ1iNLMG19ndy1eWr+XVonURWZsKPXHhH9YqNKuAPGMUAKT1l64ArpA0CfBUBDOjs7uXM65dQ4nepJKmtbdVNyCruKwxiptJFtaV8hrwmKQnI+KpqkRmZg2ts7uXM29Yy2tb8q+u9qZCzSnrjiJrK9IdgH2Ba/GdhdmEM5K7iBaJrRFM88rqppU1RvEzAEnvi4jVxc9J+lhEfEeS11KYTSAjrfIq8GZC40CeMYrLJJ0QET0Ako4FTgdujoj/U9XozKxhHPytu/jvZ18Z0WtcwG98yJMoPgFcL+k4kumynwa2243OzMav4y77+YiThBfPjR/DJoqIeFzSMUAn8BRwSETk36PQzJrWOZ09XHPvU2WnupayY4u46BPubhpPsmY99TB41tNuQAtwr6RKboVqZg3oXWffyv+MYMc5cBG/8SrrjuKjNYvCzBpGZ3cvX/zRmrJz40txghjfshLF8xHx26wXS9pluHPMrHkU6jM5SVixrETxY0lrgB8DqyPiFQBJ+wAHAp8ELgOuH0sAktYDLwMDwJaI6JC0G/AjYCawHvhkRLwwluuYWWmd3b2cd9NDbO4rX9W1FCeIiSNrHcVBkg4HPg8slLQrsAVYB/wrcEJE/LpCcRwYEc8VHZ8F3BkRF0o6Kz0+s0LXMrNUZ3cvS657gP689TfwbKaJKHPWU0TcCtxao1iKHQl8KH18JUlhQicKswoazd7VO7XISWICyl0UsIoC+ImkAP4xIi4Fdk+LEBIRT0t6e10jNBtHRrNwDmD3KTty79kHVyEia3SNkCgWRsTGNBmslPTLPC+SdBJwEsCMGTOqGZ/ZuHHABSt55uXXc58/3fWZjAZIFBGxMf3+rKQbgf2BZyTtkd5N7AE8W+J1lwKXAnR0dIxssrfZBDPSGk2tLWKZF81ZatJwJ0iaJelN6eMPSTpNUnslLi5pZ0lTCo9JSoM8CNwEnJCedgLJzCszG4VzOns4/UdrcieJXSe3OknYIHnuKG4AOiS9A7ic5D/xHwCHV+D6uwM3SirE8oOIuF3SfcC1kj4LPAkcXYFrmU04Ix2w9owmKyVPotgaEVskHQV8OyL+VlJ3JS4eEY8D+5Vofx44qBLXMJuoOrt7udpJwiogT6LoT0uLnwB8LG1rrV5IZjZaoyni59lMNpw8ieJE4GTggoh4QtLewFXVDcvMRmo00169utryyFNm/GFJZwIz0uMngAurHZiZ5TfS/SJEsqmQu5osj2EThaSPkeyfvSOwt6R5wF9FxBHVDs7MyhtNjSYnCBuNPF1P55GsbbgLICLWpN1PZlYnI63RJGCaF8/ZKOVJFFsi4sV0CmuBF7iZ1UFndy/LVqyjd3P+TSY9m8nGKk+ieFDSHwEtkmYDpwH/Wd2wzGyoczp7uHrVkyP6K23223d2krAxG3ZlNvBnwL7Aa8A1wEvA6dUMyswGK6yJGOmGQivP+FC1QrIJJM+sp1eBs9MvM6uDZSvW5U4S7W2tnHfEvh6LsIopmygkfTsiTpd0MyXGJDzryax6RtPNtOvkVs79mBOEVV7WHcX30+/frEUgZpYYSX0mAZd8ap6Tg1VV1laoq9PvP6tdOGZ2zb1P5TqvsCbCScKqLc+Cux6273p6EegCzk8L+JlZhQxXp8lrIqzW8kyPvQ0YICktDnAMye/qi8A/80ahQDOrgBapbLKY3t7GPWd9uMYR2USXJ1EsjIiFRcc9ku6JiIWSjq9WYGbjWWHh3MbNfdvdHRx7wF4lxyhaJokli+bUOlSzXOsodpF0QOFA0v7ALunhlqpEZTaOdXb3snR5D72b+wigd3MfS5f30NndC8D5i+dy/IIZFNdC2HnHFi4+2rvOWX0ohusPlTqA7/FGcngZ+CzwMPCHEXFtVSPMoaOjI7q6uuodhlkuCy/8t5IlONytZLUmaXVEdAx3XmbXk6RJwD4RMVfSW0gSy+aiU+qeJMyazcYydZrKtZvVW2bXU0RsBU5NH784JEmY2ShMa28bUbtZveUZo1gp6c8l7SVpt8LXWC+cvt9PJT0i6SFJX0jbz5PUK2lN+nX4WK9l1kiWLJpDW2vLoLa21hYPVFvDyjPr6TPp91OK2gLYZ4zX3gJ8KSLulzQFWC1pZfrcJRHhFeE2LhUGpMvNejJrNHmKAlZlk6KIeBp4On38sqRHAP9LsaaVNeV1qMXzpzsxWNMYtutJ0mRJ50i6ND2eLemjlQxC0kxgPnBv2nSqpLWSrpC0ayWvZVYNw015NWtmecYovge8Drw/Pd4AnF+pACTtAtwAnB4RLwHfBWYB80juOC4u87qTJHVJ6tq0aVOlwjEblWUr1tHXPzCora9/gGUr1tUpIrPKyZMoZkXERUA/QET0waC1QKMmqZUkSVwdEcvT938mIgbSGVeXkezXvZ2IuDQiOiKiY+rUqZUIx2zUPOXVxrM8g9mvS2ojLQwoaRbJbndjomQT7suBRyLiW0Xte6TjFwBHAQ+O9VpmldTZ3ct5Nz3E5r5+INkH4i1trduOi3nKq40HeRLFucDtwF6SrgYWAn9SgWsvBP6YpHbUmrTtK8CxkuaRJKb1wOcrcC2ziujs7mXJdQ/Qv/WNigYvvNpPyyTROkmD2j3l1caLPLOeVkq6H1hA0uX0hYh4bqwXjoj/oHQX1q1jfW+zalm2Yt2gZFAwsDV48+RWJu+4g6e82riT544CYCfghfT8d0siIu6uXlhmjSlrzGHzq/10/+UhNYzGrDbybFz018CngIeArWlzAE4UNuFMa28rWdCv8JzZeJTnjmIxMCcixjyAbdbsliyas90YBUBri/eKsPErz/TYx4HWagdi1gwWz5/OsqP3o73tjX8Su05uZdknvFeEjV957iheBdZIupOiabERcVrVojJrYC6/YRNNnkRxU/plZmYTUNlEIenNEfFSRFxZ4rkZ1Q3LzMwaRdYYxV2FB2m3U7HOqkRjZmYNJytRFC+GG7pRUUVqPZmZWePLShRR5nGpYzMzG6eyBrPfLukMkruHwmPSY5drNTObILISxWXAlBKPAf6pahGZmVlDKZsoIuKrtQzEzMwaU56V2WZmNoE5UZiZWSYnCjMzy5S1MvuMcs8BFG9famZm41fWrKfCLKc5wO/xRr2nj+G9KKwBnNPZwzX3PsVABC0Sxx6wF+cvnlvvsMzGnWFnPUn6CfDeiHg5PT4PuK4m0ZmVcU5nD1etenLb8UDEtmMnC7PKyjNGMQN4vej4dWBmVaIpIulQSeskPSrprGpfz5rLNfc+NaJ2Mxu9PGXGvw/8QtKNJKU7jgL+pZpBSWoB/h44GNgA3Cfppoh4uJrXteYxEKWryJRrN7PRGzZRRMQFkm4DPpg2nRgR3dUNi/2BRyPicQBJPwSOBJwoDIAWqWRSaJHrVZpVWt7psZOBlyLib4ANkvauYkwA04HiPoQNaZsZAMcesNeI2s1s9IZNFJLOBc4ElqZNrcBV1QyK0mXMB/35KOkkSV2SujZt2lTlcKzRnL94LscvmLHtDqJF4vgFMzyQbVYFecYojgLmA/cDRMRGSVOyXzJmG4DiPw33BDYWnxARlwKXAnR0dLhjepzo7O5l2Yp1bNzcx7T2NpYsmlN2f+rzF891YjCrgTyJ4vWICEkBIGnnKscEcB8wO+3i6gWOAf6oBte1Ours7mXp8h76+gcA6N3cx9LlPQBlk4WZVV+eMYprJf0j0C7pc8AdVLnMeERsAU4FVgCPANdGxEPVvKbV37IV67YliYK+/gGWrVhXp4jMDPLNevqmpIOBl0hWaf9lRKysdmARcStwa7WvY41j4+a+EbWbWW0Mmygk/XVEnAmsLNFmVjHT2tvoLZEUprW31SEaMyvI0/V0cIm2wyodiNmSRXNoa20Z1NbW2sKSRXPqFJGZQXb12P8L/CkwS9LaoqemAP9Z7cBsfMkzm6lwnHfWk5nVhqJMyQNJbwF2Bb4BFNdaejkiflOD2HLr6OiIrq6ueodhZZzT2cPVq54ctBCmrbWFb3x8rpOAWR1JWh0RHcOdV7brKSJejIj1wN8Av4mIX0XEr4B+SQdULlQbzzq7e7dLEuDZTGbNJM86iu8C7y06fqVEm9kgha6mUoPTBZ7NZNYc8iQKRVH/VERslZTndTZBDV04V45nM5k1hzyznh6XdJqk1vTrC8Dj1Q7MmlephXNDCTybyaxJ5EkUJwPvJymlsQE4ADipmkFZcxuuS0nAcQtmeCDbrEnkWZn9LEmtJbNcyi2cA5juKa9mTSdrHcWXI+IiSX8L201aISJOq2pk1rSWLJqz3RiFp8OaNa+sO4pH0u9eoGCDDLd4zgvnzMaXsgvumokX3NVOqRlNvlswa055F9xldT3dTIkup4KIOGKUsVkTyyoF7kRhNj5ldT19M/3+ceB3eGP702OB9VWMyRrI0G6mcoPUXjxnNn6VTRQR8TMASV+LiD8oeupmSXdXPTKru1I7zonSt5lePGc2fuVZYT1V0j4R8ThAuj3p1OqGZfWUVX4jYLtk4VLgZuNbnkTxReAuSYXV2DOBz1ctIqurPOU3gmQ9hGc0mU0MeRbc3S5pNvCutOmXEfFadcOyeslTfmN6exv3nPXhGkVkZvU2bAkPSZOBJcCpEfEAMEPSR8dyUUnLJP1S0lpJN0pqT9tnSuqTtCb9+oexXMdGbrhBaXczmU08eWo9fQ94Hfj99HgDcP4Yr7sSeE9E/C7wX8DSoucei4h56dfJY7yOjVDWoPT09javlzCbgPIkilkRcRHQDxARfSTjmaMWET+JiC3p4Spgz7G8n1VOuX2rv/2pedxz1oedJMwmoDyJ4nVJbaQTXSTNAio5RvEZ4Lai470ldUv6maQPVvA6lsPi+dP5xsfnMr29DeG7CDPLN+vpXOB2YC9JVwMLgT8Z7kWS7iBZqDfU2RHx4/Scs4EtwNXpc08DMyLieUnvAzol7RsRL5V4/5NIy53PmDEjx49hw9VoKlg8f7oTg5ltk1nrSZJIuoVeBRaQdDmtiojnxnxh6QSSvS4OiohXy5xzF/DnEZFZyMm1nobnGk1mNlTeWk+ZXU/pFqidEfF8RPxrRNxSoSRxKHAmcERxkpA0VVJL+ngfYDbeTa8ismo0mZllyTNGsUrS71X4un8HTAFWDpkG+wfAWkkPANcDJ0fEbyp87Qmp3LRX12gys+HkGaM4EDhZ0nrgFdIKDunU1lGJiHeUab8BuGG072vllSvo5xpNZjacPInisKpHYRVxTmcP19z7FAMRtEgce8BenL94LlB+1zkvnjOz4WTtR7ETyWDzO4Ae4PKitQ/WYI677Ofc89gbvXQDEVy16kkAzl8817vOmdmolZ31JOlHJIvs/p3kruJXEfGFGsaW20Se9dTZ3ct5Nz3E5r7+ks+3SDz2jcNrHJWZNYMx73AHvDsi5qZvdjnwi0oFZ5WRp9LrwDjY6tbM6itr1tO2P1Hd5dSY8lR6bdGYqq2YmWXeUewnqbAiWkBbelyY9fTmqkdng+TdlrTYsQfsVYPIzGw8y9oKtaXcc1Z7I9mWtGDhrN22zXoyMxutPNNjrY5Gui0pwK6TWzn3Y/t6RpOZVYQTRQPztqRm1gicKBqYtyU1s0aQp9aT1Ym3JTWzRuA7igZQbp+IrJlN093NZGY14kRRZ0NLb/Ru7uNL1z0AlK/P5D0kzKyW3PVUR+d09gxKEgUDW4Ozb+zxtqRm1hB8R1FH19z7VNnnXnk9uYvwtqRmVm++o6gj12Eys2bgO4oaKTVg3SKVTRau0GRmjcJ3FDVQWDjXu7mPIBmwXrq8hwX77Fr2NcctmFG7AM3MMjhR1ECphXN9/QOsf76P4xfM2O7u4fgFM1yjycwaRl26niSdB3wO2JQ2fSUibk2fWwp8FhgATouIFfWIsZLKLZzbuLmP8xfPdVIws4ZWzzGKSyLim8UNkt4NHAPsC0wD7pD0zojIrmPRQErtW11u4dy09rY6RGhmNjKN1vV0JPDDiHgtIp4AHgX2r3NMuR132c+5atWT2waoC/tWz3xrG22tg6u2u/yGmTWLeiaKUyWtlXSFpMKo7nSgeHHBhrRtO5JOktQlqWvTpk2lTqmpzu7ekovnAFY9/oIXzplZ06pa15OkO4DfKfHU2cB3ga+RVMn+GnAx8BlKzwotOX80Ii4FLgXo6Oio+4KEZSvWlX1uIMIL58ysaVUtUUTER/KcJ+ky4Jb0cANQvHfnnsDGCodWFVmVXr1vtZk1s7p0PUnao+jwKODB9PFNwDGS3iRpb2A28ItaxzcaWQPT3rfazJpZvcYoLpLUI2ktcCDwRYCIeAi4FngYuB04pVlmPC1ZNGe7AWvwvtVm1vzqMj02Iv4447kLgAtqGE5FFMYfSu0rYWbWzFzrqYI8YG1m45ETRRnldp0zM5tonCiG6Ozu5as3P8QLr/ZvaysU8QOcLMxswmm0ldl1VajyWpwkCvr6BzLXSpiZjVdOFEVKVXktlrVWwsxsvHKiKDJcInARPzObiCb0GMXQAev2ya0lu53ARfzMbOKasImiMB5R6Grq3dxH6yTR2iL6BwaXjmpva+W8I/b1QLaZTUgTNlGUGo/o3xq0t7Wy85t28LRYM7PUhE0U5cYjXuzrZ825h9Q4GjOzxjVhB7PLDUx7wNrMbLAJmyhKFfHzgLWZ2fYmbNeTi/iZmeUzYRMFuIifmVkeE7bryczM8nGiMDOzTE4UZmaWyYnCzMwyOVGYmVkmRcTwZzU4SZuAX9U7jhLeBjxX7yBycqzV0SyxNkuc4Fgr6X9FxNThThoXiaJRSeqKiI56x5GHY62OZom1WeIEx1oP7noyM7NMThRmZpbJiaK6Lq13ACPgWKujWWJtljjBsdacxyjMzCyT7yjMzCyTE0WFSTpPUq+kNenX4UXPLZX0qKR1khbVM840nmWSfilpraQbJbWn7TMl9RX9DP9Q71gBJB2afnaPSjqr3vEUk7SXpJ9KekTSQ5K+kLaX/X2oJ0nrJfWkMXWlbbtJWinpv9PvuzZAnHOKPrs1kl6SdHqjfK6SrpD0rKQHi9pKfo5KfCf9/V0r6b31iHk03PVUYZLOA34bEd8c0v5u4Bpgf2AacAfwzogY2O5NakTSIcC/RcQWSX8NEBFnSpoJ3BIR76lXbENJagH+CzgY2ADcBxwbEQ/XNbCUpD2APSLifklTgNXAYuCTlPh9qDdJ64GOiHiuqO0i4DcRcWGaiHeNiDPrFeNQ6e9AL3AAcCIN8LlK+gPgt8C/FP69lPsc02T2Z8DhJD/D30TEAfWKfSR8R1E7RwI/jIjXIuIJ4FGSpFE3EfGTiNiSHq4C9qxnPMPYH3g0Ih6PiNeBH5J8pg0hIp6OiPvTxy8DjwDNVsP+SODK9PGVJImukRwEPBYRDbO4NiLuBn4zpLnc53gkSUKJiFgFtKd/YDQ8J4rqODW9tbyi6PZ9OvBU0TkbaKz/SD4D3FZ0vLekbkk/k/TBegVVpNE/v23SO7L5wL1pU6nfh3oL4CeSVks6KW3bPSKehiTxAW+vW3SlHUNyV17QiJ8rlP8cm+Z3eCgnilGQdIekB0t8HQl8F5gFzAOeBi4uvKzEW1W932+YWAvnnA1sAa5Om54GZkTEfOAM4AeS3lztWIdRl89vpCTtAtwAnB4RL1H+96HeFkbEe4HDgFPSLpSGJWlH4AjgurSpUT/XLE3xO1zKhN7hbrQi4iN5zpN0GXBLergB2Kvo6T2BjRUObTvDxSrpBOCjwEGRDlhFxGvAa+nj1ZIeA94JdFU53Cx1+fxGQlIrSZK4OiKWA0TEM0XPF/8+1FVEbEy/PyvpRpKuvWck7RERT6ddIs/WNcjBDgPuL3yejfq5psp9jg3/O1yO7ygqbEif41FAYTbETcAxkt4kaW9gNvCLWsdXTNKhwJnAERHxalH71HTgEEn7kMT6eH2i3OY+YLakvdO/Lo8h+UwbgiQBlwOPRMS3itrL/T7UjaSd0wF3JO0MHEIS103ACelpJwA/rk+EJR1LUbdTI36uRcp9jjcBn05nPy0AXix0UTU6z3qqMEnfJ7kdDmA98PnCL0PaxfMZkm6e0yPitnLvUwuSHgXeBDyfNq2KiJMl/W/gr0jiHADOjYib6xTmNumskW8DLcAVEXFBnUPaRtIHgH8HeoCtafNXSP6DK/n7UC9p8r8xPdwB+EFEXCDprcC1wAzgSeDoiBg6UFtzkiaT9O3vExEvpm1l/53VOLZrgA+RVIl9BjgX6KTE55j+MfF3wKHAq8CJEVHPu/TcnCjMzCyTu57MzCyTE4WZmWVyojAzs0xOFGZmlsmJwszMMjlRWN1JemtRFdBfD6kKumMFr/MRSS9qcDXSAyv1/mWu2SLp3yv0Xn8n6f3p4w1Kq/0WPb+DpM1D2lZK+p0xXvcdktakj+dJ+qexvJ+oUg23AAAEKklEQVQ1H6/MtrqLiOdJ5sRnVd8VyXTurdu/w4j8NCIqWuxO0g5FxRUHSasDj7lWlqSpwPyIOHUEr9kZmBIRvx7r9QsiYo2kWZKmR0Rvpd7XGpvvKKxhpX/JPqhkP4z7gb2K/2KWdEzhr1tJu0taLqlL0i/Sla8jvc7lSvaSuE3STulzsyWtSIvn3S3pnWn7VZIulvRT4OuS3i7pTkn3S/p/6V1R+9C/8iWdlca3VtJfpm1T0ms+kMbxiRJhHs3goo2F95ss6SeSTizxmg8D/5aet0HSBZJWSbpP0nvT1z0m6XPpOZMkfSuNoadMHJCUy/hU3s/Xmp8ThTW6dwOXpwUKs/6C/Q5wUUR0kOwBUa575MAhXU8z0/Y5wLcjYl+gjzdKQ18K/GlEvA9YSrKytmAWSY2sL5OsZL89LbR3K8meI4OkK8tnkOxFMA94f9qVdDiwPiL2S/c0WFki7oUke1wUm0Lyn/aVEfG9Eq85DLi96Hh9RCwgKSl/OUnpi/cDX0ufP5rk896PZN+PSySVqiDbRQXukqx5uOvJGt1jEXFfjvM+AsxJeqgA2FVSW0T0DTlvu64nSe8g2euiJ21aDcxMxwAWADcUvW/xv5nrirrCPgBcABARt0h6uUSMh5D8592dHu9CUmzxXuBCSRcCN0fEPSVeuwewaUjbLcDXI+JHJc4njf20ouNCbaweYIeIeAV4RdJWJVVvP0BSzmMA+LWk/wA6SDaMKvYsJRKhjV9OFNboXil6vJXBpZp3KnosYP90U6PReK3o8QDJvw0Bz0XEvByxlSohPZSA8yPi8u2ekDpI7iyWSbolIr4+5JQ+Bv+8APcAh0m6tlD5t+j95gBPDBk7KfyMWxn8827ljZ83j53SeGyCcNeTNY30r/cX0nGDSSRdJwV3AKcUDiSV+899JNd7AXha0lHpe06StF+Z0/+DpMur0MU0pcQ5K4DPpoPMSNpT0tskTScZwP8+8C2g1F7KjwDvGNL2FZJk9Z0S5x9KiTGNYdxNUuG4RdLuJN1dpYrWvZPGqtZqVeZEYc3mTJJ+9ztJ6vsXnAIsTAeJHwY+V+b1Q8cojipzXsExwMmSHgAeItm7o5RzgT+UdD/JIPIzDL7jICJuBa4HVknqIakwugvJmMB96RTULwND7yYA/pWkSulQpwJvkfR1kruCwp3CoQwen8jjeuCXwAMkifeMiCi1J8WBaTw2Qbh6rFkFpLOktkTEFiUlx7+dDqxX6v1FctdyWLpzXqlz3gf8LUmiujsiKr4nu6Q24KckO+QNVPr9rTE5UZhVgKR3kWys00LyV/3JETF0ltJYr/H7wMsRsV23j6RTSO6qTouIOyp53SHXmUOyJ/Td1bqGNR4nCjMzy+QxCjMzy+REYWZmmZwozMwskxOFmZllcqIwM7NMThRmZpbp/wO17cH4QILV3wAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "estimator = ARMP(hidden_layer_sizes=(40, 20, 10), scoring_function=\"rmse\")\n", + "estimator = ARMP(iterations=3000, learning_rate=0.075, l1_reg=0.0, l2_reg=0.0, scoring_function=\"rmse\")\n", "\n", "estimator.fit(x=representation, y=energies, classes=zs)\n", "\n", "score = estimator.score(x=representation, y=energies, classes=zs)\n", "\n", - "print(\"The RMSE is %d kJ/mol\" % (score) )\n", + "print(\"The RMSE is %s kJ/mol\" % (str(score)) )\n", + "\n", + "energies_predict = estimator.predict(x=representation, classes=zs)\n", "\n", - "energies_predict = estimator.predict(x=representation, classes=zs)" + "plt.scatter(energies, energies_predict)\n", + "plt.xlabel(\"True Energies (kJ/mol)\")\n", + "plt.ylabel(\"Predicted Energies (kJ/mol)\")\n", + "plt.show()" ] }, { @@ -738,7 +780,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.6.6" } }, "nbformat": 4, diff --git a/examples/ARMP_1.py b/examples/ARMP_1.py index a97bb8872..5c47fb58e 100644 --- a/examples/ARMP_1.py +++ b/examples/ARMP_1.py @@ -39,8 +39,9 @@ ## ------------- ** Setting up the estimator ** --------------- -estimator = ARMP(iterations=10, representation='acsf', representation_params={"radial_rs": np.arange(0, 10, 1), "angular_rs": np.arange(0.5, 10.5, 1), -"theta_s": np.arange(0, 5, 1)}, tensorboard=False) +acsf_params = {"nRs2": 5, "nRs3": 5, "nTs": 5, "rcut": 5, "acut": 5, "zeta": 220.127, "eta": 30.8065} +estimator = ARMP(iterations=5000, representation_name='acsf', representation_params=acsf_params, tensorboard=False, + learning_rate=0.075, l1_reg=0.0, l2_reg=0.0) estimator.generate_compounds(filenames) estimator.set_properties(energies) diff --git a/examples/ARMP_2.py b/examples/ARMP_2.py index 34162b7e1..8d9e9b342 100644 --- a/examples/ARMP_2.py +++ b/examples/ARMP_2.py @@ -39,7 +39,7 @@ ## ------------- ** Setting up the estimator ** --------------- -estimator = ARMP(iterations=100, l2_reg=0.0) +estimator = ARMP(iterations=3000, learning_rate=0.075, l1_reg=0.0, l2_reg=0.0, tensorboard=True, store_frequency=50) estimator.set_representations(representations=descriptor) estimator.set_classes(zs) diff --git a/examples/ARMP_3.py b/examples/ARMP_3.py index 2745cc562..c326b93cd 100644 --- a/examples/ARMP_3.py +++ b/examples/ARMP_3.py @@ -39,7 +39,7 @@ ## ------------- ** Setting up the estimator ** --------------- -estimator = ARMP(iterations=150, l2_reg=0.0, learning_rate=0.005, hidden_layer_sizes=(40, 20, 10)) +estimator = ARMP(iterations=3000, learning_rate=0.075, l1_reg=0.0, l2_reg=0.0, tensorboard=True, store_frequency=50) ## ------------- ** Fitting to the data ** --------------- diff --git a/examples/ARMP_qm7.py b/examples/ARMP_qm7.py new file mode 100644 index 000000000..ba9a2cea9 --- /dev/null +++ b/examples/ARMP_qm7.py @@ -0,0 +1,41 @@ +""" +This example shows how to use ARMP to overfit 100 data-points for the QM7 data set. It uses the Atom Centred Symmetry +functions as the representation. + +This example takes about 3.5 min to run on a mac. +""" + +from qml.aglaia.aglaia import ARMP +import glob +import numpy as np +import matplotlib.pyplot as plt +from sklearn import model_selection as modsel + +filenames = sorted(glob.glob("../test/qm7/*.xyz")) +energies = np.loadtxt("../test/data/hof_qm7.txt", usecols=[1]) +n_samples = len(filenames) +print("%i files were loaded." % (n_samples)) + +acsf_params = {"nRs2": 5, "nRs3": 5, "nTs": 5, "rcut": 5, "acut": 5, "zeta": 220.127, "eta": 30.8065} +estimator = ARMP(iterations=6000, representation_name='acsf', representation_params=acsf_params, l1_reg=0.0, l2_reg=0.0, + scoring_function="rmse", tensorboard=False, store_frequency=10, learning_rate=0.075) + +estimator.set_properties(energies[:100]) +estimator.generate_compounds(filenames[:100]) +estimator.generate_representation(method="fortran") +print("The shape of the representation is: %s" % (str(estimator.representation.shape))) + +idx = list(range(100)) + +idx_train, idx_test = modsel.train_test_split(idx, test_size=0, random_state=42, shuffle=True) + +estimator.fit(idx_train) + +score = estimator.score(idx_train) +print("The RMSE is %s kcal/mol." % (str(score))) + +ene_pred = estimator.predict(idx_train) + +# Plotting the predictions against the true values +plt.scatter(energies[idx_train], ene_pred) +plt.show() \ No newline at end of file diff --git a/examples/MRMP_1.py b/examples/MRMP_1.py index f91b9daeb..74449e87f 100644 --- a/examples/MRMP_1.py +++ b/examples/MRMP_1.py @@ -41,7 +41,7 @@ ## ------------- ** Setting up the estimator ** --------------- -estimator = MRMP(representation='slatm', representation_params={'slatm_dgrid2': 0.06, 'slatm_dgrid1': 0.06}) +estimator = MRMP(representation_name='slatm', representation_params={'slatm_dgrid2': 0.06, 'slatm_dgrid1': 0.06}) estimator.generate_compounds(filenames[:100]) estimator.set_properties(energies[:100]) diff --git a/examples/qmlearn.py b/examples/qmlearn.py index 84af9e7be..135ea4628 100644 --- a/examples/qmlearn.py +++ b/examples/qmlearn.py @@ -213,6 +213,64 @@ def pipelines(): print("*** End pipelines examples ***") print() +def pipelines_2(): + """ + Scikit learn pipeline with a molecular neural network + """ + + print("\n *** Begin pipelines example with molecular Neural Network ***") + + data = qmlearn.Data("../test/qm7/*.xyz") + energies = np.loadtxt("../test/data/hof_qm7.txt", usecols=1) + data.set_energies(energies) + + # Create model + model = sklearn.pipeline.make_pipeline( + qmlearn.preprocessing.AtomScaler(data), + qmlearn.representations.CoulombMatrix(), + qmlearn.models.NeuralNetwork(iterations=500, batch_size=50, learning_rate=0.005), + ) + + indices = np.arange(1000) + np.random.shuffle(indices) + + model.fit(indices[:100]) + + # Score on the TRAINING set, since you won't get good predictions in 500 iterations + scores = model.score(indices[:100]) + print("Negative MAE:", scores) + + print("*** End pipelines example with molecular Neural Network *** \n") + +def pipelines_3(): + """ + Scikit learn pipeline with an atomic neural network + """ + + print("\n *** Begin pipelines example with atomic Neural Network ***") + + data = qmlearn.Data("../test/qm7/*.xyz") + energies = np.loadtxt("../test/data/hof_qm7.txt", usecols=1) + data.set_energies(energies) + + # Create model + model = sklearn.pipeline.make_pipeline( + qmlearn.preprocessing.AtomScaler(data), + qmlearn.representations.AtomCenteredSymmetryFunctions(), + qmlearn.models.NeuralNetwork(iterations=500, batch_size=50, learning_rate=0.005), + ) + + indices = np.arange(1000) + np.random.shuffle(indices) + + model.fit(indices[:100]) + + # Score on the TRAINING set, since you won't get good predictions in 500 iterations + scores = model.score(indices[:100]) + print("Negative MAE:", scores) + + print("*** End pipelines example with atomic Neural Network *** \n") + def cross_validation(): """ Doing cross validation with qmlearn @@ -285,3 +343,5 @@ def cross_validation(): models() pipelines() cross_validation() + pipelines_2() + pipelines_3() diff --git a/qml/aglaia/aglaia.py b/qml/aglaia/aglaia.py index 8a7808cfc..a7d22c386 100644 --- a/qml/aglaia/aglaia.py +++ b/qml/aglaia/aglaia.py @@ -30,24 +30,20 @@ import tensorflow as tf from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error from sklearn.base import BaseEstimator -from .symm_funct import generate_parkhill_acsf -from ..utils import InputError, ceil, is_positive_or_zero, is_positive_integer, is_positive, \ +from .symm_funct import generate_acsf_tf +from ..utils.utils import InputError, ceil, is_positive_or_zero, is_positive_integer, is_positive, \ is_bool, is_positive_integer_or_zero, is_string, is_positive_integer_array, is_array_like, \ check_global_representation, check_y, check_sizes, check_dy, check_classes, is_numeric_array, is_non_zero_integer, \ is_positive_integer_or_zero_array, check_local_representation -from .tf_utils import TensorBoardLogger +from qml.aglaia.tf_utils import TensorBoardLogger +from qml.representations import generate_acsf +from qml.aglaia.graceful_killer import _GracefulKiller -try: - from qml.data import Compound - from qml import representations as qml_rep -except ImportError: - raise ImportError("The module qml is required") +from qml.data import Compound +from qml import representations as qml_rep -try: - import tensorflow -except ImportError: - raise ImportError("Tensorflow 1.8 is required to run neural networks.") +import tensorflow class _NN(BaseEstimator): @@ -141,6 +137,9 @@ def __init__(self, hidden_layer_sizes, l1_reg, l2_reg, batch_size, learning_rate self.gradients = None self.classes = None + # To enable restart model + self.loaded_model = False + def _set_activation_function(self, activation_function): """ This function sets which activation function will be used in the model. @@ -594,8 +593,8 @@ def _set_acsf_parameters(self, params): :return: None """ - self.acsf_parameters = {'radial_cutoff': 10.0, 'angular_cutoff': 10.0, 'radial_rs': (0.0, 0.1, 0.2), - 'angular_rs': (0.0, 0.1, 0.2), 'theta_s': (3.0, 2.0), 'zeta': 3.0, 'eta': 2.0} + self.acsf_parameters = {'rcut': 5.0, 'acut': 5.0, 'nRs2': 5, 'nRs3': 5, 'nTs': 5, + 'zeta': 220.127, 'eta': 30.8065} if params is not None: for key, value in params.items(): @@ -671,7 +670,7 @@ def generate_compounds(self, filenames): for i, filename in enumerate(filenames): self.compounds[i] = Compound(filename) - def generate_representation(self, xyz=None, classes=None): + def generate_representation(self, xyz=None, classes=None, method='fortran'): """ This function can generate representations either from the data contained in the compounds or from xyz data passed through the argument. If the Compounds have already being set and xyz data is given, it complains. @@ -692,12 +691,12 @@ def generate_representation(self, xyz=None, classes=None): if self.compounds is None: - self.representation, self.classes = self._generate_representations_from_data(xyz, classes) + self.representation, self.classes = self._generate_representations_from_data(xyz, classes, method) elif xyz is None: # Make representations from compounds - self.representation, self.classes = self._generate_representations_from_compounds() + self.representation, self.classes = self._generate_representations_from_compounds(method) else: raise InputError("Compounds have already been set but new xyz data is being passed.") @@ -765,10 +764,10 @@ def set_classes(self, classes): if classes is None: raise InputError("Classes cannot be set to none.") else: - if is_positive_integer_array(classes): + if is_positive_integer_or_zero_array(classes): self.classes = np.asarray(classes) else: - raise InputError('Variable "gradients" expected to be array like of positive integers.') + raise InputError('Variable "classes" expected to be array like of positive integers.') def fit(self, x, y=None, dy=None, classes=None): """ @@ -820,29 +819,25 @@ def _check_acsf_values(self): :return: None """ - if not is_positive(self.acsf_parameters['radial_cutoff']): - raise InputError("Expected positive float for variable 'radial_cutoff'. Got %s." % str(self.acsf_parameters['radial_cutoff'])) + if not is_positive(self.acsf_parameters['rcut']): + raise InputError( + "Expected positive float for variable 'rcut'. Got %s." % str(self.acsf_parameters['rcut'])) - if not is_positive(self.acsf_parameters['angular_cutoff']): - raise InputError("Expected positive float for variable 'angular_cutoff'. Got %s." % str(self.acsf_parameters['angular_cutoff'])) + if not is_positive(self.acsf_parameters['acut']): + raise InputError( + "Expected positive float for variable 'acut'. Got %s." % str(self.acsf_parameters['acut'])) - if not is_numeric_array(self.acsf_parameters['radial_rs']): - raise InputError("Expecting an array like radial_rs. Got %s." % (self.acsf_parameters['radial_rs']) ) - if not len(self.acsf_parameters['radial_rs'])>0: - raise InputError("No radial_rs values were given." ) + if not is_positive_integer(self.acsf_parameters['nRs2']): + raise InputError("Expected positinve integer for 'nRs2. Got %s." % (self.acsf_parameters['nRs2'])) - if not is_numeric_array(self.acsf_parameters['angular_rs']): - raise InputError("Expecting an array like angular_rs. Got %s." % (self.acsf_parameters['angular_rs']) ) - if not len(self.acsf_parameters['angular_rs'])>0: - raise InputError("No angular_rs values were given." ) + if not is_positive_integer(self.acsf_parameters['nRs3']): + raise InputError("Expected positinve integer for 'nRs3. Got %s." % (self.acsf_parameters['nRs3'])) - if not is_numeric_array(self.acsf_parameters['theta_s']): - raise InputError("Expecting an array like theta_s. Got %s." % (self.acsf_parameters['theta_s']) ) - if not len(self.acsf_parameters['theta_s'])>0: - raise InputError("No theta_s values were given. " ) + if not is_positive_integer(self.acsf_parameters['nTs']): + raise InputError("Expected positinve integer for 'nTs. Got %s." % (self.acsf_parameters['nTs'])) - if is_numeric_array(self.acsf_parameters['eta']): - raise InputError("Expecting a scalar value for eta. Got %s." % (self.acsf_parameters['eta'])) + if is_numeric_array(self.acsf_parameters['eta']) or is_numeric_array(self.acsf_parameters['eta']): + raise InputError("Expecting a scalar value for eta parameters.") if is_numeric_array(self.acsf_parameters['zeta']): raise InputError("Expecting a scalar value for zeta. Got %s." % (self.acsf_parameters['zeta'])) @@ -1017,7 +1012,7 @@ def __init__(self, hidden_layer_sizes = (5,), l1_reg = 0.0, l2_reg = 0.0001, bat activation_function = tf.sigmoid, optimiser = tf.train.AdamOptimizer, beta1 = 0.9, beta2 = 0.999, epsilon = 1e-08, rho = 0.95, initial_accumulator_value = 0.1, initial_gradient_squared_accumulator_value = 0.1, l1_regularization_strength = 0.0, l2_regularization_strength = 0.0, - tensorboard_subdir = os.getcwd() + '/tensorboard', representation='unsorted_coulomb_matrix', representation_params=None): + tensorboard_subdir = os.getcwd() + '/tensorboard', representation_name='unsorted_coulomb_matrix', representation_params=None): """ Descriptors is used as input to a single or multi layered feed-forward neural network with a single output. This class inherits from the _NN class and all inputs not unique to the NN class is passed to the _NN @@ -1031,7 +1026,7 @@ def __init__(self, hidden_layer_sizes = (5,), l1_reg = 0.0, l2_reg = 0.0001, bat rho, initial_accumulator_value, initial_gradient_squared_accumulator_value, l1_regularization_strength,l2_regularization_strength, tensorboard_subdir) - self._initialise_representation(representation, representation_params) + self._initialise_representation(representation_name, representation_params) def _initialise_representation(self, representation, parameters): """ @@ -1077,7 +1072,7 @@ def _set_representation(self, representation): self.representation = representation - def _generate_representations_from_data(self, xyz, classes): + def _generate_representations_from_data(self, xyz, classes, method): """ This function makes the representation from xyz data and nuclear charges. @@ -1085,15 +1080,19 @@ def _generate_representations_from_data(self, xyz, classes): :type xyz: numpy array of shape (n_samples, n_atoms, 3) :param classes: classes for atomic decomposition :type classes: None + :param method: What method to use + :type method: string :return: numpy array of shape (n_samples, n_features) and None """ # TODO implement raise InputError("Not implemented yet. Use compounds.") - def _generate_representations_from_compounds(self): + def _generate_representations_from_compounds(self, method): """ This function generates the representations from the compounds. + :param method: flag that tells whether to use fortran or tensrflow implementation when there is a choice (here there is only fortran) + :type method: string :return: the representation and None (in the ARMP class this would be the classes for atomic decomposition) :rtype: numpy array of shape (n_samples, n_features) and None """ @@ -1554,7 +1553,7 @@ def __init__(self, hidden_layer_sizes = (5,), l1_reg = 0.0, l2_reg = 0.0001, bat activation_function = tf.sigmoid, optimiser = tf.train.AdamOptimizer, beta1 = 0.9, beta2 = 0.999, epsilon = 1e-08, rho = 0.95, initial_accumulator_value = 0.1, initial_gradient_squared_accumulator_value = 0.1, l1_regularization_strength = 0.0, l2_regularization_strength = 0.0, - tensorboard_subdir = os.getcwd() + '/tensorboard', representation='acsf', representation_params=None): + tensorboard_subdir = os.getcwd() + '/tensorboard', representation_name='acsf', representation_params=None): """ To see what parameters are required, look at the description of the _NN class init. This class inherits from the _NN class and all inputs not unique to the ARMP class are passed to the _NN @@ -1567,7 +1566,7 @@ def __init__(self, hidden_layer_sizes = (5,), l1_reg = 0.0, l2_reg = 0.0001, bat rho, initial_accumulator_value, initial_gradient_squared_accumulator_value, l1_regularization_strength,l2_regularization_strength, tensorboard_subdir) - self._initialise_representation(representation, representation_params) + self._initialise_representation(representation_name, representation_params) def _initialise_representation(self, representation, parameters): """ @@ -1583,7 +1582,7 @@ def _initialise_representation(self, representation, parameters): if not is_string(representation): raise InputError("Expected string for variable 'representation'. Got %s" % str(representation)) if representation.lower() not in ['slatm', 'acsf']: - raise InputError("Unknown representation %s" % representation) + raise InputError("Representation %s not implemented." % representation) self.representation_name = representation.lower() if parameters is not None: @@ -1612,7 +1611,7 @@ def _set_representation(self, representation): self.representation = representation - def _generate_representations_from_data(self, xyz, classes): + def _generate_representations_from_data(self, xyz, classes, method): """ This function generates the representations from xyz data @@ -1620,11 +1619,16 @@ def _generate_representations_from_data(self, xyz, classes): :type xyz: numpy array of shape (n_samples, n_atoms, 3) :param classes: classes to use for atomic decomposition :type classes: numpy array of shape (n_samples, n_atoms) + :param method: What method to use + :type method: string :return: representations and classes :rtype: numpy arrays of shape (n_samples, n_atoms, n_features) and (n_samples, n_atoms) """ - if classes is None: + if method not in ['fortran', 'tf']: + raise InputError("The method to generate the acsf can only be 'fortran' or 'tf'. Got %s." % (method)) + + if isinstance(classes, type(None)): raise InputError("The classes need to be provided for the ARMP estimator.") else: if len(classes.shape) > 2 or np.all(xyz.shape[:2] != classes.shape): @@ -1638,13 +1642,18 @@ def _generate_representations_from_data(self, xyz, classes): raise InputError("Slatm from data has not been implemented yet. Use Compounds.") elif self.representation_name == 'acsf': + if method == 'tf': + representation = self._generate_acsf_tf(xyz, classes) + else: + representation = self._generate_acsf_fortran(xyz, classes) - representation = self._generate_acsf_from_data(xyz, classes) + # Hotfix t make sure the representation is single precision + single_precision_representation = representation.astype(dtype=np.float32) + del representation - return representation, classes + return single_precision_representation, classes - # TODO modify so that it uses the new map function - def _generate_acsf_from_data(self, xyz, classes): + def _generate_acsf_tf(self, xyz, classes): """ This function generates the acsf from the cartesian coordinates and the classes. @@ -1655,27 +1664,17 @@ def _generate_acsf_from_data(self, xyz, classes): :return: representation acsf :rtype: numpy array of shape (n_samples, n_atoms, n_features) """ - mbtypes = qml_rep.get_slatm_mbtypes([classes[i] for i in range(classes.shape[0])]) - elements = [] - element_pairs = [] - - # Splitting the one and two body interactions in mbtypes - for item in mbtypes: - if len(item) == 1: - elements.append(item[0]) - if len(item) == 2: - element_pairs.append(list(item)) - if len(item) == 3: - break + if 0 in classes: + idx_zeros = np.where(classes == 0)[1] + classes_for_elements = classes[:, :idx_zeros[0]] + else: + classes_for_elements = classes - # Need the element pairs in descending order for TF - for item in element_pairs: - item.reverse() + elements, element_pairs = self._get_elements_and_pairs(classes_for_elements) if self.tensorboard: - run_metadata = tf.RunMetadata() - options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) + self.tensorboard_logger_representation.initialise() n_samples = xyz.shape[0] max_n_atoms = xyz.shape[1] @@ -1690,13 +1689,13 @@ def _generate_acsf_from_data(self, xyz, classes): iterator = tf.data.Iterator.from_structure(dataset.output_types, dataset.output_shapes) batch_xyz, batch_zs = iterator.get_next() - representation = generate_parkhill_acsf(xyzs=batch_xyz, Zs=batch_zs, elements=elements, element_pairs=element_pairs, - radial_cutoff=self.acsf_parameters['radial_cutoff'], - angular_cutoff=self.acsf_parameters['angular_cutoff'], - radial_rs=self.acsf_parameters['radial_rs'], - angular_rs=self.acsf_parameters['angular_rs'], - theta_s=self.acsf_parameters['theta_s'], eta=self.acsf_parameters['eta'], - zeta=self.acsf_parameters['zeta']) + representation = generate_acsf_tf(xyzs=batch_xyz, Zs=batch_zs, elements=elements, element_pairs=element_pairs, + rcut=self.acsf_parameters['rcut'], + acut=self.acsf_parameters['acut'], + nRs2=self.acsf_parameters['nRs2'], + nRs3=self.acsf_parameters['nRs3'], + nTs=self.acsf_parameters['nTs'], eta=self.acsf_parameters['eta'], + zeta=self.acsf_parameters['zeta']) sess = tf.Session() sess.run(tf.global_variables_initializer()) @@ -1705,14 +1704,14 @@ def _generate_acsf_from_data(self, xyz, classes): representation_slices = [] if self.tensorboard: - summary_writer = tf.summary.FileWriter(logdir="tensorboard", graph=sess.graph) + self.tensorboard_logger_representation.set_summary_writer(sess) batch_counter = 0 while True: try: - representation_np = sess.run(representation, options=options, run_metadata=run_metadata) - summary_writer.add_run_metadata(run_metadata=run_metadata, tag="batch %s" % batch_counter, - global_step=None) + representation_np = sess.run(representation, options=self.tensorboard_logger_representation.options, + run_metadata=self.tensorboard_logger_representation.run_metadata) + self.tensorboard_logger_representation.write_metadata(batch_counter) representation_slices.append(representation_np) batch_counter += 1 except tf.errors.OutOfRangeError: @@ -1726,17 +1725,70 @@ def _generate_acsf_from_data(self, xyz, classes): representation_slices.append(representation_np) batch_counter += 1 except tf.errors.OutOfRangeError: - print("Generated all the representations.") break representation_conc = np.concatenate(representation_slices, axis=0) - print("The representation has shape %s." % (str(representation_conc.shape))) sess.close() return representation_conc - def _generate_representations_from_compounds(self): + def _generate_acsf_fortran(self, xyz, classes): + """ + This function uses fortran to generate the representation and the derivative of the representation with respect + to the cartesian coordinates. + :param xyz: cartesian coordinates + :type xyz: numpy array of shape (n_samples, n_atoms, 3) + :param classes: the atom types + :type classes: numpy array of shape (n_samples, n_atoms) + :return: representations and their derivatives wrt to xyz + :rtype: numpy array of shape (n_samples, n_atoms, n_features) and (n_samples, n_atoms, n_features, n_atoms, 3) + """ + + initial_natoms = xyz.shape[1] + + elements, _ = self._get_elements_and_pairs(classes) + + representation = [] + + for i in range(xyz.shape[0]): + if 0 in classes[i]: + idx_zeros = np.where(classes[i] == 0)[0] + mol_xyz = xyz[i, :idx_zeros[0], :] + mol_classes = classes[i, :idx_zeros[0]] + + g = generate_acsf(coordinates=mol_xyz, elements=elements, gradients=False, nuclear_charges=mol_classes, + rcut=self.acsf_parameters['rcut'], + acut=self.acsf_parameters['acut'], + nRs2=self.acsf_parameters['nRs2'], + nRs3=self.acsf_parameters['nRs3'], + nTs=self.acsf_parameters['nTs'], + eta2=self.acsf_parameters['eta'], + eta3=self.acsf_parameters['eta'], + zeta=self.acsf_parameters['zeta']) + + padded_g = np.zeros((initial_natoms, g.shape[-1])) + padded_g[:g.shape[0], :] = g + + representation.append(padded_g) + + else: + + g = generate_acsf(coordinates=xyz[i], elements=elements, gradients=False, nuclear_charges=classes[i], + rcut=self.acsf_parameters['rcut'], + acut=self.acsf_parameters['acut'], + nRs2=self.acsf_parameters['nRs2'], + nRs3=self.acsf_parameters['nRs3'], + nTs=self.acsf_parameters['nTs'], + eta2=self.acsf_parameters['eta'], + eta3=self.acsf_parameters['eta'], + zeta=self.acsf_parameters['zeta']) + + representation.append(g) + + return np.asarray(representation) + + def _generate_representations_from_compounds(self, method): """ This function generates the representations from the compounds. :return: the representations and the classes @@ -1752,41 +1804,30 @@ def _generate_representations_from_compounds(self): elif self.representation_name == 'acsf': - representations, classes = self._generate_acsf_from_compounds() + xyz, classes = self._extract_and_pad() + if method == "tf": + representations = self._generate_acsf_tf(xyz, classes) + elif method == "fortran": + representations = self._generate_acsf_fortran(xyz, classes) + else: + raise InputError("Method not recognised.") else: raise InputError("This should never happen, unrecognised representation %s." % (self.representation_name)) - return representations, classes + # Hotfix t make sure the representation is single precision + single_precision_representation = representations.astype(dtype=np.float32) + del representations - def _generate_acsf_from_compounds(self): - """ - This function generates the atom centred symmetry functions. + return single_precision_representation, classes - :return: representation acsf and classes - :rtype: numpy array of shape (n_samples, n_atoms, n_features) and (n_samples, n_atoms) + def _extract_and_pad(self): """ - - # Obtaining the total elements and the element pairs - mbtypes = qml_rep.get_slatm_mbtypes([mol.nuclear_charges for mol in self.compounds]) - - elements = [] - element_pairs = [] - - # Splitting the one and two body interactions in mbtypes - for item in mbtypes: - if len(item) == 1: - elements.append(item[0]) - if len(item) == 2: - element_pairs.append(list(item)) - if len(item) == 3: - break - - # Need the element pairs in descending order for TF - for item in element_pairs: - item.reverse() - - # Obtaining the xyz and the nuclear charges + This function extracts cartesian coordinates and nuclear charges for the compounds and pads them. + :return: xyz and zs padded + :rtype: numpy arrays of shape (n_samples, max_n_atoms, 3) and (n_samples, max_n_atoms) + """ + # Obtaining the xyz and the nuclear charges from the compounds xyzs = [] zs = [] max_n_atoms = 0 @@ -1797,77 +1838,18 @@ def _generate_acsf_from_compounds(self): if len(compound.nuclear_charges) > max_n_atoms: max_n_atoms = len(compound.nuclear_charges) - # Padding so that all the samples have the same shape - n_samples = len(zs) - for i in range(n_samples): - current_n_atoms = len(zs[i]) - missing_n_atoms = max_n_atoms - current_n_atoms - zs_padding = np.zeros(missing_n_atoms) - zs[i] = np.concatenate((zs[i], zs_padding)) - xyz_padding = np.zeros((missing_n_atoms, 3)) - xyzs[i] = np.concatenate((xyzs[i], xyz_padding)) - - zs = np.asarray(zs, dtype=np.int32) - xyzs = np.asarray(xyzs, dtype=np.float32) - - if self.tensorboard: - self.tensorboard_logger_representation.initialise() - # run_metadata = tf.RunMetadata() - # options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) - - # Turning the quantities into tensors - with tf.name_scope("Inputs"): - zs_tf = tf.placeholder(shape=[n_samples, max_n_atoms], dtype=tf.int32, name="zs") - xyz_tf = tf.placeholder(shape=[n_samples, max_n_atoms, 3], dtype=tf.float32, name="xyz") - - dataset = tf.data.Dataset.from_tensor_slices((xyz_tf, zs_tf)) - dataset = dataset.batch(20) - iterator = tf.data.Iterator.from_structure(dataset.output_types, dataset.output_shapes) - batch_xyz, batch_zs = iterator.get_next() - - representations = generate_parkhill_acsf(xyzs=batch_xyz, Zs=batch_zs, elements=elements, element_pairs=element_pairs, - radial_cutoff=self.acsf_parameters['radial_cutoff'], - angular_cutoff=self.acsf_parameters['angular_cutoff'], - radial_rs=self.acsf_parameters['radial_rs'], - angular_rs=self.acsf_parameters['angular_rs'], - theta_s=self.acsf_parameters['theta_s'], eta=self.acsf_parameters['eta'], - zeta=self.acsf_parameters['zeta']) - - sess = tf.Session() - sess.run(tf.global_variables_initializer()) - sess.run(iterator.make_initializer(dataset), feed_dict={xyz_tf: xyzs, zs_tf: zs}) - - representations_slices = [] - - if self.tensorboard: - self.tensorboard_logger_representation.set_summary_writer(sess) - - batch_counter = 0 - while True: - try: - representations_np = sess.run(representations, options=self.tensorboard_logger_representation.options, - run_metadata=self.tensorboard_logger_representation.run_metadata) - self.tensorboard_logger_representation.write_metadata(batch_counter) - - representations_slices.append(representations_np) - batch_counter += 1 - except tf.errors.OutOfRangeError: - break - else: - batch_counter = 0 - while True: - try: - representations_np = sess.run(representations) - representations_slices.append(representations_np) - batch_counter += 1 - except tf.errors.OutOfRangeError: - break + n_samples = len(xyzs) - representation_conc = np.concatenate(representations_slices, axis=0) + # Padding the xyz and the nuclear charges + padded_xyz = np.zeros((n_samples, max_n_atoms, 3)) + padded_zs = np.zeros((n_samples, max_n_atoms)) - sess.close() + for i in range(n_samples): + current_n_atoms = xyzs[i].shape[0] + padded_xyz[i, :current_n_atoms, :] = xyzs[i] + padded_zs[i, :current_n_atoms] = zs[i] - return representation_conc, zs + return padded_xyz, padded_zs def _generate_slatm_from_compounds(self): """ @@ -1955,26 +1937,26 @@ def _model(self, x, zs, element_weights, element_biases): :rtype: tf tensor of shape (n_samples, 1) """ - atomic_energies = tf.zeros_like(zs) - - for i in range(self.elements.shape[0]): + all_atomic_energies = tf.zeros_like(zs, dtype=tf.float32) - # Calculating the output for every atom in all data as if they were all of the same element - atomic_energies_all = self._atomic_model(x, self.hidden_layer_sizes, element_weights[self.elements[i]], - element_biases[self.elements[i]]) # (n_samples, n_atoms) + for el in self.elements: + # Obtaining the indices of where in Zs there is the current element + current_element = tf.expand_dims(tf.constant(el, dtype=tf.int32), axis=0) + where_element = tf.cast(tf.where(tf.equal(zs, current_element)), dtype=tf.int32) - # Figuring out which atomic energies correspond to the current element. - current_element = tf.expand_dims(tf.constant(self.elements[i], dtype=tf.int32), axis=0) - where_element = tf.equal(tf.cast(zs, dtype=tf.int32), current_element) # (n_samples, n_atoms) + # Extract the descriptor corresponding to the right element + current_element_in_x = tf.gather_nd(x, where_element) - # Extracting the energies corresponding to the right element - element_energies = tf.where(where_element, atomic_energies_all, tf.zeros_like(zs)) + # Calculate the atomic energy of all the atoms of type equal to the current element + atomic_ene = self._atomic_model(current_element_in_x, self.hidden_layer_sizes, element_weights[el], + element_biases[el]) - # Adding the energies of the current element to the final atomic energies tensor - atomic_energies = tf.add(atomic_energies, element_energies) + # Put the atomic energies in a zero array with shape equal to zs and then add it to all the atomic energies + updates = tf.scatter_nd(where_element, atomic_ene, tf.shape(zs)) + all_atomic_energies = tf.add(all_atomic_energies, updates) # Summing the energies of all the atoms - total_energies = tf.reduce_sum(atomic_energies, axis=-1, name="output", keepdims=True) + total_energies = tf.reduce_sum(all_atomic_energies, axis=-1, name="output", keepdims=True) return total_energies @@ -2094,7 +2076,8 @@ def _check_predict_input(self, x, classes): raise InputError("No representations or QML compounds have been set yet.") else: self.representation, self.classes = self._generate_representations_from_compounds() - if self.properties is None: + + if isinstance(self.properties, type(None)): raise InputError("The properties need to be set in advance.") approved_x = self.representation[x] @@ -2104,7 +2087,7 @@ def _check_predict_input(self, x, classes): else: - if classes is None: + if isinstance(classes, type(None)): raise InputError("ARMP estimator needs the classes to do atomic decomposition.") approved_x = check_local_representation(x) @@ -2137,8 +2120,8 @@ def _check_representation_parameters(self, parameters): elif self.representation_name == "acsf": - acsf_parameters = {'radial_cutoff': 10.0, 'angular_cutoff': 10.0, 'radial_rs': (0.0, 0.1, 0.2), - 'angular_rs': (0.0, 0.1, 0.2), 'theta_s': (3.0, 2.0), 'zeta': 3.0, 'eta': 2.0} + acsf_parameters = {'rcut': 5.0, 'acut': 5.0, 'nRs2': 5, 'nRs3': 5, 'nTs': 5, + 'zeta': 220.127, 'eta': 30.8065} for key, value in parameters.items(): try: @@ -2146,6 +2129,25 @@ def _check_representation_parameters(self, parameters): except Exception: raise InputError("Unrecognised parameter for acsf representation: %s" % (key)) + def _get_elements_and_pairs(self, classes): + """ + This function generates the atom centred symmetry functions. + :param classes: The different types of atoms present in the system + :type classes: numpy array of shape (n_samples, n_atoms) + :return: elements and element pairs in the system + :rtype: numpy array of shape (n_elements,) and (n_element_pairs) + """ + + elements = np.unique(classes) + elements_no_zero = np.ma.masked_equal(elements,0).compressed() + + element_pairs = [] + for i, ei in enumerate(elements_no_zero): + for ej in elements_no_zero[i:]: + element_pairs.append([ej, ei]) + + return np.asarray(elements_no_zero), np.asarray(element_pairs) + def _find_elements(self, zs): """ This function finds the unique atomic numbers in Zs and returns them in a list. @@ -2163,6 +2165,27 @@ def _find_elements(self, zs): return np.trim_zeros(elements) def _fit(self, x, y, dy, classes): + """ + This function calls either fit_from_scratch or fit_from_loaded depending on if a model has been loaded or not. + + :param x: either the representations or the indices to the data points to use + :type x: either a numpy array of shape (n_samples, n_atoms, n_features) or a numpy array of ints + :param y: either the properties or None + :type y: either a numpy array of shape (n_samples,) or None + :param dy: None + :type dy: None + :param classes: classes to use for the atomic decomposition or None + :type classes: either a numpy array of shape (n_samples, n_atoms) or None + + :return: None + """ + + if not self.loaded_model: + self._fit_from_scratch(x, y, dy, classes) + else: + self._fit_from_loaded(x, y, dy, classes) + + def _fit_from_scratch(self, x, y, dy, classes): """ This function fits an atomic decomposed network to the data. @@ -2180,8 +2203,10 @@ def _fit(self, x, y, dy, classes): x_approved, y_approved, dy_approved, classes_approved = self._check_inputs(x, y, dy, classes) - # Obtaining the array of unique elements in all samples - self.elements = self._find_elements(classes_approved) + # Putting a mask on all the 0 values + classes_for_elements = np.ma.masked_equal(classes_approved, 0).compressed() + + self.elements, self.element_pairs = self._get_elements_and_pairs(classes_for_elements) if self.tensorboard: self.tensorboard_logger_training.initialise() @@ -2201,20 +2226,18 @@ def _fit(self, x, y, dy, classes): # Initial set up of the NN with tf.name_scope("Data"): - x_ph = tf.placeholder(dtype=self.tf_dtype, shape=[None, self.n_atoms, self.n_features]) - zs_ph = tf.placeholder(dtype=self.tf_dtype, shape=[None, self.n_atoms]) - y_ph = tf.placeholder(dtype=self.tf_dtype, shape=[None, 1]) + x_ph = tf.placeholder(dtype=self.tf_dtype, shape=[None, self.n_atoms, self.n_features], name="Descriptors") + zs_ph = tf.placeholder(dtype=tf.int32, shape=[None, self.n_atoms], name="Atomic-numbers") + y_ph = tf.placeholder(dtype=self.tf_dtype, shape=[None, 1], name="Properties") + buffer_tf = tf.placeholder(dtype=tf.int64, name="buffer") dataset = tf.data.Dataset.from_tensor_slices((x_ph, zs_ph, y_ph)) - dataset = dataset.shuffle(buffer_size=self.n_samples) + dataset = dataset.shuffle(buffer_size=buffer_tf) dataset = dataset.batch(batch_size) # batched_dataset = dataset.prefetch(buffer_size=batch_size) iterator = tf.data.Iterator.from_structure(dataset.output_types, dataset.output_shapes) tf_x, tf_zs, tf_y = iterator.get_next() - tf_x = tf.identity(tf_x, name="Descriptors") - tf_zs = tf.identity(tf_zs, name="Atomic-numbers") - tf_y = tf.identity(tf_y, name="Properties") # Creating dictionaries of the weights and biases for each element element_weights = {} @@ -2240,11 +2263,13 @@ def _fit(self, x, y, dy, classes): cost_summary = self.tensorboard_logger_training.write_cost_summary(cost) optimiser = self._set_optimiser() - optimisation_op = optimiser.minimize(cost) + optimisation_op = optimiser.minimize(cost, name="optimisation_op") # Initialisation of the variables init = tf.global_variables_initializer() - iterator_init = iterator.make_initializer(dataset) + iterator_init = iterator.make_initializer(dataset, name="dataset_init") + + self._build_model_from_xyz(self.n_atoms, element_weights, element_biases) self.session = tf.Session() @@ -2253,11 +2278,18 @@ def _fit(self, x, y, dy, classes): self.tensorboard_logger_training.set_summary_writer(self.session) self.session.run(init) - self.session.run(iterator_init, feed_dict={x_ph:x_approved, zs_ph:classes_approved, y_ph:y_approved}) + + # Initialising the object that enables graceful killing of the training + killer = _GracefulKiller() for i in range(self.iterations): - self.session.run(iterator_init, feed_dict={x_ph: x_approved, zs_ph: classes_approved, y_ph: y_approved}) + if i % 2 == 0: + buff = int(3.5 * batch_size) + else: + buff = int(4.5 * batch_size) + + self.session.run(iterator_init, feed_dict={x_ph: x_approved, zs_ph: classes_approved, y_ph: y_approved, buffer_tf:buff}) avg_cost = 0 for j in range(n_batches): @@ -2269,15 +2301,133 @@ def _fit(self, x, y, dy, classes): avg_cost += c - # This seems to run the iterator.get_next() op, which gives problems with end of sequence - # Hence why I re-initialise the iterator - self.session.run(iterator_init, feed_dict={x_ph: x_approved, zs_ph: classes_approved, y_ph: y_approved}) + if killer.kill_now: + self.save_nn("emergency_save") + exit() + + # This seems to run the iterator.get_next() op, which gives problems with end of sequence, hence why I re-initialise the iterator if self.tensorboard: if i % self.tensorboard_logger_training.store_frequency == 0: + self.session.run(iterator_init, + feed_dict={x_ph: x_approved, zs_ph: classes_approved, y_ph: y_approved, + buffer_tf: buff}) self.tensorboard_logger_training.write_summary(self.session, i) self.training_cost.append(avg_cost/n_batches) + def _fit_from_loaded(self, x, y, dy, classes): + """ + This function carries on fitting an atomic decomposed network to the data after it has been loaded. + + :param x: either the representations or the indices to the data points to use + :type x: either a numpy array of shape (n_samples, n_atoms, n_features) or a numpy array of ints + :param y: either the properties or None + :type y: either a numpy array of shape (n_samples,) or None + :param dy: None + :type dy: None + :param classes: classes to use for the atomic decomposition or None + :type classes: either a numpy array of shape (n_samples, n_atoms) or None + + :return: None + """ + + x_approved, y_approved, dy_approved, classes_approved = self._check_inputs(x, y, dy, classes) + + + if 0 in classes_approved: + idx_zeros = np.where(classes_approved == 0)[1] + classes_for_elements = classes_approved[:, :idx_zeros[0]] + else: + classes_for_elements = classes_approved + self.elements = self._find_elements(classes_for_elements) + + if self.tensorboard: + self.tensorboard_logger_training.initialise() + self.tensorboard_logger_training.set_summary_writer(self.session) + + self.n_samples = x_approved.shape[0] + self.n_atoms = x_approved.shape[1] + self.n_features = x_approved.shape[2] + + batch_size = self._get_batch_size() + n_batches = ceil(self.n_samples, batch_size) + + graph = tf.get_default_graph() + + with graph.as_default(): + # Reloading all the needed operations and tensors + tf_x = graph.get_tensor_by_name("Data/Descriptors:0") + tf_zs = graph.get_tensor_by_name("Data/Atomic-numbers:0") + tf_ene = graph.get_tensor_by_name("Data/Properties:0") + tf_buffer = graph.get_tensor_by_name("Data/buffer:0") + + optimisation_op = graph.get_operation_by_name("optimisation_op") + dataset_init_op = graph.get_operation_by_name("dataset_init") + + # Initialising the object that enables graceful killing of the training + killer = _GracefulKiller() + + for i in range(self.iterations): + + if i % 2 == 0: + buff = int(3.5 * batch_size) + else: + buff = int(4.5 * batch_size) + + self.session.run(dataset_init_op, feed_dict={tf_x: x_approved, tf_zs: classes_approved, tf_ene: y_approved, tf_buffer: buff}) + + for j in range(n_batches): + if self.tensorboard: + self.session.run(optimisation_op, options=self.tensorboard_logger_training.options, + run_metadata=self.tensorboard_logger_training.run_metadata) + else: + self.session.run(optimisation_op) + + if killer.kill_now: + self.save_nn("emergency_save") + exit() + + + if self.tensorboard: + if i % self.tensorboard_logger_training.store_frequency == 0: + self.session.run(dataset_init_op, + feed_dict={tf_x: x_approved, tf_zs: classes_approved, tf_ene: y_approved, tf_buffer: buff}) + self.tensorboard_logger_training.write_summary(self.session, i) + + def _build_model_from_xyz(self, n_atoms, element_weights, element_biases): + """ + This function builds a model that makes it possible to predict energies straight from xyz data. It constructs the + graph needed to do this. + + :param n_atoms: number of atoms + :param element_weights: the dictionary of the trained weights in the model + :param element_biases: the dictionary of trained biases in the model + """ + + with tf.name_scope("Inputs_pred"): + zs_tf = tf.placeholder(shape=[None, n_atoms], dtype=tf.int32, name="Classes") + xyz_tf = tf.placeholder(shape=[None, n_atoms, 3], dtype=tf.float32, name="xyz") + + dataset = tf.data.Dataset.from_tensor_slices((xyz_tf, zs_tf)) + dataset = dataset.batch(2) + iterator = tf.data.Iterator.from_structure(dataset.output_types, dataset.output_shapes) + batch_xyz, batch_zs = iterator.get_next() + iterator_init = iterator.make_initializer(dataset, name="dataset_init_pred") + + with tf.name_scope("Descriptor_pred"): + batch_representation = generate_acsf_tf(xyzs=batch_xyz, Zs=batch_zs, elements=self.elements, + element_pairs=self.element_pairs, + rcut=self.acsf_parameters['rcut'], + acut=self.acsf_parameters['acut'], + nRs2=self.acsf_parameters['nRs2'], + nRs3=self.acsf_parameters['nRs3'], + nTs=self.acsf_parameters['nTs'], + eta=self.acsf_parameters['eta'], + zeta=self.acsf_parameters['zeta']) + + with tf.name_scope("Model_pred"): + batch_energies_nn = self._model(batch_representation, batch_zs, element_weights, element_biases) + def _predict(self, x, classes): """ This function checks whether x contains indices or data. If it contains indices, the data is extracted by the @@ -2294,6 +2444,7 @@ def _predict(self, x, classes): """ approved_x, approved_classes = self._check_predict_input(x, classes) + empty_ene = np.empty((approved_x.shape[0], 1)) if self.session == None: raise InputError("Model needs to be fit before predictions can be made.") @@ -2303,10 +2454,58 @@ def _predict(self, x, classes): with graph.as_default(): tf_x = graph.get_tensor_by_name("Data/Descriptors:0") tf_zs = graph.get_tensor_by_name("Data/Atomic-numbers:0") + tf_true_ene = graph.get_tensor_by_name("Data/Properties:0") + tf_buffer = graph.get_tensor_by_name("Data/buffer:0") model = graph.get_tensor_by_name("Model/output:0") - y_pred = self.session.run(model, feed_dict={tf_x: approved_x, tf_zs:approved_classes}) + dataset_init_op = graph.get_operation_by_name("dataset_init") + self.session.run(dataset_init_op, feed_dict={tf_x: approved_x, tf_zs: approved_classes, tf_true_ene: empty_ene, tf_buffer:1}) - return y_pred + tot_y_pred = [] + + while True: + try: + y_pred = self.session.run(model) + tot_y_pred.append(y_pred) + except tf.errors.OutOfRangeError: + break + + return np.concatenate(tot_y_pred, axis=0) + + def predict_from_xyz(self, xyz, classes): + """ + This function takes in the cartesian coordinates and the atom types and returns energies. + + :param xyz: cartesian coordinates + :type xyz: numpy array of shape (n_samples, n_atoms, 3) + :param classes: atom types + :type classes: numpy array of shape (n_samples, n_atoms) + :return: energies + :rtype: numpy array of shape (n_samples,) + """ + + if self.session == None: + raise InputError("Model needs to be fit before predictions can be made.") + + graph = tf.get_default_graph() + + with graph.as_default(): + xyz_tf = graph.get_tensor_by_name("Inputs_pred/xyz:0") + classes_tf = graph.get_tensor_by_name("Inputs_pred/Classes:0") + ene_nn = graph.get_tensor_by_name("Model_pred/output:0") + dataset_init_op = graph.get_operation_by_name("Inputs_pred/dataset_init_pred") + self.session.run(dataset_init_op, + feed_dict={xyz_tf: xyz, classes_tf: classes}) + + tot_y_pred = [] + + while True: + try: + y_pred = self.session.run(ene_nn) + tot_y_pred.append(y_pred) + except tf.errors.OutOfRangeError: + break + + return np.concatenate(tot_y_pred, axis=0).ravel() def _score_r2(self, x, y=None, dy=None, classes=None): """ @@ -2395,6 +2594,15 @@ def save_nn(self, save_dir="saved_model"): :return: None """ + counter = 0 + dir = save_dir + while True: + if os.path.isdir(save_dir): + counter += 1 + save_dir = dir + "_" + str(counter) + else: + break + if self.session == None: raise InputError("Model needs to be fit before predictions can be made.") @@ -2403,10 +2611,12 @@ def save_nn(self, save_dir="saved_model"): with graph.as_default(): tf_x = graph.get_tensor_by_name("Data/Descriptors:0") tf_zs = graph.get_tensor_by_name("Data/Atomic-numbers:0") + true_ene = graph.get_tensor_by_name("Data/Properties:0") model = graph.get_tensor_by_name("Model/output:0") tf.saved_model.simple_save(self.session, export_dir=save_dir, - inputs={"Data/Descriptors:0": tf_x, "Data/Atomic-numbers:0": tf_zs}, + inputs={"Data/Descriptors:0": tf_x, "Data/Atomic-numbers:0": tf_zs, + "Data/Properties:0": true_ene}, outputs={"Model/output:0": model}) def load_nn(self, save_dir="saved_model"): @@ -2420,3 +2630,5 @@ def load_nn(self, save_dir="saved_model"): self.session = tf.Session(graph=tf.get_default_graph()) tf.saved_model.loader.load(self.session, [tf.saved_model.tag_constants.SERVING], save_dir) + + self.loaded_model = True diff --git a/qml/aglaia/graceful_killer.py b/qml/aglaia/graceful_killer.py new file mode 100644 index 000000000..d140008e2 --- /dev/null +++ b/qml/aglaia/graceful_killer.py @@ -0,0 +1,17 @@ +""" +Code adapted from Stack Overflow answer https://stackoverflow.com/a/31464349/7146757 + +This class is used to process SIGTERM signal gracefully from within the neural network training. +""" + +import signal + +class _GracefulKiller: + kill_now = False + + def __init__(self): + signal.signal(signal.SIGINT, self.exit_gracefully) + signal.signal(signal.SIGTERM, self.exit_gracefully) + + def exit_gracefully(self,signum, frame): + self.kill_now = True \ No newline at end of file diff --git a/qml/aglaia/np_symm_funct.py b/qml/aglaia/np_symm_funct.py index 2a095302e..5de18bd98 100644 --- a/qml/aglaia/np_symm_funct.py +++ b/qml/aglaia/np_symm_funct.py @@ -171,10 +171,10 @@ def acsf_ang(xyzs, Zs, element_pairs, angular_cutoff, angular_rs, theta_s, zeta, cos_theta_ijk = get_costheta(xyzs[sample, i, :], xyzs[sample, j, :], xyzs[sample, k, :]) theta_ijk = np.arccos(cos_theta_ijk) - term_1 = np.power((1.0 + np.cos(theta_ijk - theta_value)), zeta) + term_1 = np.power((1.0 + np.cos(theta_ijk - theta_value))/2.0, zeta) term_2 = np.exp(- eta * np.power(0.5*(r_ij + r_ik) - rs_value, 2)) term_3 = fc(r_ij, angular_cutoff) * fc(r_ik, angular_cutoff) - g_term = term_1 * term_2 * term_3 * np.power(2.0, 1.0 - zeta) + g_term = term_1 * term_2 * term_3 * 2.0 # Compare the pair of neighbours to all the possible element pairs, then summ accordingly current_pair = np.flip(np.sort([Zs[sample][j], Zs[sample][k]]), axis=0) # Sorting the pair in descending order for m, pair in enumerate(element_pairs): @@ -187,8 +187,8 @@ def acsf_ang(xyzs, Zs, element_pairs, angular_cutoff, angular_rs, theta_s, zeta, return np.asarray(total_descriptor) -def generate_acsf(xyzs, Zs, elements, element_pairs, radial_cutoff, angular_cutoff, radial_rs, - angular_rs, theta_s, zeta, eta): +def generate_acsf_np(xyzs, Zs, elements, element_pairs, rcut, acut, nRs2, + nRs3, nTs, zeta, eta): """ This function calculates the symmetry functions used in the tensormol paper. @@ -206,44 +206,13 @@ def generate_acsf(xyzs, Zs, elements, element_pairs, radial_cutoff, angular_cuto :return: numpy array of shape (n_samples, n_atoms, n_rad_rs*n_elements + n_ang_rs*n_thetas*n_element_pairs) """ - rad_term = acsf_rad(xyzs, Zs, elements, radial_cutoff, radial_rs, eta) - ang_term = acsf_ang(xyzs, Zs, element_pairs, angular_cutoff, angular_rs, theta_s, zeta, eta) + radial_rs = np.linspace(0, rcut, nRs2) + angular_rs = np.linspace(0, acut, nRs3) + theta_s = np.linspace(0, np.pi, nTs) + + rad_term = acsf_rad(xyzs, Zs, elements, rcut, radial_rs, eta) + ang_term = acsf_ang(xyzs, Zs, element_pairs, acut, angular_rs, theta_s, zeta, eta) acsf = np.concatenate((rad_term, ang_term), axis=-1) return acsf - -if __name__ == "__main__": - xyzs = np.array([[[0.0, 0.0, 0.0], - [1.0, 0.0, 0.0], - [0.0, 1.0, 0.0], - [0.0, 0.0, 1.0]]]) - - zs = [[7, 2, 1, 1]] - - elements = [1, 2, 7] - element_pairs = [[1, 1], [2, 1], [7, 1], [7, 2]] - - # # input_data = "/Volumes/Transcend/repositories/Aglaia/aglaia/tests/data_test_acsf.npz" - # input_data = "/Volumes/Transcend/repositories/Aglaia/aglaia/tests/qm7_testdata.npz" - # data = np.load(input_data) - # - # xyzs = data["arr_0"] - # zs = data["arr_1"] - # elements = data["arr_2"] - # element_pairs = data["arr_3"] - - radial_cutoff = 1000.0 - angular_cutoff = 1000.0 - radial_rs = [0.0, 1.0] - angular_rs = [0.0, 1.0] - theta_s = [np.pi, np.pi * 0.5] - # radial_rs = [0.0] - # angular_rs = [0.0] - # theta_s = [3.0] - zeta = 0.0 - eta = 1.0 - - rad_term = acsf_rad(xyzs, zs, elements, radial_cutoff, radial_rs, eta) - ang_term = acsf_ang(xyzs, zs, element_pairs, angular_cutoff, angular_rs, theta_s, zeta, eta) - print(rad_term) diff --git a/qml/aglaia/symm_funct.py b/qml/aglaia/symm_funct.py index 805b81c03..84c1eaaad 100644 --- a/qml/aglaia/symm_funct.py +++ b/qml/aglaia/symm_funct.py @@ -37,19 +37,25 @@ def acsf_rad(xyzs, Zs, radial_cutoff, radial_rs, eta): This does the radial part of the symmetry function (G2 function in Behler's papers). It works only for datasets where all samples are the same molecule but in different configurations. - :param xyzs: tf tensor of shape (n_samples, n_atoms, 3) contaning the coordinates of each atom in each data sample - :param Zs: tf tensor of shape (n_samples, n_atoms) containing the atomic number of each atom in each data sample - :param radial_cutoff: scalar tensor - :param radial_rs: tf tensor of shape (n_rs,) with the R_s values - :param eta: tf scalar - - :return: tf tensor of shape (n_samples, n_atoms, n_atoms, n_rs) + :param xyzs: Coordinates of each atom in each data sample + :type xyzs: tf tensor of shape (n_samples, n_atoms, 3) + :param Zs: Atomic number of each atom in each data sample + :type Zs: tf tensor of shape (n_samples, n_atoms) + :param radial_cutoff: cut off used in the radial term + :type radial_cutoff: scalar tensor + :param radial_rs: R_s values + :type radial_rs: tf tensor of shape (n_rs,) + :param eta: parameter in the radial term + :type eta: tf scalar + + :return: pre-sum radial term + :rtype: tf tensor of shape (n_samples, n_atoms, n_atoms, n_rs) """ # Calculating the distance matrix between the atoms of each sample with tf.name_scope("Distances"): dxyzs = tf.expand_dims(xyzs, axis=2) - tf.expand_dims(xyzs, axis=1) - dist_tensor = tf.cast(tf.norm(dxyzs, axis=3), dtype=tf.float32) # (n_samples, n_atoms, n_atoms) + dist_tensor = tf.cast(tf.norm(dxyzs+1.e-16, axis=3), dtype=tf.float32) # (n_samples, n_atoms, n_atoms) # Indices of terms that need to be zero (diagonal elements) mask_0 = tf.zeros(tf.shape(dist_tensor)) @@ -92,20 +98,29 @@ def acsf_ang(xyzs, Zs, angular_cutoff, angular_rs, theta_s, zeta, eta): This does the angular part of the symmetry function as mentioned here: https://arxiv.org/pdf/1711.06385.pdf It only works for systems where all the samples are the same molecule but in different configurations. - :param xyzs: tf tensor of shape (n_samples, n_atoms, 3) contaning the coordinates of each atom in each data sample - :param Zs: tf tensor of shape (n_samples, n_atoms) containing the atomic number of each atom in each data sample - :param angular_cutoff: scalar tensor - :param angular_rs: tf tensor of shape (n_ang_rs,) with the equivalent of the R_s values from the G2 - :param theta_s: tf tensor of shape (n_thetas,) - :param zeta: tf tensor of shape (1,) - :param eta: tf tensor of shape (1,) - :return: tf tensor of shape (n_samples, n_atoms, n_atoms, n_atoms, n_ang_rs * n_thetas) + :param xyzs: Coordinates of each atom in each data sample + :type xyzs: tf tensor of shape (n_samples, n_atoms, 3) + :param Zs: Atomic number of each atom in each data sample + :type Zs: tf tensor of shape (n_samples, n_atoms) + :param angular_cutoff: cut off used in the angular term + :type angular_cutoff: scalar tensor + :param angular_rs: angular R_s values + :type angular_rs: tf tensor of shape (n_ang_rs,) + :param theta_s: angular theta_s values + :type theta_s: tf tensor of shape (n_thetas,) + :param zeta: parameter in the angular term + :type zeta: tf tensor of shape (1,) + :param eta: parameter in the angular term + :type eta: tf tensor of shape (1,) + + :return: pre-sum angular term + :rtype: tf tensor of shape (n_samples, n_atoms, n_atoms, n_atoms, n_ang_rs * n_thetas) """ # Finding the R_ij + R_ik term with tf.name_scope("Sum_distances"): dxyzs = tf.expand_dims(xyzs, axis=2) - tf.expand_dims(xyzs, axis=1) - dist_tensor = tf.cast(tf.norm(dxyzs, axis=3), dtype=tf.float32) # (n_samples, n_atoms, n_atoms) + dist_tensor = tf.cast(tf.norm(dxyzs+1.e-16, axis=3), dtype=tf.float32) # (n_samples, n_atoms, n_atoms) # This is the tensor where element sum_dist_tensor[0,1,2,3] is the R_12 + R_13 in the 0th data sample sum_dist_tensor = tf.expand_dims(dist_tensor, axis=3) + tf.expand_dims(dist_tensor, @@ -163,7 +178,7 @@ def acsf_ang(xyzs, Zs, angular_cutoff, angular_rs, theta_s, zeta, eta): # Dividing the dot products by the magnitudes to obtain cos theta cos_theta = tf.divide(dots_dxyzs, dist_prod) # Taking care of the values that due numerical error are just above 1.0 or below -1.0 - cut_cos_theta = tf.clip_by_value(cos_theta, tf.constant(-1.0), tf.constant(1.0)) + cut_cos_theta = tf.clip_by_value(cos_theta, tf.constant(-1.0 + 1.0e-7), tf.constant(1.0 - 1.0e-7)) # Applying arc cos to find the theta value theta = tf.acos(cut_cos_theta) # (n_samples, n_atoms, n_atoms, n_atoms) # Removing the NaNs created by dividing by zero @@ -199,16 +214,18 @@ def acsf_ang(xyzs, Zs, angular_cutoff, angular_rs, theta_s, zeta, eta): # Subtracting them and do the cos cos_theta_term = tf.cos( tf.subtract(expanded_theta, expanded_theta_s)) # (n_samples, n_atoms, n_atoms, n_atoms, n_theta_s) - # Make the whole cos term of the sum - cos_term = tf.pow(tf.add(tf.ones(tf.shape(cos_theta_term), dtype=tf.float32), cos_theta_term), - zeta) # (n_samples, n_atoms, n_atoms, n_atoms, n_theta_s) + # Make the whole cos term of the sum of shape (n_samples, n_atoms, n_atoms, n_atoms, n_theta_s) + cos_term = tf.pow(tf.divide(tf.add(tf.ones(tf.shape(cos_theta_term), dtype=tf.float32), cos_theta_term), + tf.constant(2, dtype=tf.float32)), zeta) + # cos_term = tf.pow(tf.add(tf.ones(tf.shape(cos_theta_term), dtype=tf.float32), cos_theta_term), + # zeta) # (n_samples, n_atoms, n_atoms, n_atoms, n_theta_s) # Final product of terms inside the sum time by 2^(1-zeta) expanded_fc = tf.expand_dims(tf.expand_dims(cleaner_fc_term, axis=-1), axis=-1, name="Expanded_fc") expanded_cos = tf.expand_dims(cos_term, axis=-2, name="Expanded_cos") expanded_exp = tf.expand_dims(exp_term, axis=-1, name="Expanded_exp") - const = tf.pow(tf.constant(2.0, dtype=tf.float32), (1.0 - zeta)) + const = tf.constant(2.0, dtype=tf.float32) with tf.name_scope("Ang_term"): prod_of_terms = const * tf.multiply(tf.multiply(expanded_cos, expanded_exp), @@ -226,11 +243,17 @@ def sum_rad(pre_sum, Zs, elements_list, radial_rs): Sum of the terms in the radial part of the symmetry function. The terms corresponding to the same neighbour identity are summed together. - :param pre_sum: tf tensor of shape (n_samples, n_atoms, n_atoms, n_rs) - :param Zs: tf tensor of shape (n_samples, n_atoms) - :param elements_list: np.array of shape (n_elements,) - :param radial_rs: tf tensor of shape (n_rad_rs,) - :return: tf tensor of shape (n_samples, n_atoms, n_rad_rd * n_elements) + :param pre_sum: pre-sum radial term + :type pre_sum: tf tensor of shape (n_samples, n_atoms, n_atoms, n_rs) + :param Zs: Atomic number of each atom in each data sample + :type Zs: tf tensor of shape (n_samples, n_atoms) + :param elements_list: unique atoms in all of the samples + :type elements_list: np.array of shape (n_elements,) + :param radial_rs: radial R_s values + :type radial_rs: tf tensor of shape (n_rad_rs,) + + :return: the radial term + :rtype: tf tensor of shape (n_samples, n_atoms, n_rad_rd * n_elements) """ n_atoms = Zs.get_shape().as_list()[1] n_elements = len(elements_list) @@ -268,12 +291,19 @@ def sum_ang(pre_sumterm, Zs, element_pairs_list, angular_rs, theta_s): This function does the sum of the terms in the radial part of the symmetry function. Three body interactions where the two neighbours are the same elements are summed together. - :param pre_sumterm: tf tensor of shape (n_samples, n_atoms, n_ang_rs * n_thetas) - :param Zs: tf tensor of shape (n_samples, n_atoms) - :param element_pairs_list: np array of shape (n_elementpairs, 2) - :param angular_rs: tf tensor of shape (n_ang_rs,) - :param theta_s: tf tensor of shape (n_thetas,) - :return: tf tensor of shape (n_samples, n_atoms, n_ang_rs * n_thetas * n_elementpairs) + :param pre_sumterm: pre-sum angular term + :type pre_sumterm: tf tensor of shape (n_samples, n_atoms, n_ang_rs * n_thetas) + :param Zs: Atomic number of each atom in each data sample + :type Zs: tf tensor of shape (n_samples, n_atoms) + :param element_pairs_list: list of all the atom element pairs + :type element_pairs_list: np array of shape (n_elementpairs, 2) + :param angular_rs: angular R_s values + :type angular_rs: tf tensor of shape (n_ang_rs,) + :param theta_s: angular theta_s values + :type theta_s: tf tensor of shape (n_thetas,) + + :return: the angular term + :rtype: tf tensor of shape (n_samples, n_atoms, n_ang_rs * n_thetas * n_elementpairs) """ n_atoms = Zs.get_shape().as_list()[1] @@ -350,31 +380,48 @@ def sum_ang(pre_sumterm, Zs, element_pairs_list, angular_rs, theta_s): return clean_final_term -def generate_parkhill_acsf(xyzs, Zs, elements, element_pairs, radial_cutoff, angular_cutoff, - radial_rs, angular_rs, theta_s, zeta, eta): +def generate_acsf_tf(xyzs, Zs, elements, element_pairs, rcut, acut, + nRs2, nRs3, nTs, zeta, eta): """ This function generates the atom centred symmetry function as used in the Tensormol paper. Currently only tested for single systems with many conformations. It requires the coordinates of all the atoms in each data sample, the atomic charges for each atom (in the same order as the xyz), the overall elements and overall element pairs. Then it requires the parameters for the ACSF that are used in the Tensormol paper: https://arxiv.org/pdf/1711.06385.pdf - :param xyzs: tensor of shape (n_samples, n_atoms, 3) - :param Zs: tensor of shape (n_samples, n_atoms) - :param elements: np.array of shape (n_elements,) - :param element_pairs: np.array of shape (n_elementpairs, 2) - :param radial_cutoff: scalar float - :param angular_cutoff: scalar float - :param radial_rs: np.array of shape (n_rad_rs,) - :param angular_rs: np.array of shape (n_ang_rs,) - :param theta_s: np.array of shape (n_thetas,) - :param zeta: scalar float - :param eta: scalar float - :return: a tf tensor of shape (n_samples, n_atoms, n_rad_rs * n_elements + n_ang_rs * n_thetas * n_elementpairs) + :param xyzs: Coordinates of each atom in each data sample + :type xyzs: tensor of shape (n_samples, n_atoms, 3) + :param Zs: Atomic number of each atom in each data sample + :type Zs: tensor of shape (n_samples, n_atoms) + :param elements: unique atoms in all of the samples + :type elements: np.array of shape (n_elements,) + :param element_pairs: list of all the atom element pairs + :type element_pairs: np.array of shape (n_elementpairs, 2) + :param rcut: radial cut off length + :type rcut: scalar float + :param acut: angular cut off length + :type acut: scalar float + :param nRs2: number of distance bins to use in the radial term + :type nRs2: positive integer + :param nRs3: number of distance bins to use in the angular term + :type nRs3: positive integer + :param nTs: number of angle bins to use in the angular term + :type nTs: positive integer + :param zeta: parameter in the cosine term + :type zeta: scalar float + :param eta: parameter in the exponential terms + :type eta: scalar float + + :return: the atom centred symmetry functions + :rtype: a tf tensor of shape a tf tensor of shape (n_samples, n_atoms, nRs2 * n_elements + nRs3 * nTs * n_elementpairs) """ + radial_rs = np.linspace(0, rcut, nRs2) + angular_rs = np.linspace(0, acut, nRs3) + theta_s = np.linspace(0, np.pi, nTs) + with tf.name_scope("acsf_params"): - rad_cutoff = tf.constant(radial_cutoff, dtype=tf.float32) - ang_cutoff = tf.constant(angular_cutoff, dtype=tf.float32) + rad_cutoff = tf.constant(rcut, dtype=tf.float32) + ang_cutoff = tf.constant(acut, dtype=tf.float32) rad_rs = tf.constant(radial_rs, dtype=tf.float32) ang_rs = tf.constant(angular_rs, dtype=tf.float32) theta_s = tf.constant(theta_s, dtype=tf.float32) diff --git a/qml/aglaia/tf_utils.py b/qml/aglaia/tf_utils.py index 1bc6567d3..5dc73aea5 100644 --- a/qml/aglaia/tf_utils.py +++ b/qml/aglaia/tf_utils.py @@ -26,7 +26,9 @@ """ import os +import numpy as np import tensorflow as tf +from qml.utils import ceil class TensorBoardLogger(object): """ @@ -71,3 +73,50 @@ def write_weight_histogram(self, weights): def write_cost_summary(self, cost): tf.summary.scalar('cost', cost) + +def generate_weights(n_in, n_out, hl): + + weights = [] + biases = [] + + # Support for linear models + if len(hl) == 0: + # Weights from input layer to first hidden layer + w = tf.Variable(tf.truncated_normal([n_out, n_in], stddev = 1.0, dtype = tf.float32), + dtype = tf.float32, name = "weights_in") + b = tf.Variable(tf.zeros([n_out], dtype = tf.float32), name="bias_in", dtype = tf.float32) + + weights.append(w) + biases.append(b) + + return weights, biases + + # Weights from input layer to first hidden layer + w = tf.Variable(tf.truncated_normal([hl[0], n_in], stddev = 1.0 / np.sqrt(hl[0]), dtype = tf.float32), + dtype = tf.float32, name = "weights_in") + b = tf.Variable(tf.zeros([hl[0]], dtype = tf.float32), name="bias_in", dtype = tf.float32) + + weights.append(w) + biases.append(b) + + # Weights from one hidden layer to the next + for i in range(1, len(hl)): + w = tf.Variable(tf.truncated_normal([hl[i], hl[i-1]], stddev=1.0 / np.sqrt(hl[i-1]), dtype=tf.float32), + dtype=tf.float32, name="weights_hidden_%d" % i) + b = tf.Variable(tf.zeros([hl[i]], dtype=tf.float32), name="bias_hidden_%d" % i, dtype=tf.float32) + + weights.append(w) + biases.append(b) + + + # Weights from last hidden layer to output layer + w = tf.Variable(tf.truncated_normal([n_out, hl[-1]], + stddev=1.0 / np.sqrt(hl[-1]), dtype=tf.float32), + dtype=tf.float32, name="weights_out") + b = tf.Variable(tf.zeros([n_out], dtype=tf.float32), name="bias_out", dtype=tf.float32) + + weights.append(w) + biases.append(b) + + return weights, biases + diff --git a/qml/qmlearn/models.py b/qml/qmlearn/models.py index f53c02d1a..cc6dc7ca8 100644 --- a/qml/qmlearn/models.py +++ b/qml/qmlearn/models.py @@ -26,10 +26,20 @@ from sklearn.base import BaseEstimator from sklearn.metrics import mean_absolute_error -from ..utils import is_numeric_array +try: + import tensorflow as tf + from ..aglaia.tf_utils import generate_weights +except ImportError: + tf = None + +from ..utils import is_numeric_array, is_numeric, is_numeric_1d_array, is_positive_integer_1d_array +from ..utils import get_unique, is_positive_integer_or_zero, get_batch_size, is_positive_integer +from ..utils import is_positive, is_positive_or_zero +from ..utils import InputError from .data import Data from ..math import cho_solve + class _BaseModel(BaseEstimator): """ Base class for all regression models @@ -59,7 +69,7 @@ def score(self, X, y=None): y_pred = self.predict(X) # Get the true values - if is_numeric_array(y): + if is_numeric_1d_array(y): pass elif isinstance(X, Data): @@ -85,18 +95,22 @@ def score(self, X, y=None): elif self.scoring == 'neg_log_mae': return - np.log(mean_absolute_error(y, y_pred)) + def _set_scoring(self, scoring): + if not scoring in ['mae', 'neg_mae', 'rmsd', 'neg_rmsd', 'neg_log_mae']: + raise InputError("Unknown scoring function") + + self.scoring = scoring + class KernelRidgeRegression(_BaseModel): """ - Standard Kernel Ridge Regression using a cholesky solver + Standard Kernel Ridge Regression using a Cholesky solver + :param l2_reg: l2 regularization + :type l2_reg: float + :param scoring: Metric used for scoring ('mae', 'neg_mae', 'rmsd', 'neg_rmsd', 'neg_log_mae') + :type scoring: string """ def __init__(self, l2_reg=1e-10, scoring='neg_mae'): - """ - :param llambda: l2 regularization - :type llambda: float - :param scoring: Metric used for scoring ('mae', 'neg_mae', 'rmsd', 'neg_rmsd', 'neg_log_mae') - :type scoring: string - """ self.l2_reg = l2_reg self.scoring = scoring @@ -159,3 +173,578 @@ def predict(self, X): raise SystemExit return np.dot(K, self.alpha) + +class NeuralNetwork(_BaseModel): + """ + Feed forward neural network that takes molecular or atomic representations for predicting molecular properties. + + :param hl1: number of hidden nodes in the 1st hidden layer + :type hl1: int + :param hl2: number of hidden nodes in the 2nd hidden layer + :type hl2: int + :param hl3: number of hidden nodes in the 3rd hidden layer + :type hl3: int + :param hl4: number of hidden nodes in the 4th hidden layer + :type hl4: int + :param batch_size: Number of samples to have in each batch during training + :type batch_size: int + :param learning_rate: Step size in optimisation algorithm + :type learning_rate: float + :param iterations: Number of iterations to do during training + :type iterations: int + :param l1_reg: L1 regularisation parameter + :type l1_reg: float + :param l2_reg: L2 regularisation parameter + :type l2_reg: float + :param scoring: What function to use for scoring. Available options are "neg_mae", "mae", "rmsd", "neg_rmsd", + "neg_log_mae" + :type scoring: string + :param size: Maximum number of atoms in a molecule to support. 'auto' will determine this automatically. + Is ignored when using molecular representations. + :type size: int + """ + + def __init__(self, hl1=20, hl2=10, hl3=5, hl4=0, batch_size=200, learning_rate=0.001, iterations=500, l1_reg=0.0, + l2_reg=0.0, scoring="neg_mae", size='auto'): + + # Check if tensorflow is available + self.tf_check() + + self._set_hl(hl1, hl2, hl3, hl4) + self._set_batch_size(batch_size) + self._set_learning_rate(learning_rate) + self._set_iterations(iterations) + self._set_reg(l1_reg, l2_reg) + self._set_scoring(scoring) + self._set_size(size) + + # Will be overwritten at fit time + self.elements = None + self._representation_type = None + self._constant_features = None + + def _set_hl(self, hl1, hl2, hl3, hl4): + + for hl in hl1, hl2, hl3, hl4: + if not is_positive_integer_or_zero(hl1): + print("Error: Expected the number of hidden nodes in a layer to be positive. Got %s" % str(hl)) + raise SystemExit + + self.hl1 = hl1 + self.hl2 = hl2 + self.hl3 = hl3 + self.hl4 = hl4 + + def _set_batch_size(self, bs): + if not is_positive_integer(bs) or int(bs) == 1: + print("'batch_size' should be larger than 1.") + raise SystemExit + + self.batch_size = bs + + def _set_size(self, size): + if not is_positive_integer(size) and size != 'auto': + print("Variable 'size' should be a positive integer. Got %s" % str(size)) + raise SystemExit + + self.size = size + + def _set_learning_rate(self, lr): + if not is_positive(lr): + print("Expected positive float value for variable 'learning_rate'. Got %s" % str(lr)) + raise SystemExit + + self.learning_rate = lr + + def _set_iterations(self, it): + if not is_positive_integer(it): + print("Expected positive integer value for variable 'iterations'. Got %s" % str(it)) + raise SystemExit + + self.iterations = it + + def _set_reg(self, l1_reg, l2_reg): + if not is_positive_or_zero(l1_reg) or not is_positive_or_zero(l2_reg): + raise InputError("Expected positive float value for regularisation variables 'l1_reg' and 'l2_reg. Got %s and %s" % (str(l1_reg), str(l2_reg))) + + self.l1_reg = l1_reg + self.l2_reg = l2_reg + + def fit(self, X, y=None, nuclear_charges=None): + """ + Fit the neural network. A molecular/atomic network will be fit if + molecular/atomic representations are given. + + :param X: Data object or representations + :type X: Data object or array + :param y: Energies. Only used if X is representations. + :type y: numpy array of shape (n_samples,) + :param nuclear_charges: Nuclear charges. Only used if X is representations. + :type nuclear_charges: array + """ + + # Obtaining the representations and the energies from the data object + if isinstance(X, Data): + if y is not None or nuclear_charges is not None: + print("Parameter 'y' and 'nuclear_charges' are expected to be 'None'", + "when 'X' is a Data object") + raise SystemExit + + representations = X._representations + nuclear_charges = X.nuclear_charges[X._indices] + # 2D for tensorflow + energies = np.reshape(X.energies[X._indices], (-1,1)) + self._representation_type = X._representation_type + else: + # y must be an array + if not is_numeric_1d_array(y): + print("Expected parameter 'y' to be a numeric 1d-array. Got %s" % type(y)) + raise SystemExit + + # 2D for tensorflow + energies = np.reshape(y, (-1,1)) + representations = np.asarray(X) + + # Check if the representations are molecular or atomic + if len(representations) > 0 and len(representations[0]) > 0: + item = representations[0][0] + if is_numeric(item): + self._representation_type = 'molecular' + elif is_numeric_1d_array(item): + self._representation_type = 'atomic' + else: + print("Could not recognise format of representations (parameter 'X')") + raise SystemExit + else: + print("Could not recognise format of representations (parameter 'X')") + raise SystemExit + + # Check that nuclear charges are also provided if the representation is atomic + if self._representation_type == 'atomic' and nuclear_charges is None: + print("Expected parameter 'nuclear_charges' to be a numeric array.") + raise SystemExit + + if self._representation_type == 'atomic': + representations, nuclear_charges = self._padding(representations, nuclear_charges) + + # Get all unique elements + self.elements = get_unique(nuclear_charges) + + # Remove constant features + representations = self._remove_constant_features(representations) + + # Generate the model + self._generate_model(representations.shape) + # Train the model + self._train(representations, energies, nuclear_charges) + + def _remove_constant_features(self, representations): + """ + Removes constant features of the representations, and stores + which features should be removed at predict time. + """ + + if self._representation_type == 'atomic': + # Due to how the atomic neural network iś constructed, + # this cannot be done elementwise + rep = representations.reshape(-1, representations.shape[-1]) + if self._constant_features is None: + self._constant_features = np.all(rep == rep[0], axis=0) + + rep = rep[:,~self._constant_features] + rep = rep.reshape(representations.shape[0:2] + (-1,)) + + return rep + + else: + if self._constant_features is None: + self._constant_features = np.all(representations == representations[0], axis=0) + + return representations[:,~self._constant_features] + + def predict(self, X, nuclear_charges=None): + """ + Predict the molecular properties from a trained model. + + :param X: Data object + :type X: object from class Data + :param nuclear_charges: Nuclear charges. Only used if X is representations. + :type nuclear_charges: array + :return: predicted properties + :rtype: numpy array of shape (n_samples,) + """ + + if self._representation_type is None: + print("The model have not been fitted") + raise SystemExit + + if isinstance(X, Data): + if self._representation_type == 'molecular' and nuclear_charges is not None: + print("Parameter 'nuclear_charges' is expected to be 'None'", + "when 'X' is a Data object") + raise SystemExit + + representations, nuclear_charges = X._representations, X.nuclear_charges[X._indices] + else: + representations = X + + if self._representation_type == 'atomic' and nuclear_charges is None: + print("Parameter 'nuclear_charges' is expected to be numeric array") + raise SystemExit + + if self._representation_type == "atomic": + representations, nuclear_charges = self._padding(representations, nuclear_charges) + + # Remove constant features + representations = self._remove_constant_features(representations) + + y_pred = self._predict(representations, nuclear_charges) + + return y_pred + + # TODO support for different activation functions + def _generate_model(self, shape): + """ + Generate the Tensorflow graph for the molecular feed forward neural net. + + :param shape: Shape of representations + :type shape: tuple + """ + + n_features = shape[-1] + + self.graph = tf.Graph() + + hidden_layers = [] + for item in [self.hl1, self.hl2, self.hl3, self.hl4]: + if item != 0: + hidden_layers.append(int(item)) + else: + break + + # Initial set up of the NN + with self.graph.as_default(): + with tf.name_scope("Data"): + ph_y = tf.placeholder(tf.float32, [None, 1], name="True_energies") + if self._representation_type == 'atomic': + n_atoms = shape[1] + ph_zs = tf.placeholder(dtype=tf.int32, shape=[None, n_atoms], name="Atomic-numbers") + ph_x = tf.placeholder(tf.float32, [None, n_atoms, n_features], name="Representation") + else: + ph_x = tf.placeholder(tf.float32, [None, n_features], name="Representation") + # Has to be int64 for tf.data.Dataset + batch_size_tf = tf.placeholder(dtype=tf.int64, name="Batch_size") + buffer_tf = tf.placeholder(dtype=tf.int64, name="Buffer") + + if self._representation_type == 'atomic': + dataset = tf.data.Dataset.from_tensor_slices((ph_x, ph_zs, ph_y)) + else: + dataset = tf.data.Dataset.from_tensor_slices((ph_x, ph_y)) + + dataset = dataset.shuffle(buffer_size=buffer_tf) + dataset = dataset.batch(batch_size_tf) + + iterator = tf.data.Iterator.from_structure(dataset.output_types, dataset.output_shapes) + + if self._representation_type == 'atomic': + tf_x, tf_zs, tf_y = iterator.get_next() + else: + tf_x, tf_y = iterator.get_next() + + with tf.name_scope("Weights"): + if self._representation_type == 'atomic': + weights = {} + biases = {} + + for i, element in enumerate(self.elements): + w, b = generate_weights(n_in=n_features, n_out=1, hl=hidden_layers) + weights[element] = w + biases[element] = b + else: + weights, biases = generate_weights(n_in=n_features, n_out=1, hl=hidden_layers) + + with tf.name_scope("Model"): + if self._representation_type == 'atomic': + all_atomic_energies = tf.zeros_like(tf_zs, dtype=tf.float32) + + for el in self.elements: + # Obtaining the indices where the current element occurs + current_element = tf.expand_dims(tf.constant(el, dtype=tf.int32), axis=0) + where_element = tf.cast(tf.where(tf.equal(tf_zs, current_element)), dtype=tf.int32) + + # Extract the descriptor corresponding to the right element + current_element_in_x = tf.gather_nd(tf_x, where_element) + + # Calculate the atomic energy of all the atoms of type equal to the current element + atomic_ene = self._atomic_nn(current_element_in_x, hidden_layers, weights[el], + biases[el]) + + # Put the atomic energies in a zero array with shape equal to zs and then add it to all the atomic energies + updates = tf.scatter_nd(where_element, atomic_ene, tf.shape(tf_zs)) + all_atomic_energies = tf.add(all_atomic_energies, updates) + + # Summing the energies of all the atoms + y_pred = tf.reduce_sum(all_atomic_energies, axis=-1, name="Predicted_energies", keepdims=True) + + else: + if len(hidden_layers) > 0: + z = tf.add(tf.matmul(tf_x, tf.transpose(weights[0])), biases[0]) + h = tf.sigmoid(z) + + # Calculate the activation of the remaining hidden layers + for i in range(1, len(weights) - 1): + z = tf.add(tf.matmul(h, tf.transpose(weights[i])), biases[i]) + h = tf.sigmoid(z) + + # Calculating the output of the last layer + y_pred = tf.add(tf.matmul(h, tf.transpose(weights[-1])), biases[-1], name="Predicted_energies") + + with tf.name_scope("Cost_func"): + cost = self._cost(y_pred, tf_y, weights) + + with tf.name_scope("Optimiser"): + optimisation_op = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(cost) + + iter_init_op = iterator.make_initializer(dataset, name="dataset_init") + + def _atomic_nn(self, x, hidden_layer_sizes, weights, biases): + """ + Constructs the atomic part of the network. It calculates the atomic property for all atoms and then sums them + together to obtain the molecular property. + + :param x: Atomic representation + :type x: tf tensor of shape (n_samples, n_atoms, n_features) + :param weights: Weights used in the network for a particular element. + :type weights: list of tf.Variables of length hidden_layer_sizes.size + 1 + :param biases: Biases used in the network for a particular element. + :type biases: list of tf.Variables of length hidden_layer_sizes.size + 1 + :return: Output + :rtype: tf.Variable of size (n_samples, n_atoms) + """ + + # Calculate the activation of the first hidden layer + z = tf.add(tf.tensordot(x, tf.transpose(weights[0]), axes=1), biases[0]) + h = tf.sigmoid(z) + + # Calculate the activation of the remaining hidden layers + for i in range(len(hidden_layer_sizes) - 1): + z = tf.add(tf.tensordot(h, tf.transpose(weights[i + 1]), axes=1), biases[i + 1]) + h = tf.sigmoid(z) + + # Calculating the output of the last layer + z = tf.add(tf.tensordot(h, tf.transpose(weights[-1]), axes=1), biases[-1]) + + z_squeezed = tf.squeeze(z, axis=[-1]) + + return z_squeezed + + def _train(self, representations, energies, nuclear_charges): + """ + Train the neural network. + + :param representations: the representations of the molecules (either atomic or molecular) + :type representations: numpy array of shape (n_samples, max_n_atoms, n_features) or (n_samples, n_features) + :param energies: the true molecular properties to be learnt + :type energies: numpy array of shape (n_samples, ) + :param nuclear_charges: the nuclear charges of all the atoms in the molecules + :type nuclear_charges: numpy array of shape (n_samples, max_n_atoms) + :param representation_type: flag setting whether the representation is "atomic" or "molecular". + :type representation_type: string + """ + + batch_size = get_batch_size(self.batch_size, representations.shape[0]) + + with self.graph.as_default(): + tf_x = self.graph.get_tensor_by_name("Data/Representation:0") + tf_y = self.graph.get_tensor_by_name("Data/True_energies:0") + if self._representation_type == "atomic": + tf_zs = self.graph.get_tensor_by_name("Data/Atomic-numbers:0") + batch_size_tf = self.graph.get_tensor_by_name("Data/Batch_size:0") + buffer_tf = self.graph.get_tensor_by_name("Data/Buffer:0") + + optimisation_op = self.graph.get_operation_by_name("Optimiser/Adam") + iter_init_op = self.graph.get_operation_by_name("dataset_init") + + self.session = tf.Session(graph=self.graph) + + self.session.run(tf.global_variables_initializer()) + + for i in range(self.iterations): + + # Alternate buffer size to improve global shuffling (Not sure that this is needed) + if i % 2 == 0: + buff = int(3.5 * batch_size) + else: + buff = int(4.5 * batch_size) + + if self._representation_type == "atomic": + self.session.run(iter_init_op, feed_dict={tf_x: representations, tf_y: energies, tf_zs: nuclear_charges, buffer_tf: buff, + batch_size_tf: batch_size}) + else: + self.session.run(iter_init_op, feed_dict={tf_x: representations, tf_y: energies, buffer_tf: buff, + batch_size_tf:batch_size}) + + # Iterate to the end of the given data + while True: + try: + self.session.run(optimisation_op) + except tf.errors.OutOfRangeError: + break + + def _cost(self, y_pred, y, weights): + """ + Constructs the cost function + + :param y_pred: Predicted molecular properties + :type y_pred: tf.Variable of size (n_samples, 1) + :param y: True molecular properties + :type y: tf.placeholder of shape (n_samples, 1) + :param weights: Weights used in the network. + :type weights: list of tf.Variables of length hidden_layer_sizes.size + 1 + :return: Cost + :rtype: tf.Variable of size (1,) + """ + + err = tf.square(tf.subtract(y, y_pred)) + loss = tf.reduce_mean(err, name="loss") + + if self._representation_type == 'atomic': + cost = loss + if self.l2_reg > 0: + l2_loss = 0 + for element in weights: + l2_loss += self._l2_loss(weights[element]) + cost += l2_loss + if self.l1_reg > 0: + l1_loss = 0 + for element in weights: + l1_loss += self._l1_loss(weights[element]) + cost += l1_loss + else: + cost = loss + if self.l2_reg > 0: + l2_loss = self._l2_loss(weights) + cost += l2_loss + if self.l1_reg > 0: + l1_loss = self._l1_loss(weights) + cost += l1_loss + + return cost + + def _l2_loss(self, weights): + """ + Creates the expression for L2-regularisation on the weights + + :param weights: tensorflow tensors representing the weights + :type weights: list of tf tensors + :return: tensorflow scalar representing the regularisation contribution to the cost function + :rtype: tf.float32 + """ + + reg_term = tf.zeros([], name="l2_loss") + + for i in range(len(weights)): + reg_term += tf.reduce_sum(tf.square(weights[i])) + + return self.l2_reg * reg_term + + def _l1_loss(self, weights): + """ + Creates the expression for L1-regularisation on the weights + + :param weights: tensorflow tensors representing the weights + :type weights: list of tf tensors + :return: tensorflow scalar representing the regularisation contribution to the cost function + :rtype: tf.float32 + """ + + reg_term = tf.zeros([], name="l1_loss") + + for i in range(len(weights)): + reg_term += tf.reduce_sum(tf.abs(weights[i])) + + return self.l1_reg * reg_term + + def _predict(self, representations, nuclear_charges=None): + """ + Predicts the molecular properties from the representations. + + :param representations: representations of the molecules, can be atomic or molecular + :type representations: numpy array of shape (n_samples, n_max_atoms, n_features) or (n_samples, n_features) + :param nuclear_charges: nuclear charges of the molecules. Only required for atomic representations. + :type nuclear_charges: numpy array of shape (n_samples, n_max_atoms) + :return: molecular properties predictions + :rtype: numpy array of shape (n_samples,) + """ + + batch_size = get_batch_size(self.batch_size, representations.shape[0]) + + with self.graph.as_default(): + tf_x = self.graph.get_tensor_by_name("Data/Representation:0") + tf_y = self.graph.get_tensor_by_name("Data/True_energies:0") + if self._representation_type == "atomic": + tf_zs = self.graph.get_tensor_by_name("Data/Atomic-numbers:0") + batch_size_tf = self.graph.get_tensor_by_name("Data/Batch_size:0") + buffer_tf = self.graph.get_tensor_by_name("Data/Buffer:0") + iter_init_op = self.graph.get_operation_by_name("dataset_init") + + y_pred = self.graph.get_tensor_by_name("Model/Predicted_energies:0") + + if self._representation_type == "atomic": + self.session.run(iter_init_op, feed_dict={tf_x: representations, tf_y: np.empty((representations.shape[0], 1)), + tf_zs: nuclear_charges, buffer_tf: 1, batch_size_tf: batch_size}) + else: + self.session.run(iter_init_op, + feed_dict={tf_x: representations, tf_y: np.empty((representations.shape[0], 1)), + batch_size_tf: batch_size, buffer_tf:1}) + + tot_y_pred = [] + + # Predict until reaching the end of the iterator + while True: + try: + ene_predictions = self.session.run(y_pred) + tot_y_pred.append(ene_predictions) + except tf.errors.OutOfRangeError: + break + + return np.concatenate(tot_y_pred, axis=0).ravel() + + def _padding(self, representation, nuclear_charges): + """ + This function takes atomic representations for molecules of different sizes and pads them with 0 so that they + all have the same size. + + :param representation: list of numby arrays of shape (n_atoms, n_features) + :param nuclear_charges: list of numpy arrays of shape (n_atoms,) + :return: numpy array of shape (n_samples, max_n_atoms, n_features) and (n_samples, max_n_atoms) + """ + + max_n_atoms = 0 + + for i in range(len(representation)): + n_atoms = representation[i].shape[0] + if n_atoms > max_n_atoms: + max_n_atoms = n_atoms + + if self.size == 'auto': + self.size = max_n_atoms + + elif max_n_atoms > self.size: + print("Trying to predict on larger molecules than given by the 'size' parameter at initialization") + raise SystemExit + + padded_rep = np.zeros((len(representation), max_n_atoms, representation[0].shape[1])) + padded_zs = np.zeros((len(representation), max_n_atoms)) + + for i in range(len(representation)): + n_atoms = representation[i].shape[0] + padded_rep[i, :n_atoms] = representation[i] + padded_zs[i, :n_atoms] = nuclear_charges[i] + + return padded_rep, padded_zs + + def tf_check(self): + if tf is None: + print("Tensorflow not found but is needed for %s" % self.__class__.__name__) + raise SystemExit diff --git a/qml/qmlearn/representations.py b/qml/qmlearn/representations.py index 5cece8918..fb9c13bde 100644 --- a/qml/qmlearn/representations.py +++ b/qml/qmlearn/representations.py @@ -153,48 +153,44 @@ class _AtomicRepresentation(_BaseRepresentation): class CoulombMatrix(_MolecularRepresentation): """ Coulomb Matrix representation as described in 10.1103/PhysRevLett.108.058301 + Sorting of the elements can either be done by ``sorting="row-norm"`` or ``sorting="unsorted"``. + A matrix :math:`M` is constructed with elements - """ - - _representation_short_name = "cm" + .. math:: - def __init__(self, data=None, size='auto', sorting="row-norm"): - """ - Coulomb Matrix representation of a molecule. - Sorting of the elements can either be done by ``sorting="row-norm"`` or ``sorting="unsorted"``. - A matrix :math:`M` is constructed with elements + M_{ij} = + \\begin{cases} + \\tfrac{1}{2} Z_{i}^{2.4} & \\text{if } i = j \\\\ + \\frac{Z_{i}Z_{j}}{\\| {\\bf R}_{i} - {\\bf R}_{j}\\|} & \\text{if } i \\neq j + \\end{cases}, - .. math:: + where :math:`i` and :math:`j` are atom indices, :math:`Z` is nuclear charge and + :math:`\\bf R` is the coordinate in euclidean space. + If ``sorting = 'row-norm'``, the atom indices are reordered such that - M_{ij} = - \\begin{cases} - \\tfrac{1}{2} Z_{i}^{2.4} & \\text{if } i = j \\\\ - \\frac{Z_{i}Z_{j}}{\\| {\\bf R}_{i} - {\\bf R}_{j}\\|} & \\text{if } i \\neq j - \\end{cases}, + :math:`\\sum_j M_{1j}^2 \\geq \\sum_j M_{2j}^2 \\geq ... \\geq \\sum_j M_{nj}^2` - where :math:`i` and :math:`j` are atom indices, :math:`Z` is nuclear charge and - :math:`\\bf R` is the coordinate in euclidean space. - If ``sorting = 'row-norm'``, the atom indices are reordered such that + The upper triangular of M, including the diagonal, is concatenated to a 1D + vector representation. - :math:`\\sum_j M_{1j}^2 \\geq \\sum_j M_{2j}^2 \\geq ... \\geq \\sum_j M_{nj}^2` + If ``sorting = 'unsorted``, the elements are sorted in the same order as the input coordinates + and nuclear charges. - The upper triangular of M, including the diagonal, is concatenated to a 1D - vector representation. + The representation is calculated using an OpenMP parallel Fortran routine. - If ``sorting = 'unsorted``, the elements are sorted in the same order as the input coordinates - and nuclear charges. + :param size: The size of the largest molecule supported by the representation. + `size='auto'` will try to determine this automatically. + :type size: integer + :param sorting: How the atom indices are sorted ('row-norm', 'unsorted') + :type sorting: string + :param data: Optional Data object containing all molecules used in training \ + and/or prediction + :type data: Data object + """ - The representation is calculated using an OpenMP parallel Fortran routine. + _representation_short_name = "cm" - :param size: The size of the largest molecule supported by the representation. - `size='auto'` will try to determine this automatically. - :type size: integer - :param sorting: How the atom indices are sorted ('row-norm', 'unsorted') - :type sorting: string - :param data: Optional Data object containing all molecules used in training \ - and/or prediction - :type data: Data object - """ + def __init__(self, data=None, size='auto', sorting="row-norm"): self.size = size self.sorting = sorting @@ -632,28 +628,26 @@ def transform(self, X): class AtomCenteredSymmetryFunctions(_AtomicRepresentation): """ The variant of Atom-Centered Symmetry Functions used in 10.1039/C7SC04934J + :param data: Optional Data object containing all molecules used in training \ + and/or prediction + :type data: Data object + :param nbasis: Number of basis functions to use + :type nbasis: integer + :param cutoff: Cutoff radius + :type cutoff: float + :param precision: Precision in the basis functions. A value of 2 corresponds to the \ + basis functions intersecting at half maximum. Higher values makes \ + the basis functions narrower. + :type precision: float + :param elements: Atomnumber of elements that the representation should support. + `elements='auto'` will try to determine this automatically. + :type elements: list """ _representation_short_name = "acsf" alchemy = False def __init__(self, data=None, nbasis=3, precision=2, cutoff=5.0, elements='auto'): - """ - :param data: Optional Data object containing all molecules used in training \ - and/or prediction - :type data: Data object - :param nbasis: Number of basis functions to use - :type nbasis: integer - :param cutoff: Cutoff radius - :type cutoff: float - :param precision: Precision in the basis functions. A value of 2 corresponds to the \ - basis functions intersecting at half maximum. Higher values makes \ - the basis functions narrower. - :type precision: float - :param elements: Atomnumber of elements that the representation should support. - `elements='auto'` will try to determine this automatically. - :type elements: list - """ self._set_data(data) self.nbasis = nbasis @@ -712,9 +706,9 @@ def transform(self, X): representations = [] for charge, xyz, n in zip(nuclear_charges, coordinates, natoms): - representations.append( + representations.append(np.asarray( fgenerate_acsf(xyz, charge, self.elements, Rs, Rs, Ts, - eta, eta, zeta, self.cutoff, self.cutoff, n, size)) + eta, eta, zeta, self.cutoff, self.cutoff, n, size))) data._representations = np.asarray(representations) diff --git a/qml/utils/utils.py b/qml/utils/utils.py index 05b96cbd5..ce510c74c 100644 --- a/qml/utils/utils.py +++ b/qml/utils/utils.py @@ -24,10 +24,10 @@ import numpy as np def is_positive(x): - return (not is_array_like(x) and _is_numeric(x) and x > 0) + return (not is_array_like(x) and is_numeric(x) and x > 0) def is_positive_or_zero(x): - return (not is_array_like(x) and _is_numeric(x) and x >= 0) + return (not is_array_like(x) and is_numeric(x) and x >= 0) def is_array_like(x): return isinstance(x, (tuple, list, np.ndarray)) @@ -44,7 +44,7 @@ def is_string(x): def is_dict(x): return isinstance(x, dict) -def _is_numeric(x): +def is_numeric(x): return isinstance(x, (float, int)) def is_numeric_array(x): @@ -56,8 +56,17 @@ def is_numeric_array(x): return False return False +def is_numeric_1d_array(x): + return is_numeric_array(x) and is_1d_array(x) + +# Accepts 2d arrays of shape (n,1) and (1,n) as well +def is_1d_array(x): + return is_array_like(x) and (np.asarray(x).ndim == 1 or np.asarray(x).ndim == 2 and 1 in np.asarray(x).shape) + +# Doesn't accept floats e.g. 1.0 def _is_integer(x): - return (_is_numeric(x) and (float(x) == int(x))) + return isinstance(x, int) + #return (is_numeric(x) and (float(x) == int(x))) # will intentionally accept 0, 1 as well def is_bool(x): @@ -82,31 +91,15 @@ def _is_integer_array(x): return True return False +def is_positive_integer_1d_array(x): + return is_positive_integer_array(x) and is_1d_array(x) + def is_positive_integer_array(x): return (_is_integer_array(x) and _is_positive_array(x)) def is_positive_integer_or_zero_array(x): return (_is_integer_array(x) and _is_positive_or_zero_array(x)) -def get_unique(x): - """ - Gets all unique elements in lists of lists - """ - elements = list(set(item for l in x for item in l)) - return elements - -def get_pairs(x): - """ - Get all unique pairs. E.g. x = [1,2,3] will return - [[1, 1], [1, 2], [1, 3], [2, 2], [2, 3], [3, 3]] - """ - pairs = [] - for i,v in enumerate(x): - for w in x[i:]: - pairs.append([v,w]) - return pairs - - # ------------- ** Checking inputs ** -------------------------- def check_global_representation(x): @@ -248,7 +241,7 @@ def check_classes(classes): if not is_array_like(classes): raise InputError("classes should be array like.") - if not is_positive_integer_array(classes): + if not is_positive_integer_or_zero_array(classes): raise InputError("classes should be an array of ints.") classes = np.asarray(classes) @@ -259,84 +252,25 @@ def check_classes(classes): return approved_classes -# -#def _is_numeric_array(x): -# try: -# arr = np.asarray(x, dtype = float) -# return True -# except (ValueError, TypeError): -# return False -# -#def _is_numeric_scalar(x): -# try: -# float(x) -# return True -# except (ValueError, TypeError): -# return False -# -#def is_positive(x): -# if is_array(x) and _is_numeric_array(x): -# return _is_positive_scalar(x) -# -#def _is_positive_scalar(x): -# return float(x) > 0 -# -#def _is_positive_array(x): -# return np.asarray(x, dtype = float) > 0 -# -#def is_positive_or_zero(x): -# if is_numeric(x): -# if is_array(x): -# return is_positive_or_zero_array(x) -# else: -# return is_positive_or_zero_scalar(x) -# else: -# return False -# -#def is_positive_or_zero_array(x): -# -# -#def is_positive_or_zero_scalar(x): -# return float(x) >= 0 -# -#def is_integer(x): -# if is_array(x) -# return is_integer_array(x) -# else: -# return is_integer_scalar(x) -# -## will intentionally accept floats with integer values -#def is_integer_array(x): -# if is_numeric(x): -# return (np.asarray(x) == np.asarray(y)).all() -# else: -# return False -# -## will intentionally accept floats with integer values -#def is_integer_scalar(x): -# if is_numeric(x): -# return int(float(x)) == float(x) -# else: -# return False -# -# -#def is_string(x): -# return isinstance(x, str) -# -#def is_positive_integer(x): -# return (is_numeric(x) and is_integer(x) and is_positive(x)) -# -#def is_positive_integer_or_zero(x): -# return (is_numeric(x) and is_integer(x) and is_positive_or_zero(x)) -# -#def is_negative_integer(x): -# if is_integer(x): -# return not is_positive(x) -# else: -# return False -# -#def is_non_zero_integer(x): -# return (is_positive_integer(x) or is_negative_integer(x)) +# ------------ ** Utility functions ** ---------------- + +def get_unique(x): + """ + Gets all unique elements in lists of lists + """ + elements = list(set(item for l in x for item in l)) + return sorted(elements) + +def get_pairs(x): + """ + Get all unique pairs. E.g. x = [1,2,3] will return + [[1, 1], [1, 2], [1, 3], [2, 2], [2, 3], [3, 3]] + """ + pairs = [] + for i,v in enumerate(x): + for w in x[i:]: + pairs.append([v,w]) + return pairs # Custom exception to raise when we intentinoally catch an error @@ -355,3 +289,17 @@ def ceil(a, b): """ return -(-a//b) + +def get_batch_size(batch_size, n_samples): + + if batch_size > n_samples: + print("Warning: batch_size larger than sample size. It is going to be clipped") + return min(n_samples, batch_size) + + # see if the batch size can be modified slightly to make sure the last batch is similar in size + # to the rest of the batches + # This is always less that the requested batch size, so no memory issues should arise + + better_batch_size = ceil(n_samples, ceil(n_samples, batch_size)) + return better_batch_size + diff --git a/setup.py b/setup.py index 0f30de11f..c33bbfde6 100755 --- a/setup.py +++ b/setup.py @@ -21,6 +21,7 @@ COMPILER_FLAGS = ["-O3", "-fopenmp", "-m64", "-march=native", "-fPIC", "-Wno-maybe-uninitialized", "-Wno-unused-function", "-Wno-cpp"] LINKER_FLAGS = ["-lgomp"] +# MATH_LINKER_FLAGS = ["-lblas", "-llapack", "-latlas", "-fopenmp"] MATH_LINKER_FLAGS = ["-lblas", "-llapack"] # UNCOMMENT TO FORCE LINKING TO MKL with GNU compilers: @@ -32,6 +33,7 @@ if sys.platform == "darwin" and all(["gnu" not in arg for arg in sys.argv]): COMPILER_FLAGS = ["-O3", "-m64", "-march=native", "-fPIC"] LINKER_FLAGS = [] + # MATH_LINKER_FLAGS = ["-lblas", "-llapack", "-latlas", "-fopenmp"] MATH_LINKER_FLAGS = ["-lblas", "-llapack"] diff --git a/test/test_acsf.py b/test/test_acsf.py new file mode 100644 index 000000000..fb29dd006 --- /dev/null +++ b/test/test_acsf.py @@ -0,0 +1,136 @@ +# MIT License +# +# Copyright (c) 2018 Silvia Amabilino +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +""" +This file contains tests for the tensorflow atom centred symmetry function module. It uses the numpy implementation +as a comparison. +""" + +import tensorflow as tf +import numpy as np + +from qml.aglaia import symm_funct +from qml.aglaia import np_symm_funct +import os + + +def test_acsf_1(): + """ + This test compares the atom centred symmetry functions generated with tensorflow and numpy. + The test system consists of 5 configurations of CH4 + CN radical. + :return: None + """ + + test_dir = os.path.dirname(os.path.realpath(__file__)) + + nRs2 = 3 + nRs3 = 3 + nTs = 3 + rcut = 5 + acut = 5 + zeta = 220.127 + eta = 30.8065 + + input_data = test_dir + "/data/data_test_acsf.npz" + data = np.load(input_data) + + xyzs = data["arr_0"] + zs = data["arr_1"] + elements = data["arr_2"] + element_pairs = data["arr_3"] + + n_samples = xyzs.shape[0] + n_atoms = zs.shape[1] + + with tf.name_scope("Inputs"): + zs_tf = tf.placeholder(shape=[n_samples, n_atoms], dtype=tf.int32, name="zs") + xyz_tf = tf.placeholder(shape=[n_samples, n_atoms, 3], dtype=tf.float32, name="xyz") + + acsf_tf_t = symm_funct.generate_acsf_tf(xyz_tf, zs_tf, elements, element_pairs, rcut, acut, nRs2, nRs3, nTs, zeta, eta) + + sess = tf.Session() + sess.run(tf.global_variables_initializer()) + acsf_tf = sess.run(acsf_tf_t, feed_dict={xyz_tf: xyzs, zs_tf: zs}) + + acsf_np = np_symm_funct.generate_acsf_np(xyzs, zs, elements, element_pairs, rcut, acut, nRs2, nRs3, nTs, zeta, eta) + + n_samples = xyzs.shape[0] + n_atoms = xyzs.shape[1] + + for i in range(n_samples): + for j in range(n_atoms): + acsf_np_sort = np.sort(acsf_np[i][j]) + acsf_tf_sort = np.sort(acsf_tf[i][j]) + np.testing.assert_array_almost_equal(acsf_np_sort, acsf_tf_sort, decimal=4) + +def test_acsf_2(): + """ + This test compares the atom centred symmetry functions generated with tensorflow and numpy. + The test system consists of 10 molecules from the QM7 data set. + :return: None + """ + test_dir = os.path.dirname(os.path.realpath(__file__)) + + nRs2 = 3 + nRs3 = 3 + nTs = 3 + rcut = 5 + acut = 5 + zeta = 220.127 + eta = 30.8065 + + input_data = test_dir + "/data/qm7_testdata.npz" + data = np.load(input_data) + + xyzs = data["arr_0"] + zs = data["arr_1"] + elements = data["arr_2"] + element_pairs = data["arr_3"] + + n_samples = xyzs.shape[0] + max_n_atoms = zs.shape[1] + + with tf.name_scope("Inputs"): + zs_tf = tf.placeholder(shape=[n_samples, max_n_atoms], dtype=tf.int32, name="zs") + xyz_tf = tf.placeholder(shape=[n_samples, max_n_atoms, 3], dtype=tf.float32, name="xyz") + + acsf_tf_t = symm_funct.generate_acsf_tf(xyz_tf, zs_tf, elements, element_pairs, rcut, acut, nRs2, nRs3, nTs, zeta, eta) + + sess = tf.Session() + sess.run(tf.global_variables_initializer()) + acsf_tf = sess.run(acsf_tf_t, feed_dict={xyz_tf: xyzs, zs_tf: zs}) + + acsf_np = np_symm_funct.generate_acsf_np(xyzs, zs, elements, element_pairs, rcut, acut, nRs2, nRs3, nTs, zeta, eta) + + for i in range(n_samples): + for j in range(max_n_atoms): + if zs[i][j] == 0: + continue + else: + acsf_np_sort = np.sort(acsf_np[i][j]) + acsf_tf_sort = np.sort(acsf_tf[i][j]) + np.testing.assert_array_almost_equal(acsf_np_sort, acsf_tf_sort, decimal=4) + + +if __name__ == "__main__": + test_acsf_1() + test_acsf_2() \ No newline at end of file diff --git a/test/test_armp.py b/test/test_armp.py index 4c6fb10ab..7cb184c0b 100644 --- a/test/test_armp.py +++ b/test/test_armp.py @@ -30,25 +30,26 @@ from qml.utils import InputError import glob import os +import shutil def test_set_representation(): """ This function tests the function _set_representation. """ try: - ARMP(representation='slatm', representation_params={'slatm_sigma12': 0.05}) + ARMP(representation_name='slatm', representation_params={'slatm_sigma12': 0.05}) raise Exception except InputError: pass try: - ARMP(representation='coulomb_matrix') + ARMP(representation_name='coulomb_matrix') raise Exception except InputError: pass try: - ARMP(representation='slatm', representation_params={'slatm_alchemy': 0.05}) + ARMP(representation_name='slatm', representation_params={'slatm_alchemy': 0.05}) raise Exception except InputError: pass @@ -56,7 +57,7 @@ def test_set_representation(): parameters = {'slatm_sigma1': 0.07, 'slatm_sigma2': 0.04, 'slatm_dgrid1': 0.02, 'slatm_dgrid2': 0.06, 'slatm_rcut': 5.0, 'slatm_rpower': 7, 'slatm_alchemy': True} - estimator = ARMP(representation='slatm', representation_params=parameters) + estimator = ARMP(representation_name='slatm', representation_params=parameters) assert estimator.representation_name == 'slatm' assert estimator.slatm_parameters == parameters @@ -71,7 +72,7 @@ def test_set_properties(): energies = np.loadtxt(test_dir + '/CN_isobutane/prop_kjmol_training.txt', usecols=[1]) - estimator = ARMP(representation='slatm') + estimator = ARMP(representation_name='slatm') assert estimator.properties == None @@ -119,7 +120,7 @@ def test_fit_1(): usecols=[1]) filenames.sort() - estimator = ARMP(representation="acsf") + estimator = ARMP(representation_name="acsf") estimator.generate_compounds(filenames[:50]) estimator.set_properties(energies[:50]) estimator.generate_representation() @@ -198,6 +199,103 @@ def test_predict_3(): assert energies.shape == energies_pred.shape +def test_predict_fromxyz(): + """ + This test checks that the predictions from the "predict" and the "predict_from_xyz" functions are the same. + It also checks that if the model is saved, when the model is reloaded the predictions are still the same. + """ + + xyz = np.array([[[0, 1, 0], [0, 1, 1], [1, 0, 1]], + [[1, 2, 2], [3, 1, 2], [1, 3, 4]], + [[4, 1, 2], [0.5, 5, 6], [-1, 2, 3]]]) + zs = np.array([[1, 2, 3], + [1, 2, 3], + [1, 2, 3]]) + + ene_true = np.array([0.5, 0.9, 1.0]) + + acsf_param = {"nRs2": 5, "nRs3": 5, "nTs": 5, "rcut": 5, "acut": 5, "zeta": 220.127, "eta": 30.8065} + estimator = ARMP(iterations=10, l1_reg=0.0001, l2_reg=0.005, learning_rate=0.0005, representation_name='acsf', + representation_params=acsf_param) + + estimator.set_properties(ene_true) + estimator.generate_representation(xyz, zs) + + idx = list(range(xyz.shape[0])) + + estimator.fit(idx) + + pred1 = estimator.predict(idx) + pred2 = estimator.predict_from_xyz(xyz, zs) + + assert np.all(np.isclose(pred1, pred2, rtol=1.e-6)) + + estimator.save_nn(save_dir="temp") + + new_estimator = ARMP(iterations=10, l1_reg=0.0001, l2_reg=0.005, learning_rate=0.0005, representation_name='acsf', + representation_params=acsf_param) + + new_estimator.load_nn(save_dir="temp") + + new_estimator.set_properties(ene_true) + new_estimator.generate_representation(xyz, zs) + + pred3 = new_estimator.predict(idx) + pred4 = new_estimator.predict_from_xyz(xyz, zs) + + assert np.all(np.isclose(pred3, pred4, rtol=1.e-6)) + assert np.all(np.isclose(pred1, pred3, rtol=1.e-6)) + + shutil.rmtree("temp") + +def test_retraining(): + xyz = np.array([[[0, 1, 0], [0, 1, 1], [1, 0, 1]], + [[1, 2, 2], [3, 1, 2], [1, 3, 4]], + [[4, 1, 2], [0.5, 5, 6], [-1, 2, 3]]]) + zs = np.array([[1, 2, 3], + [1, 2, 3], + [1, 2, 3]]) + + ene_true = np.array([0.5, 0.9, 1.0]) + + acsf_param = {"nRs2": 5, "nRs3": 5, "nTs": 5, "rcut": 5, "acut": 5, "zeta": 220.127, "eta": 30.8065} + estimator = ARMP(iterations=10, l1_reg=0.0001, l2_reg=0.005, learning_rate=0.0005, representation_name='acsf', + representation_params=acsf_param) + + estimator.set_properties(ene_true) + estimator.generate_representation(xyz, zs) + + idx = list(range(xyz.shape[0])) + + estimator.fit(idx) + estimator.save_nn(save_dir="temp") + + pred1 = estimator.predict(idx) + + estimator.loaded_model = True + + estimator.fit(idx) + + pred2 = estimator.predict(idx) + + new_estimator = ARMP(iterations=10, l1_reg=0.0001, l2_reg=0.005, learning_rate=0.0005, representation_name='acsf', + representation_params=acsf_param) + new_estimator.set_properties(ene_true) + new_estimator.generate_representation(xyz, zs) + + new_estimator.load_nn("temp") + + pred3 = new_estimator.predict(idx) + + new_estimator.fit(idx) + + pred4 = new_estimator.predict(idx) + + assert np.all(np.isclose(pred1, pred3, rtol=1.e-6)) + assert np.all(np.isclose(pred2, pred4, rtol=1.e-6)) + + shutil.rmtree("temp") + if __name__ == "__main__": test_set_representation() test_set_properties() @@ -207,3 +305,5 @@ def test_predict_3(): test_fit_3() test_score_3() test_predict_3() + test_predict_fromxyz() + test_retraining() \ No newline at end of file diff --git a/test/test_fchl_electric_field.py b/test/test_fchl_electric_field.py index 1d896aa4e..504fa0f97 100644 --- a/test/test_fchl_electric_field.py +++ b/test/test_fchl_electric_field.py @@ -235,14 +235,15 @@ def test_multiple_operators(): def test_generate_representation(): - + test_dir = os.path.dirname(os.path.realpath(__file__)) + coords = np.array([[1.464, 0.707, 1.056], [0.878, 1.218, 0.498], [2.319, 1.126, 0.952]]) nuclear_charges = np.array([8, 1, 1], dtype=np.int32) - rep_ref = np.loadtxt("data/fchl_ef_rep.txt").reshape((3, 6, 3)) + rep_ref = np.loadtxt(test_dir + "/data/fchl_ef_rep.txt").reshape((3, 6, 3)) # Test with fictitious charges from a numpy array fic_charges1 = np.array([-0.41046649, 0.20523324, 0.20523324]) diff --git a/test/test_fchl_force.py b/test/test_fchl_force.py index 0fb74b1b2..5782f4f99 100644 --- a/test/test_fchl_force.py +++ b/test/test_fchl_force.py @@ -2,6 +2,7 @@ import ast import time +import os import scipy import scipy.stats @@ -31,9 +32,11 @@ from qml.fchl import get_atomic_local_gradient_5point_kernels from qml.fchl import get_atomic_local_kernels +test_dir = os.path.dirname(os.path.realpath(__file__)) + FORCE_KEY = "forces" ENERGY_KEY = "om2_energy" -CSV_FILE = "data/amons_small.csv" +CSV_FILE = test_dir + "/data/amons_small.csv" SIGMAS = [0.64] TRAINING = 13 diff --git a/test/test_mrmp.py b/test/test_mrmp.py index 1f98d7602..2a54c3aec 100644 --- a/test/test_mrmp.py +++ b/test/test_mrmp.py @@ -31,25 +31,30 @@ import glob import os import shutil +try: + import tensorflow as tf +except ImportError: + print("Tensorflow not found but is needed for mrmp class.") + raise SystemExit def test_set_representation(): """ This function tests the method MRMP._set_representation. """ try: - MRMP(representation='unsorted_coulomb_matrix', representation_params={'slatm_sigma1': 0.05}) + MRMP(representation_name='unsorted_coulomb_matrix', representation_params={'slatm_sigma1': 0.05}) raise Exception except InputError: pass try: - MRMP(representation='coulomb_matrix') + MRMP(representation_name='coulomb_matrix') raise Exception except InputError: pass try: - MRMP(representation='slatm', representation_params={'slatm_alchemy': 0.05}) + MRMP(representation_name='slatm', representation_params={'slatm_alchemy': 0.05}) raise Exception except InputError: pass @@ -57,7 +62,7 @@ def test_set_representation(): parameters ={'slatm_sigma1': 0.07, 'slatm_sigma2': 0.04, 'slatm_dgrid1': 0.02, 'slatm_dgrid2': 0.06, 'slatm_rcut': 5.0, 'slatm_rpower': 7, 'slatm_alchemy': True} - estimator = MRMP(representation='slatm', representation_params=parameters) + estimator = MRMP(representation_name='slatm', representation_params=parameters) assert estimator.representation_name == 'slatm' assert estimator.slatm_parameters == parameters @@ -71,7 +76,7 @@ def test_set_properties(): energies = np.loadtxt(test_dir + '/CN_isobutane/prop_kjmol_training.txt', usecols=[1]) - estimator = MRMP(representation='unsorted_coulomb_matrix') + estimator = MRMP(representation_name='unsorted_coulomb_matrix') assert estimator.properties == None @@ -123,7 +128,7 @@ def test_fit_1(): available_representations = ['sorted_coulomb_matrix', 'unsorted_coulomb_matrix', 'bag_of_bonds', 'slatm'] for rep in available_representations: - estimator = MRMP(representation=rep) + estimator = MRMP(representation_name=rep) estimator.generate_compounds(filenames[:100]) estimator.set_properties(energies[:100]) estimator.generate_representation() @@ -214,13 +219,16 @@ def test_load_external(): This function tests if a model that has been trained on a different computer can be loaded and used on a different computer. """ + tf.reset_default_graph() + + test_dir = os.path.dirname(os.path.realpath(__file__)) x = np.linspace(-10.0, 10.0, 2000) y = x ** 2 x = np.reshape(x, (x.shape[0], 1)) estimator = MRMP() - estimator.load_nn("saved_model") + estimator.load_nn(test_dir + "/saved_model") score_after_loading = estimator.score(x, y) score_on_other_machine = -24.101043 diff --git a/test/test_neural_network.py b/test/test_neural_network.py index 21ec5318b..798b076ac 100644 --- a/test/test_neural_network.py +++ b/test/test_neural_network.py @@ -107,7 +107,6 @@ def catch(s): # This should not raise an exception C(batch_size = 2) - C(batch_size = 2.0) C(batch_size = "auto") # This should be caught @@ -116,6 +115,7 @@ def catch(s): catch("x") catch(4.2) catch(None) + catch(2.0) def learning_rate(C): # Exceptions that are supposed to be caught @@ -146,13 +146,13 @@ def catch(s): # This should not raise an exception C(iterations = 1) - C(iterations = 1.0) # This should be caught catch(-2) catch("x") catch(4.2) catch(None) + catch(1.0) def tf_dtype(C): # Exceptions that are supposed to be caught @@ -193,7 +193,6 @@ def catch(s): # This should not raise an exception C(hl1 = 1) - C(hl1 = 1.0) # This should be caught catch(0) @@ -201,6 +200,7 @@ def catch(s): catch(4.2) catch(None) catch(-1) + catch(1.0) def hl2(C): # Exceptions that are supposed to be caught @@ -213,7 +213,6 @@ def catch(s): # This should not raise an exception C(hl2 = 1) - C(hl2 = 1.0) C(hl2 = 0) # This should be caught @@ -221,6 +220,7 @@ def catch(s): catch(4.2) catch(None) catch(-1) + catch(1.0) def hl3(C): # Exceptions that are supposed to be caught @@ -233,7 +233,6 @@ def catch(s): # This should not raise an exception C(hl2 = 2, hl3 = 1) - C(hl2 = 2, hl3 = 1.0) C(hl2 = 2, hl3 = 0) # This should be caught @@ -241,6 +240,7 @@ def catch(s): catch(4.2) catch(None) catch(-1) + catch(1.0) def representation(C): # Exceptions that are supposed to be caught diff --git a/test/test_symm_funct.py b/test/test_symm_funct.py index 6fa92c7ce..5f13bb201 100644 --- a/test/test_symm_funct.py +++ b/test/test_symm_funct.py @@ -106,9 +106,9 @@ def fort_acsf(mols, path, elements): def tf_acsf(mols, path, elements): radial_cutoff = 5 angular_cutoff = 5 - radial_rs = np.linspace(0, radial_cutoff, 3) - angular_rs = np.linspace(0, angular_cutoff, 3) - theta_s = np.linspace(0, np.pi, 3) + n_radial_rs = 3 + n_angular_rs = 3 + n_theta_s = 3 zeta = 1.0 eta = 1.0 @@ -128,7 +128,7 @@ def tf_acsf(mols, path, elements): zs_tf = tf.placeholder(shape=[n_samples, max_n_atoms], dtype=tf.int32, name="zs") xyz_tf = tf.placeholder(shape=[n_samples, max_n_atoms, 3], dtype=tf.float32, name="xyz") - acsf_tf_t = symm_funct.generate_parkhill_acsf(xyz_tf, zs_tf, elements, element_pairs, radial_cutoff, angular_cutoff, radial_rs, angular_rs, theta_s, zeta, eta) + acsf_tf_t = symm_funct.generate_acsf_tf(xyz_tf, zs_tf, elements, element_pairs, radial_cutoff, angular_cutoff, n_radial_rs, n_angular_rs, n_theta_s, zeta, eta) sess = tf.Session() sess.run(tf.global_variables_initializer())