Add script to download the exoplanets dataset for the dataprep task

* Add a Python script to download all the columns needed
* Update the task file. Use strings instead of std::filesystem::path, so that relative paths and tokens can be used
2026-01-06 11:39:49 -06:00 · 2021-11-15 13:44:57 +01:00
parent 109be85d48
commit d7565f65ee
3 changed files with 49 additions and 10 deletions
--- a/data/tasks/exoplanets/datapreparation.task
+++ b/data/tasks/exoplanets/datapreparation.task
@@ -1,11 +1,11 @@
-local dataFolder = "D:/dev/exoplanets data config"
+local dataFolder = "D:/data/prepared_exoplanets_data"
 return {
  {
    Type = "ExoplanetsDataPreparationTask",

-    InputDataFile = dataFolder .. "/exoplanets_data_composite.csv",
-    InputSPECK = "${SYNC}/http/digitaluniverse_exoplanets_speck/1/expl.speck",
-    TeffToBvFile = "${SYNC}/http/exoplanets_data/1/teff_bv.txt",
+    InputDataFile = "${DATA}/tasks/exoplanets/downloaded_exo_data.csv",
+    InputSPECK = "${SYNC}/http/digitaluniverse_exoplanets_speck/2/expl.speck",
+    TeffToBvFile = "${SYNC}/http/exoplanets_data/2/teff_bv.txt",
    OutputBIN = dataFolder .. "/exoplanets_data.bin",
    OutputLUT = dataFolder .. "/lookup.txt"
  }
--- a/data/tasks/exoplanets/downloadexodata.py
+++ b/data/tasks/exoplanets/downloadexodata.py
@@ -0,0 +1,39 @@
+##
+# Download most recent exoplanet data from NASA Exoplanet Archive using the TAP service
+# More info at: https://exoplanetarchive.ipac.caltech.edu/docs/TAP/usingTAP.html 
+#
+# The data table is the Planetary Systems Composite dataset, where multiple sources have 
+# been combined into one row per planet. 
+# https://exoplanetarchive.ipac.caltech.edu/cgi-bin/TblView/nph-tblView?app=ExoTbls&config=PSCompPars
+#
+# The script downloads the columns needed for the visualization in OpenSpace and for the 
+# exoplanets datapreparation task, but more columns can be added if needed. 
+##
+
+import pandas as pd
+
+dataFileName = 'downloaded_exo_data.csv'
+
+# The columns we need for the visualization in OpenSpace
+columns = 'pl_name,hostname,pl_letter,sy_snum,sy_pnum,pl_orbsmax,pl_orbsmaxerr1,pl_orbsmaxerr2,' \
+          'pl_orbeccen,pl_orbeccenerr1,pl_orbeccenerr2,pl_orbincl,pl_orbinclerr1,pl_orbinclerr2,' \
+          'pl_orblper,pl_orblpererr1,pl_orblpererr2,pl_orbper,pl_orbpererr1,pl_orbpererr2,' \
+          'pl_radj,pl_radjerr1,pl_radjerr2,pl_tranmid,pl_tranmiderr1,pl_tranmiderr2,ra,dec,' \
+          'sy_dist,st_rad,st_raderr1,st_raderr2,st_teff,st_tefferr1,st_tefferr2,' \
+          'st_lum,st_lumerr1,st_lumerr2,cb_flag,disc_year'
+
+# Optional extra conditions, appended right after the table name. Start with a '+' sign, e.g. '+where+disc_year=2020'
+where = ''
+
+###
+## Download and save csv file
+print("Downloading all confirmed planets from NExSci's Exoplanets Archive... (Planetary Systems Composite Data table)")
+
+NEW_API = 'https://exoplanetarchive.ipac.caltech.edu/TAP/sync?query='
+url = NEW_API + 'select+' + columns + '+from+pscomppars' + where + '&format=csv'
+print(url)
+df = pd.read_csv(url)
+
+print("Writing data to file...")
+df.to_csv(dataFileName)
+print("Done!")