You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
282 lines
8.0 KiB
282 lines
8.0 KiB
#define MICHLIB_NOSOURCE |
|
#include "COPERNICUS.h" |
|
#include "mirrorfuncs.h" |
|
#include <libxml/parser.h> |
|
#include <libxml/tree.h> |
|
|
|
using michlib::GPL; |
|
|
|
const MString COPERNICUSData::caturl = "https://stac.marine.copernicus.eu/metadata/catalog.stac.json"; |
|
|
|
std::pair<Json::Value, MString> COPERNICUSData::GetJSON(const MString& url) |
|
{ |
|
Json::Reader reader; |
|
Json::Value obj; |
|
MString content; |
|
|
|
auto [val, suc] = cache->Get(url); |
|
if(suc) |
|
content = std::move(val); |
|
else |
|
{ |
|
michlib::message(url + " not found in cache, downloading"); |
|
auto [out, res] = GetUrl(chandle, url); |
|
if(res != CURLE_OK) return {obj, MString("Can't download JSON: ") + curlerr}; |
|
cache->Put(url, out, 3600); |
|
content = std::move(out); |
|
} |
|
|
|
reader.parse(content.Buf(), content.Buf() + content.Len(), obj, false); |
|
|
|
return {obj, ""}; |
|
} |
|
|
|
/// Look up the link whose "title" equals `prod` in the "links" array of a
/// STAC document and return its "href", or "" when absent or malformed.
MString COPERNICUSData::ReadURL(const Json::Value& cat, const MString& prod)
{
   const auto& links = cat["links"];
   if(links.type() != Json::arrayValue) return "";

   for(const auto& link: links)
   {
      const auto& title = link["title"];
      const auto& href  = link["href"];
      // Skip entries missing either field or holding non-string values.
      if(title.type() != Json::stringValue || href.type() != Json::stringValue) continue;
      if(MString(title.asString().c_str()) == prod) return MString(href.asString().c_str());
   }
   return "";
}
|
|
|
/// List every object below an S3 prefix via the ListObjectsV2 REST API
/// (`?list-type=2`), following NextContinuationToken pagination.
/// `url` has the form "https://host/bucket/prefix". Returns the files sorted
/// by name together with an error string ("" on success).
std::pair<std::vector<struct FileInfo>, MString> COPERNICUSData::ReadRemoteFileList(const MString& url)
{
   LIBXML_TEST_VERSION

   std::vector<struct FileInfo> out;
   MString bucket, prefix;

   // Split url on prefix and bucket
   {
      // Find the 4th '/' (the one after "https://host/bucket"): everything
      // before it is the bucket URL, everything after it is the key prefix.
      size_t pos = url.Len();
      size_t count = 0;
      for(size_t i = 0; i < url.Len(); i++)
      {
         if(url[i] == '/') count++;
         if(count == 4)
         {
            pos = i;
            break;
         }
      }
      if(pos == url.Len()) return {out, "Can't parse url: " + url};

      // NOTE(review): MString::SubStr appears to use 1-based (start, length)
      // indexing — bucket is the part before the slash at `pos`, prefix the
      // part after it. Confirm against MString's documentation.
      bucket = url.SubStr(1, pos);
      prefix = url.SubStr(pos + 2, url.Len() - pos - 1);
   }

   MString cont;     // continuation token from the previous page, empty on the first request
   bool next = true; // true while more pages remain

   while(next)
   {
      // Build the ListObjectsV2 request for the current page.
      // NOTE(review): the continuation token is appended verbatim; if it can
      // contain URL-reserved characters it should be percent-encoded — confirm.
      MString url = bucket + "?list-type=2&prefix=" + prefix;
      if(cont.Exist()) url += "&continuation-token=" + cont;
      cont = "";

      auto [data, res] = GetUrl(chandle, url);
      if(res != CURLE_OK) return {out, MString("Can't download ") + url + ": " + curlerr};

      // Parse the XML response; `doc` must be freed on every exit path below.
      xmlDocPtr doc = xmlReadMemory(data.Buf(), data.Len(), "data.xml", nullptr, 0);
      if(doc == nullptr) return {out, MString("Can't download ") + url + ": XML parse error"};
      auto cur = xmlDocGetRootElement(doc);
      if(cur == nullptr)
      {
         xmlFreeDoc(doc);
         return {out, MString("Can't download ") + url + ": empty XML"};
      }
      // A valid S3 listing response has <ListBucketResult> as its root element.
      if(xmlStrEqual(cur->name, (const xmlChar*)"ListBucketResult") == 0)
      {
         xmlFreeDoc(doc);
         return {out, MString("Can't download ") + url + ": unknown XML"};
      }

      for(const auto* n = cur->children; n; n = n->next)
      {
         // A <NextContinuationToken> child means the listing is truncated and
         // another page must be fetched.
         if(xmlStrEqual(n->name, (const xmlChar*)"NextContinuationToken") == 1)
         {
            auto* content = xmlNodeGetContent(n);
            cont = (char*)content;
            xmlFree(content);
         }
         // Each <Contents> element describes one object: Key, LastModified, Size.
         if(xmlStrEqual(n->name, (const xmlChar*)"Contents") == 1)
         {
            MString fname;
            MDateTime mtime;
            size_t size = 0;
            for(const auto* c = n->children; c; c = c->next)
            {
               if(xmlStrEqual(c->name, (const xmlChar*)"Key") == 1)
               {
                  auto* content = xmlNodeGetContent(c);
                  fname = (char*)content;
                  xmlFree(content);
               }
               if(xmlStrEqual(c->name, (const xmlChar*)"LastModified") == 1)
               {
                  auto* content = xmlNodeGetContent(c);
                  mtime.FromString((char*)content);
                  xmlFree(content);
               }
               if(xmlStrEqual(c->name, (const xmlChar*)"Size") == 1)
               {
                  auto* content = xmlNodeGetContent(c);
                  size = MString((char*)content).ToInteger<size_t>();
                  xmlFree(content);
               }
            }
            // Store the full download URL plus the name relative to the
            // prefix (the "prefix/" part of the key is stripped).
            out.emplace_back(bucket + "/" + fname, fname.SubStr(prefix.Len() + 2, fname.Len() - prefix.Len() - 1), mtime, size);
         }
      }
      xmlFreeDoc(doc);
      next = cont.Exist(); // another page iff a continuation token was present
   }

   // Name-sorted output lets Mirror() merge remote and local lists linearly.
   std::sort(out.begin(), out.end(), [](const struct FileInfo& a, const struct FileInfo& b) { return a.name < b.name; });
   return {out, ""};
}
|
|
|
/// Mirror one Copernicus Marine product (or a single dataset of it, when
/// args contains "dataset") into the local directory configured as
/// COPERNICUS.MirrorTo: download new files, remove obsolete ones and refresh
/// files whose size or modification time differ. Returns "" on success or an
/// error message.
MString COPERNICUSData::Mirror(const CLArgs& args)
{
   GPL.UsePrefix("COPERNICUS");

   // Local directory
   MString mirrorroot = GPL.ParameterSValue("MirrorTo", "");
   if(!mirrorroot.Exist()) return "Local mirror directory not specified";

   // Cache
   cache.reset(CreateCache(GPL.ParameterSValue("Cache", "")));
   if(!cache)
   {
      // Fall back to a no-op cache so later code can use `cache` unconditionally.
      michlib::errmessage("Can't init cache");
      cache.reset(new FakeCache);
   }

   // Let libcurl write readable error descriptions into the curlerr buffer.
   curl_easy_setopt(chandle, CURLOPT_ERRORBUFFER, curlerr);

   if(!args.contains("product")) return "Copernicus product not specified";
   MString prod = args.at("product");
   Json::Value product;
   MString produrl;

   // Get catalog
   {
      auto [cat, err] = GetJSON(caturl);
      if(err.Exist()) return "Can't download catalog: " + err;
      // Sanity check: the STAC root document must carry the expected title.
      if(cat["title"].type() != Json::stringValue || cat["title"].asString() != "Copernicus Marine Data Store") return "Can't parse catalog";
      catalog = std::move(cat);
   }

   // Get product
   {
      auto url = ReadURL(catalog, prod);
      if(!url.Exist()) return "Url for product " + prod + " not found in catalog";
      // Catalog links are relative to the catalog's own location.
      produrl = DirName(caturl) + "/" + url;
      auto [pr, err] = GetJSON(produrl);
      if(err.Exist()) return "Can't download product information from " + produrl + ": " + err;
      product = std::move(pr);
   }

   // Datasets to mirror: the one named on the command line, or every link of
   // the product whose "rel" is "item".
   std::vector<MString> dsets;
   if(args.contains("dataset"))
      dsets.push_back(args.at("dataset"));
   else
   {
      const auto& links = product["links"];
      if(links.type() != Json::arrayValue) return "Can't find information about datasets";
      for(Json::ArrayIndex i = 0; i < links.size(); i++)
      {
         const auto& rel = links[i]["rel"];
         const auto& titl = links[i]["title"];
         if(rel.type() == Json::stringValue && titl.type() == Json::stringValue && rel.asString() == "item") dsets.push_back(titl.asString().c_str());
      }
   }

   for(const auto& dset: dsets)
   {
      michlib::message("Mirroring " + dset);
      auto url = ReadURL(product, dset);
      if(!url.Exist()) return "Url for dataset " + dset + " not found in product description";
      MString dseturl = DirName(produrl) + "/" + url;
      auto [ds, err] = GetJSON(dseturl);
      if(err.Exist()) return "Can't download dataset information from " + dseturl + ": " + err;

      // The "native" asset holds the S3 bucket/prefix URL of the data files.
      const auto& href = ds["assets"]["native"]["href"];
      if(href.type() != Json::stringValue) return "Can't find data for dataset " + dset + " from product " + prod;

      url = href.asString().c_str();

      // Local layout: <mirrorroot>/<product>/<dataset>/...
      MString locroot = mirrorroot + "/" + prod + "/" + dset;

      auto [lfiles, lerr] = ReadLocalFileList(locroot);
      if(lerr.Exist()) return lerr;

      auto [rfiles, rerr] = ReadRemoteFileList(url);
      if(rerr.Exist()) return rerr;

      // Work plan: indexes into rfiles to download, indexes into lfiles to
      // remove, and (remote, local) index pairs to update in place.
      std::vector<size_t> down, rem;
      std::vector<std::pair<size_t, size_t>> upd;

      {
         // Linear merge of the two name-sorted lists (both producers sort by name).
         size_t rpos = 0, lpos = 0;
         while(rpos != rfiles.size() || lpos != lfiles.size())
         {
            // One side exhausted: the remainder of the other side is
            // all-remove (local leftovers) or all-download (remote leftovers).
            if(rpos == rfiles.size())
               while(lpos != lfiles.size()) rem.push_back(lpos++);
            if(lpos == lfiles.size())
               while(rpos != rfiles.size()) down.push_back(rpos++);
            if(rpos == rfiles.size() || lpos == lfiles.size()) continue;

            if(rfiles[rpos].name < lfiles[lpos].name)
               down.push_back(rpos++);
            else if(lfiles[lpos].name < rfiles[rpos].name)
               rem.push_back(lpos++);
            else
            {
               // Same name on both sides: schedule an update when mtime or
               // size differ.
               // NOTE(review): if MDateTime::Epoch() returns an unsigned type,
               // the subtraction wraps and `delta < 0` can never fire —
               // confirm that Epoch() returns a signed value.
               auto delta = rfiles[rpos].mtime.Epoch() - lfiles[lpos].mtime.Epoch();
               if(delta < 0) delta = -delta;
               if(delta > 0 || rfiles[rpos].size != lfiles[lpos].size) upd.emplace_back(rpos, lpos);
               lpos++;
               rpos++;
            }
         }
      }

      michlib::message(MString("New files: ") + down.size());
      michlib::message(MString("Obsolete files: ") + rem.size());
      michlib::message(MString("Modified files: ") + upd.size());

      // Execute the plan: downloads first, then removals, then updates.
      // Any failing step aborts the whole mirror run.
      for(size_t i = 0; i < down.size(); i++)
      {
         size_t ri = down[i];
         auto err = DownloadFile(chandle, rfiles[ri], locroot);
         if(err.Exist()) return err;
      }

      for(size_t i = 0; i < rem.size(); i++)
      {
         size_t li = rem[i];
         auto err = RemoveFile(lfiles[li]);
         if(err.Exist()) return err;
      }

      for(size_t i = 0; i < upd.size(); i++)
      {
         size_t ri = upd[i].first;
         size_t li = upd[i].second;
         auto err = UpdateFile(chandle, rfiles[ri], lfiles[li], locroot);
         if(err.Exist()) return err;
      }
   }

   return "";
}
|
|
|