#define MICHLIB_NOSOURCE #include "COPERNICUS.h" #include "mirrorfuncs.h" #include #include using michlib::GPL; const MString COPERNICUSData::caturl = "https://stac.marine.copernicus.eu/metadata/catalog.stac.json"; std::pair COPERNICUSData::GetJSON(const MString& url) { Json::Reader reader; Json::Value obj; MString content; auto [val, suc] = cache->Get(url); if(suc) content = std::move(val); else { michlib::message(url + " not found in cache, downloading"); auto [out, res] = GetUrl(chandle, url); if(res != CURLE_OK) return {obj, MString("Can't download JSON: ") + curlerr}; cache->Put(url, out, 3600); content = std::move(out); } reader.parse(content.Buf(), content.Buf() + content.Len(), obj, false); return {obj, ""}; } MString COPERNICUSData::ReadURL(const Json::Value& cat, const MString& prod) { const auto& links = cat["links"]; if(links.type() != Json::arrayValue) return ""; for(Json::ArrayIndex i = 0; i < links.size(); i++) { const auto& titl = links[i]["title"]; const auto& href = links[i]["href"]; if(titl.type() == Json::stringValue && href.type() == Json::stringValue) { MString str(titl.asString().c_str()); if(str == prod) return MString(href.asString().c_str()); } } return ""; } std::pair, MString> COPERNICUSData::ReadRemoteFileList(const MString& url) { LIBXML_TEST_VERSION std::vector out; MString bucket, prefix; // Split url on prefix and bucket { size_t pos = url.Len(); size_t count = 0; for(size_t i = 0; i < url.Len(); i++) { if(url[i] == '/') count++; if(count == 4) { pos = i; break; } } if(pos == url.Len()) return {out, "Can't parse url: " + url}; bucket = url.SubStr(1, pos); prefix = url.SubStr(pos + 2, url.Len() - pos - 1); } MString cont; bool next = true; while(next) { MString url = bucket + "?list-type=2&prefix=" + prefix; if(cont.Exist()) url += "&continuation-token=" + cont; cont = ""; auto [data, res] = GetUrl(chandle, url); if(res != CURLE_OK) return {out, MString("Can't download ") + url + ": " + curlerr}; xmlDocPtr doc = xmlReadMemory(data.Buf(), data.Len(), "data.xml", nullptr, 0); if(doc == nullptr) return {out, MString("Can't download ") + url + ": XML parse error"}; auto cur = xmlDocGetRootElement(doc); if(cur == nullptr) { xmlFreeDoc(doc); return {out, MString("Can't download ") + url + ": empty XML"}; } if(xmlStrEqual(cur->name, (const xmlChar*)"ListBucketResult") == 0) { xmlFreeDoc(doc); return {out, MString("Can't download ") + url + ": unknown XML"}; } for(const auto* n = cur->children; n; n = n->next) { if(xmlStrEqual(n->name, (const xmlChar*)"NextContinuationToken") == 1) { auto* content = xmlNodeGetContent(n); cont = (char*)content; xmlFree(content); } if(xmlStrEqual(n->name, (const xmlChar*)"Contents") == 1) { MString fname; MDateTime mtime; size_t size = 0; for(const auto* c = n->children; c; c = c->next) { if(xmlStrEqual(c->name, (const xmlChar*)"Key") == 1) { auto* content = xmlNodeGetContent(c); fname = (char*)content; xmlFree(content); } if(xmlStrEqual(c->name, (const xmlChar*)"LastModified") == 1) { auto* content = xmlNodeGetContent(c); mtime.FromString((char*)content); xmlFree(content); } if(xmlStrEqual(c->name, (const xmlChar*)"Size") == 1) { auto* content = xmlNodeGetContent(c); size = MString((char*)content).ToInteger(); xmlFree(content); } } out.emplace_back(bucket + "/" + fname, fname.SubStr(prefix.Len() + 2, fname.Len() - prefix.Len() - 1), mtime, size); } } xmlFreeDoc(doc); next = cont.Exist(); } std::sort(out.begin(), out.end(), [](const struct FileInfo& a, const struct FileInfo& b) { return a.name < b.name; }); return {out, ""}; } MString COPERNICUSData::Mirror(const CLArgs& args) { GPL.UsePrefix("COPERNICUS"); // Local directory MString mirrorroot = GPL.ParameterSValue("MirrorTo", ""); if(!mirrorroot.Exist()) return "Local mirror directory not specified"; // Cache cache.reset(CreateCache(GPL.ParameterSValue("Cache", ""))); if(!cache) { michlib::errmessage("Can't init cache"); cache.reset(new FakeCache); } curl_easy_setopt(chandle, CURLOPT_ERRORBUFFER, curlerr); if(!args.contains("product")) return "Copernicus product not specified"; MString prod = args.at("product"); Json::Value product; MString produrl; // Get catalog { auto [cat, err] = GetJSON(caturl); if(err.Exist()) return "Can't download catalog: " + err; if(cat["title"].type() != Json::stringValue || cat["title"].asString() != "Copernicus Marine Data Store") return "Can't parse catalog"; catalog = std::move(cat); } // Get product { auto url = ReadURL(catalog, prod); if(!url.Exist()) return "Url for product " + prod + " not found in catalog"; produrl = DirName(caturl) + "/" + url; auto [pr, err] = GetJSON(produrl); if(err.Exist()) return "Can't download product information from " + produrl + ": " + err; product = std::move(pr); } std::vector dsets; if(args.contains("dataset")) dsets.push_back(args.at("dataset")); else { const auto& links = product["links"]; if(links.type() != Json::arrayValue) return "Can't find information about datasets"; for(Json::ArrayIndex i = 0; i < links.size(); i++) { const auto& rel = links[i]["rel"]; const auto& titl = links[i]["title"]; if(rel.type() == Json::stringValue && titl.type() == Json::stringValue && rel.asString() == "item") dsets.push_back(titl.asString().c_str()); } } for(const auto& dset: dsets) { michlib::message("Mirroring " + dset); auto url = ReadURL(product, dset); if(!url.Exist()) return "Url for dataset " + dset + " not found in product description"; MString dseturl = DirName(produrl) + "/" + url; auto [ds, err] = GetJSON(dseturl); if(err.Exist()) return "Can't download dataset information from " + dseturl + ": " + err; const auto& href = ds["assets"]["native"]["href"]; if(href.type() != Json::stringValue) return "Can't find data for dataset " + dset + " from product " + prod; url = href.asString().c_str(); MString locroot = mirrorroot + "/" + prod + "/" + dset; auto [lfiles, lerr] = ReadLocalFileList(locroot); if(lerr.Exist()) return lerr; auto [rfiles, rerr] = ReadRemoteFileList(url); if(rerr.Exist()) return rerr; std::vector down, rem; std::vector> upd; { size_t rpos = 0, lpos = 0; while(rpos != rfiles.size() || lpos != lfiles.size()) { if(rpos == rfiles.size()) while(lpos != lfiles.size()) rem.push_back(lpos++); if(lpos == lfiles.size()) while(rpos != rfiles.size()) down.push_back(rpos++); if(rpos == rfiles.size() || lpos == lfiles.size()) continue; if(rfiles[rpos].name < lfiles[lpos].name) down.push_back(rpos++); else if(lfiles[lpos].name < rfiles[rpos].name) rem.push_back(lpos++); else { auto delta = rfiles[rpos].mtime.Epoch() - lfiles[lpos].mtime.Epoch(); if(delta < 0) delta = -delta; if(delta > 0 || rfiles[rpos].size != lfiles[lpos].size) upd.emplace_back(rpos, lpos); lpos++; rpos++; } } } michlib::message(MString("New files: ") + down.size()); michlib::message(MString("Obsolete files: ") + rem.size()); michlib::message(MString("Modified files: ") + upd.size()); for(size_t i = 0; i < down.size(); i++) { size_t ri = down[i]; auto err = DownloadFile(chandle, rfiles[ri], locroot); if(err.Exist()) return err; } for(size_t i = 0; i < rem.size(); i++) { size_t li = rem[i]; auto err = RemoveFile(lfiles[li]); if(err.Exist()) return err; } for(size_t i = 0; i < upd.size(); i++) { size_t ri = upd[i].first; size_t li = upd[i].second; auto err = UpdateFile(chandle, rfiles[ri], lfiles[li], locroot); if(err.Exist()) return err; } } return ""; }