support archive.org protected urls

This commit is contained in:
Chee Yee
2025-01-11 21:25:55 -08:00
parent 3bcf136d2a
commit b7fe46cb94
6 changed files with 1475 additions and 587 deletions
+80 -16
View File
@@ -15,45 +15,102 @@ using httplib::Client;
using httplib::Headers;
using httplib::Result;
// Ordering functor for associative containers keyed by std::string:
// two keys are ordered by strcasecmp() on their C strings, so keys that
// differ only in letter case compare as equivalent.
struct InsensitiveCompare
{
    bool operator()(const std::string &a, const std::string &b) const
    {
        // strcasecmp yields a negative value when `a` sorts before `b`.
        const int order = strcasecmp(a.c_str(), b.c_str());
        return order < 0;
    }
};
// Three-letter English month abbreviations mapped to their month number (1-12).
static std::map<std::string, int> month_map = {
    {"Jan", 1}, {"Feb", 2}, {"Mar", 3}, {"Apr", 4},
    {"May", 5}, {"Jun", 6}, {"Jul", 7}, {"Aug", 8},
    {"Sep", 9}, {"Oct", 10}, {"Nov", 11}, {"Dec", 12}};

// Set-Cookie attribute names (matched case-insensitively) that describe a
// cookie rather than carry a value of their own. Segments of a Set-Cookie
// header whose key is in this set must not be stored as cookies.
// "httponly" and "samesite" are standard attributes as well; without them a
// header such as "sid=abc; Path=/; HttpOnly; SameSite=Lax" would leave bogus
// "HttpOnly" and "SameSite" entries in the cookie jar.
static std::set<std::string, InsensitiveCompare> ignore_cookie_keys = {
    "path", "expires", "max-age", "domain", "secure",
    "httponly", "samesite"};
// Builds an identifier of `len` characters, each drawn with rand() from the
// lowercase hexadecimal digits "0123456789abcdef".
//
// len: number of characters requested; zero or a negative value yields an
//      empty string.
// Returns the generated string.
//
// NOTE(review): rand() is not a cryptographic generator, and no srand() call
// is visible in this block — confirm seeding happens elsewhere if the
// identifiers need to differ between runs.
std::string ArchiveOrgClient::GenerateRandomId(const int len)
{
    static const char hex_digits[] = "0123456789abcdef";
    // sizeof counts the terminating NUL, which must never be picked.
    const size_t digit_count = sizeof(hex_digits) - 1;

    std::string id;
    // A negative length would be converted to a huge size_t by reserve()
    // and throw std::length_error, so bail out early instead.
    if (len <= 0)
    {
        return id;
    }

    id.reserve(static_cast<size_t>(len));
    for (int i = 0; i < len; ++i)
    {
        id += hex_digits[rand() % digit_count];
    }
    return id;
}
// Sets up the HTTP client for `url` and establishes a session.
// With a non-empty `username` the result of Login() is returned; otherwise
// `connected` is set only when Ping() succeeds and 1 is returned either way.
//
// NOTE(review): this listing looks like a rendered diff in which removed and
// added lines are interleaved (the BaseClient::Connect call, `if (ret)` and
// the first `return Login(...)` sit next to what appear to be their
// replacements). Confirm the real statement order against the actual file
// before relying on the comments below.
int ArchiveOrgClient::Connect(const std::string &url, const std::string &username, const std::string &password)
{
int ret = BaseClient::Connect(url, username, password);
if (ret)
this->host_url = url;
// Find the first "/" that follows the "scheme://" prefix.
// NOTE(review): if "://" is missing, find() returns npos and
// `scheme_pos + 3` wraps around to 2 — confirm URLs always carry a scheme.
size_t scheme_pos = url.find("://");
size_t root_pos = url.find("/", scheme_pos + 3);
if (root_pos != std::string::npos)
{
return Login(username, password);
// Everything before that "/" becomes host_url; the rest (including the
// leading "/") becomes base_path.
this->host_url = url.substr(0, root_pos);
this->base_path = url.substr(root_pos);
}
// NOTE(review): raw `new`; whether a previously assigned client is released
// is not visible from this block — verify ownership/cleanup elsewhere.
client = new httplib::Client(this->host_url);
client->set_keep_alive(true);
client->set_follow_location(true);
// Same value for connection and read timeouts (presumably seconds, per
// cpp-httplib — confirm).
client->set_connection_timeout(30);
client->set_read_timeout(30);
// Server certificate verification is switched off for this client.
client->enable_server_certificate_verification(false);
// Start the cookie map with two random 32-character identifiers and a fixed
// "test-cookie" entry.
this->cookies = {
{"donation-identifier", GenerateRandomId(32)},
{"test-cookie", "1"},
{"abtest-identifier", GenerateRandomId(32)}
};
// Credentials supplied: hand over to Login(). Otherwise probe with Ping().
if (username.length() > 0)
return Login(username, password);
else if (Ping())
this->connected = true;
return 1;
}
int ArchiveOrgClient::Login(const std::string &username, const std::string &password)
{
std::string url = std::string("/account/login");
std::string post_data = std::string("username=") + username +
"&password=" + password +
"&remember=true" +
"&referer=https://archive.org/" +
"&login=true" +
"&submit_by_js=true";
Headers headers = {{ "User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:133.0) Gecko/20100101 Firefox/133.0"}};
SetCookies(headers);
if (auto res = client->Post(url, post_data.c_str(), post_data.length(), "application/x-www-form-urlencoded"))
MultipartFormDataItems items = {
{"username", username, "", ""},
{"password", password, "", ""},
{"remember", "true", "", ""},
{"referer", "https://archive.org/", "", ""},
{"login", "true", "", ""},
{"submit_by_js", "true", "", ""}};
if (auto res = client->Post(url, headers, items))
{
if (HTTP_SUCCESS(res->status))
{
if (res->has_header("set-cookie"))
if (res->has_header("Set-Cookie"))
{
int cookies_count = res->get_header_value_count("set-cookie");
for (int i=0; i < cookies_count; i++)
int cookies_count = res->get_header_value_count("Set-Cookie");
for (int i = 0; i < cookies_count; i++)
{
std::string cookie_str = res->get_header_value("set-cookie", i);
std::string cookie_str = res->get_header_value("Set-Cookie", i);
std::vector<std::string> cookies = Util::Split(cookie_str, ";");
for (std::vector<std::string>::iterator it = cookies.begin(); it != cookies.end();)
{
std::vector<std::string> cookie = Util::Split(*it, "=");
this->cookies[Util::Trim(cookie[0], " ")] = Util::Trim(cookie[1], " ");
std::string key = Util::Trim(cookie[0], " ");
if (ignore_cookie_keys.find(key) == ignore_cookie_keys.end())
{
if (cookie.size() > 1)
this->cookies[key] = Util::Trim(cookie[1], " ");
else
this->cookies[key] = "";
}
++it;
}
}
this->connected = true;
return 1;
}
else
@@ -187,6 +244,13 @@ std::vector<DirEntry> ArchiveOrgClient::ListDir(const std::string &path)
// td0 contains the <a> tag
td_element = lxb_dom_collection_element(td_collection, 0);
lxb_dom_node_t *a_node = NextChildElement(td_element);
// there is no a_node in protected links
if (a_node == nullptr)
{
lxb_dom_collection_destroy(td_collection, true);
continue;
}
value = lxb_dom_element_local_name(lxb_dom_interface_element(a_node), &value_len);
tmp_string = std::string((const char *)value, value_len);
if (tmp_string.compare("a") != 0)
@@ -293,4 +357,4 @@ std::vector<DirEntry> ArchiveOrgClient::ListDir(const std::string &path)
finish:
return out;
}
}
+2 -1
View File
@@ -16,6 +16,7 @@ public:
private:
int Login(const std::string &username, const std::string &password);
std::string GenerateRandomId(const int len);
};
#endif
#endif
+10 -3
View File
@@ -27,9 +27,14 @@ int BaseClient::SetCookies(Headers &headers)
std::string cookie;
for (std::map<std::string, std::string>::iterator it = this->cookies.begin(); it != this->cookies.end();)
{
cookie.append(it->first).append("=").append(it->second).append("; ");
cookie.append(it->first).append("=").append(it->second);
if (std::next(it, 1) != this->cookies.end())
{
cookie.append("; ");
}
++it;
}
headers["Cookie"] = cookie;
headers.emplace("Cookie", cookie);
}
return 1;
@@ -112,6 +117,8 @@ int BaseClient::Size(const std::string &path, int64_t *size)
// example: Content-Range: bytes 0-10/4372785
{
Headers headers = {{"Range", "bytes=0-1"}};
SetCookies(headers);
if (auto range_res = client->Get(GetFullPath(path), headers))
{
if (HTTP_SUCCESS(range_res->status))
@@ -131,7 +138,7 @@ int BaseClient::Size(const std::string &path, int64_t *size)
}
else
{
sprintf(this->response, "%d - %s", res->status, detail::status_message(res->status));
sprintf(this->response, "%d - %s", res->status, http_status_message(res->status));
}
}
else
-2
View File
@@ -24,8 +24,6 @@
#define FTP_CLIENT_READ 1
#define FTP_CLIENT_WRITE 2
#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
FtpClient::FtpClient()
{
mp_ftphandle = static_cast<ftphandle *>(calloc(1, sizeof(ftphandle)));
+980 -480
View File
File diff suppressed because it is too large Load Diff
+403 -85
View File
File diff suppressed because it is too large Load Diff