/*
 *      Copyright (C) 2005-2013 Team XBMC
 *      http://xbmc.org
 *
 *  This Program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2, or (at your option)
 *  any later version.
 *
 *  This Program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with XBMC; see the file COPYING.  If not, see
 *  <http://www.gnu.org/licenses/>.
 *
 */

#include "XMLUtils.h"
#include "ScraperUrl.h"
#include "settings/AdvancedSettings.h"
#include "HTMLUtil.h"
#include "CharsetConverter.h"
#include "utils/CharsetDetection.h"
#include "utils/StringUtils.h"
#include "URL.h"
#include "filesystem/CurlFile.h"
#include "filesystem/ZipFile.h"
#include "URIUtils.h"
#include "utils/XBMCTinyXML.h"
#include "utils/XMLUtils.h"
#include "utils/Mime.h"

#include <cstring>
#include <sstream>

using namespace std;

/*!
 \brief Construct from a string and immediately parse it with ParseString().
 \param strUrl XML fragment or plain URL
 */
CScraperUrl::CScraperUrl(const std::string& strUrl)
{
  relevance = 0;
  ParseString(strUrl);
}

/*!
 \brief Construct from a single XML element and immediately parse it with ParseElement().
 \param element element to parse; may be null, in which case nothing is added
 */
CScraperUrl::CScraperUrl(const TiXmlElement* element)
{
  relevance = 0;
  ParseElement(element);
}

/*! \brief Construct an empty object with zero relevance. */
CScraperUrl::CScraperUrl()
{
  relevance = 0;
}

CScraperUrl::~CScraperUrl()
{
}

/*! \brief Drop all parsed entries, the spoof string and the stored XML, and reset relevance to 0. */
void CScraperUrl::Clear()
{
  m_url.clear();
  m_spoof.clear();
  m_xml.clear();
  relevance = 0;
}

/*!
 \brief Re-parse the XML currently stored in m_xml.

 m_xml is cleared first because the parse functions append to / overwrite it.
 \return result of ParseString() on the previous m_xml content
 */
bool CScraperUrl::Parse()
{
  std::string strToParse = m_xml;
  m_xml.clear();
  return ParseString(strToParse);
}

/*!
 \brief Parse one XML element into an SUrlEntry and append it to m_url.

 The serialized element is appended to m_xml. The URL is taken from the value
 of the element's first child; the attributes "spoof", "post", "gzip", "cache",
 "type", "season" and "aspect" fill the remaining entry fields.
 \param element element to parse
 \return false if element is null or has no first child with a value, true otherwise
 */
bool CScraperUrl::ParseElement(const TiXmlElement* element)
{
  if (!element || !element->FirstChild() || !element->FirstChild()->Value())
    return false;

  stringstream stream;
  stream << *element;
  m_xml += stream.str();

  SUrlEntry url;
  url.m_url = element->FirstChild()->Value();
  url.m_spoof = XMLUtils::GetAttribute(element, "spoof");

  // "post" and "gzip" are flags that are enabled only by the value "yes" (case-insensitive)
  const char* szPost = element->Attribute("post");
  url.m_post = (szPost && stricmp(szPost, "yes") == 0);

  const char* szIsGz = element->Attribute("gzip");
  url.m_isgz = (szIsGz && stricmp(szIsGz, "yes") == 0);

  url.m_cache = XMLUtils::GetAttribute(element, "cache");

  // entries are general unless explicitly typed as "season"
  url.m_type = URL_TYPE_GENERAL;
  url.m_season = -1;
  const char* szType = element->Attribute("type");
  if (szType && stricmp(szType, "season") == 0)
  {
    url.m_type = URL_TYPE_SEASON;
    const char* szSeason = element->Attribute("season");
    if (szSeason)
      url.m_season = atoi(szSeason);
  }

  url.m_aspect = XMLUtils::GetAttribute(element, "aspect");

  m_url.push_back(url);
  return true;
}

/*!
 \brief Parse a string that is either an XML fragment or a plain URL.

 If the string parses to an XML document with a root element, the root and
 every following sibling element with the same tag name are passed to
 ParseElement(). Otherwise the whole string is stored as a single general
 URL entry and becomes the new m_xml.
 \param strUrl string to parse
 \return false if strUrl is empty, true otherwise
 */
bool CScraperUrl::ParseString(std::string strUrl)
{
  if (strUrl.empty())
    return false;

  CXBMCTinyXML doc;
  /* strUrl is coming from internal sources (usually generated by scraper or from database)
   * so strUrl is always in UTF-8 */
  doc.Parse(strUrl, TIXML_ENCODING_UTF8);

  TiXmlElement* pElement = doc.RootElement();
  if (!pElement)
  {
    // not XML: treat the input as a bare URL
    SUrlEntry url;
    url.m_url = strUrl;
    url.m_type = URL_TYPE_GENERAL;
    url.m_season = -1;
    url.m_post = false;
    url.m_isgz = false;
    m_url.push_back(url);
    m_xml = strUrl;
  }
  else
  {
    while (pElement)
    {
      ParseElement(pElement);
      pElement = pElement->NextSiblingElement(pElement->Value());
    }
  }

  return true;
}

/*!
 \brief Return the first general (non-season) entry matching the given aspect.
 \param type aspect to match; an empty string or "thumb" matches any aspect
 \return copy of the first matching entry, or an entry with an empty URL,
         type URL_TYPE_GENERAL and season -1 if nothing matches
 */
const CScraperUrl::SUrlEntry CScraperUrl::GetFirstThumb(const std::string &type) const
{
  for (vector<SUrlEntry>::const_iterator iter = m_url.begin(); iter != m_url.end(); ++iter)
  {
    if (iter->m_type == URL_TYPE_GENERAL && (type.empty() || type == "thumb" || iter->m_aspect == type))
      return *iter;
  }

  SUrlEntry result;
  result.m_type = URL_TYPE_GENERAL;
  result.m_post = false;
  result.m_isgz = false;
  result.m_season = -1;
  return result;
}

/*!
 \brief Return the first season entry for the given season matching the given aspect.
 \param season season number to look for
 \param type aspect to match; an empty string or "thumb" matches any aspect
 \return copy of the first matching entry, or an entry with an empty URL,
         type URL_TYPE_GENERAL and season -1 if nothing matches
 */
const CScraperUrl::SUrlEntry CScraperUrl::GetSeasonThumb(int season, const std::string &type) const
{
  for (vector<SUrlEntry>::const_iterator iter = m_url.begin(); iter != m_url.end(); ++iter)
  {
    if (iter->m_type == URL_TYPE_SEASON && iter->m_season == season &&
       (type.empty() || type == "thumb" || iter->m_aspect == type))
      return *iter;
  }

  SUrlEntry result;
  result.m_type = URL_TYPE_GENERAL;
  result.m_post = false;
  result.m_isgz = false;
  result.m_season = -1;
  return result;
}

/*!
 \brief Find the highest positive season number among the season entries.
 \return the largest m_season of all URL_TYPE_SEASON entries, or 0 if there is none above 0
 */
unsigned int CScraperUrl::GetMaxSeasonThumb() const
{
  unsigned int maxSeason = 0;
  for (vector<SUrlEntry>::const_iterator iter = m_url.begin(); iter != m_url.end(); ++iter)
  {
    if (iter->m_type == URL_TYPE_SEASON && iter->m_season > 0 && (unsigned int)iter->m_season > maxSeason)
      maxSeason = iter->m_season;
  }
  return maxSeason;
}

/*!
 \brief Fetch the content behind a URL entry and convert it to UTF-8.

 Steps, in order:
  - if the entry names a cache file and it can be loaded, return its content;
  - otherwise download via GET, or via POST with the URL options as post data;
  - unpack the payload in memory if it is detected as a zip/gzip archive;
  - convert to UTF-8 depending on the detected file type and reported charset;
  - if the entry names a cache file, write the result to it.

 \param scrURL entry to fetch
 \param strHTML [in,out] receives the fetched content; its incoming value is
        used to initialise the download buffer
 \param http curl handle used for the transfer
 \param cacheContext sub-folder name below "scrapers/" in the cache path
 \return true on success; false if the download fails or the cache file
         cannot be written completely
 */
bool CScraperUrl::Get(const SUrlEntry& scrURL, std::string& strHTML, XFILE::CCurlFile& http, const std::string& cacheContext)
{
  CURL url(scrURL.m_url);
  http.SetReferer(scrURL.m_spoof);
  std::string strCachePath;

  if (scrURL.m_isgz)
    http.SetContentEncoding("gzip");

  if (!scrURL.m_cache.empty())
  {
    strCachePath = URIUtils::AddFileToFolder(g_advancedSettings.m_cachePath,
                                             "scrapers/" + cacheContext + "/" + scrURL.m_cache);
    if (XFILE::CFile::Exists(strCachePath))
    {
      XFILE::CFile file;
      XFILE::auto_buffer buffer;
      if (file.LoadFile(strCachePath, buffer) > 0)
      {
        strHTML.assign(buffer.get(), buffer.length());
        return true;
      }
    }
  }

  std::string strHTML1(strHTML);

  if (scrURL.m_post)
  {
    std::string strOptions = url.GetOptions();
    // drop the leading separator character; guard against a URL without
    // options, where substr(1) on an empty string would throw std::out_of_range
    if (!strOptions.empty())
      strOptions.erase(0, 1);
    url.SetOptions("");

    if (!http.Post(url.Get(), strOptions, strHTML1))
      return false;
  }
  else if (!http.Get(url.Get(), strHTML1))
    return false;

  strHTML = strHTML1;

  // prefer the server reported mime type, fall back to content sniffing
  std::string mimeType(http.GetMimeType());
  CMime::EFileType ftype = CMime::GetFileTypeFromMime(mimeType);
  if (ftype == CMime::FileTypeUnknown)
    ftype = CMime::GetFileTypeFromContent(strHTML);

  if (ftype == CMime::FileTypeZip || ftype == CMime::FileTypeGZip)
  {
    XFILE::CZipFile file;
    std::string strBuffer;
    int iSize = file.UnpackFromMemory(strBuffer, strHTML, scrURL.m_isgz); // FIXME: use FileTypeGZip instead of scrURL.m_isgz?
    if (iSize > 0)
    {
      strHTML = strBuffer;
      CLog::Log(LOGDEBUG, "%s: Archive \"%s\" was unpacked in memory", __FUNCTION__, scrURL.m_url.c_str());
    }
    else
      CLog::Log(LOGWARNING, "%s: \"%s\" looks like archive, but cannot be unpacked", __FUNCTION__, scrURL.m_url.c_str());
  }

  std::string reportedCharset(http.GetServerReportedCharset());
  if (ftype == CMime::FileTypeHtml)
  {
    std::string realHtmlCharset, converted;
    if (!CCharsetDetection::ConvertHtmlToUtf8(strHTML, converted, reportedCharset, realHtmlCharset))
      CLog::Log(LOGWARNING, "%s: Can't find precise charset for HTML \"%s\", using \"%s\" as fallback", __FUNCTION__, scrURL.m_url.c_str(), realHtmlCharset.c_str());
    else
      CLog::Log(LOGDEBUG, "%s: Using \"%s\" charset for HTML \"%s\"", __FUNCTION__, realHtmlCharset.c_str(), scrURL.m_url.c_str());
    strHTML = converted;
  }
  else if (ftype == CMime::FileTypeXml)
  {
    CXBMCTinyXML xmlDoc;
    xmlDoc.Parse(strHTML, reportedCharset);

    std::string realXmlCharset(xmlDoc.GetUsedCharset());
    if (!realXmlCharset.empty())
    {
      CLog::Log(LOGDEBUG, "%s: Using \"%s\" charset for XML \"%s\"", __FUNCTION__, realXmlCharset.c_str(), scrURL.m_url.c_str());
      std::string converted;
      g_charsetConverter.ToUtf8(realXmlCharset, strHTML, converted);
      strHTML = converted;
    }
  }
  else if (ftype == CMime::FileTypePlainText || StringUtils::CompareNoCase(mimeType.substr(0, 5), "text/") == 0)
  {
    std::string realTextCharset, converted;
    CCharsetDetection::ConvertPlainTextToUtf8(strHTML, converted, reportedCharset, realTextCharset);
    strHTML = converted;
    if (reportedCharset != realTextCharset)
      CLog::Log(LOGWARNING, "%s: Using \"%s\" charset for plain text \"%s\" instead of server reported \"%s\" charset", __FUNCTION__, realTextCharset.c_str(), scrURL.m_url.c_str(), reportedCharset.c_str());
    else
      CLog::Log(LOGDEBUG, "%s: Using \"%s\" charset for plain text \"%s\"", __FUNCTION__, realTextCharset.c_str(), scrURL.m_url.c_str());
  }
  else if (!reportedCharset.empty())
  {
    CLog::Log(LOGDEBUG, "%s: Using \"%s\" charset for \"%s\"", __FUNCTION__, reportedCharset.c_str(), scrURL.m_url.c_str());
    if (reportedCharset != "UTF-8")
    {
      std::string converted;
      g_charsetConverter.ToUtf8(reportedCharset, strHTML, converted);
      strHTML = converted;
    }
  }
  else
    CLog::Log(LOGDEBUG, "%s: Using content of \"%s\" as binary or text with \"UTF-8\" charset", __FUNCTION__, scrURL.m_url.c_str());

  if (!scrURL.m_cache.empty())
  {
    // strCachePath was already computed above for the same (non-empty) cache name
    XFILE::CFile file;
    if (!file.OpenForWrite(strCachePath, true) ||
        file.Write(strHTML.data(), strHTML.size()) != strHTML.size())
      return false;
  }

  return true;
}

/*!
 \brief Parse an episode guide XML string into URL entries.

 Accepted forms of strUrls:
   <episodeguide><url>...</url>...</episodeguide>   (each <url> is parsed by ParseElement)
   <episodeguide>...</episodeguide>                 (the element itself is parsed by ParseElement)
 \param strUrls XML string to parse
 \return false if strUrls is empty, has no root element or contains no
         top-level <episodeguide> element; true otherwise
 */
bool CScraperUrl::ParseEpisodeGuide(std::string strUrls)
{
  if (strUrls.empty())
    return false;

  // ok, now parse the xml file
  CXBMCTinyXML doc;
  /* strUrls is coming from internal sources so strUrls is always in UTF-8 */
  doc.Parse(strUrls, TIXML_ENCODING_UTF8);
  if (!doc.RootElement())
    return false;

  TiXmlHandle docHandle(&doc);
  TiXmlElement* link = docHandle.FirstChild("episodeguide").Element();
  // the document may have a root element other than <episodeguide>;
  // Element() is null in that case and must not be dereferenced
  if (!link)
    return false;

  if (link->FirstChildElement("url"))
  {
    for (link = link->FirstChildElement("url"); link; link = link->NextSiblingElement("url"))
      ParseElement(link);
  }
  else if (link->FirstChild() && link->FirstChild()->Value())
    ParseElement(link);

  return true;
}

/*!
 \brief Build the URL string for an entry, carrying the referer if one is set.
 \param entry entry to build the URL for
 \return entry.m_url, followed by "|Referer=" and the URL-encoded spoof value
         when entry.m_spoof is not empty
 */
std::string CScraperUrl::GetThumbURL(const CScraperUrl::SUrlEntry &entry)
{
  if (entry.m_spoof.empty())
    return entry.m_url;

  return entry.m_url + "|Referer=" + CURL::Encode(entry.m_spoof);
}

/*!
 \brief Append the URLs of all entries matching an aspect and season to thumbs.

 An entry matches the aspect if type is empty or "thumb", if the entry has no
 aspect, or if the entry's aspect equals type. It matches the season if it is
 a general entry and season is -1, or a season entry with the same season.
 \param thumbs [out] vector the URLs (see GetThumbURL()) are appended to
 \param type aspect to select
 \param season season to select, -1 for general entries
 */
void CScraperUrl::GetThumbURLs(std::vector<std::string> &thumbs, const std::string &type, int season) const
{
  for (vector<SUrlEntry>::const_iterator iter = m_url.begin(); iter != m_url.end(); ++iter)
  {
    if (iter->m_aspect == type || type.empty() || type == "thumb" || iter->m_aspect.empty())
    {
      if ((iter->m_type == CScraperUrl::URL_TYPE_GENERAL && season == -1) ||
          (iter->m_type == CScraperUrl::URL_TYPE_SEASON && iter->m_season == season))
      {
        thumbs.push_back(GetThumbURL(*iter));
      }
    }
  }
}