about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- README.md 2
-rwxr-xr-x devscripts/release.sh 2
-rw-r--r-- test/test_download.py 23
-rw-r--r-- test/test_youtube_lists.py 6
-rw-r--r-- test/tests.json 221
-rw-r--r-- youtube_dl/FileDownloader.py 3
-rwxr-xr-x youtube_dl/InfoExtractors.py 972
-rw-r--r-- youtube_dl/__init__.py 2
-rw-r--r-- youtube_dl/utils.py 9
-rw-r--r-- youtube_dl/version.py 2
10 files changed, 682 insertions, 560 deletions
diff --git a/README.md b/README.md
index 2f3c81a7c..ccab537e7 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
% YOUTUBE-DL(1)
# NAME
-youtube-dl
+youtube-dl - download videos from youtube.com or other video platforms
# SYNOPSIS
**youtube-dl** [OPTIONS] URL [URL...]
diff --git a/devscripts/release.sh b/devscripts/release.sh
index b2a91f817..b8efdab47 100755
--- a/devscripts/release.sh
+++ b/devscripts/release.sh
@@ -22,7 +22,7 @@ if [ ! -f "updates_key.pem" ]; then echo 'ERROR: updates_key.pem missing'; exit
/bin/echo -e "\n### First of all, testing..."
make cleanall
-nosetests --with-coverage --cover-package=youtube_dl --cover-html test --stop || exit 1
+nosetests --verbose --with-coverage --cover-package=youtube_dl --cover-html test --stop || exit 1
/bin/echo -e "\n### Changing version in version.py..."
sed -i "s/__version__ = '.*'/__version__ = '$version'/" youtube_dl/version.py
diff --git a/test/test_download.py b/test/test_download.py
index 3eca333f2..84b3204fe 100644
--- a/test/test_download.py
+++ b/test/test_download.py
@@ -7,8 +7,8 @@ import os
import json
import unittest
import sys
-import hashlib
import socket
+import binascii
# Allow direct execution
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@@ -38,11 +38,16 @@ def _try_rm(filename):
if ose.errno != errno.ENOENT:
raise
+def md5(s):
+    """Return the hexadecimal MD5 digest of the text *s*, encoded as UTF-8."""
+    return hashlib.md5(s.encode('utf-8')).hexdigest()
+
class FileDownloader(youtube_dl.FileDownloader):
def __init__(self, *args, **kwargs):
self.to_stderr = self.to_screen
self.processed_info_dicts = []
return youtube_dl.FileDownloader.__init__(self, *args, **kwargs)
+    def report_warning(self, message):
+        """Turn every warning into an ExtractorError: tests must not accept warnings."""
+        raise ExtractorError(message)
def process_info(self, info_dict):
self.processed_info_dicts.append(info_dict)
return youtube_dl.FileDownloader.process_info(self, info_dict)
@@ -121,7 +126,21 @@ def generator(test_case):
with io.open(tc['file'] + '.info.json', encoding='utf-8') as infof:
info_dict = json.load(infof)
for (info_field, value) in tc.get('info_dict', {}).items():
- self.assertEqual(value, info_dict.get(info_field))
+ if isinstance(value, compat_str) and value.startswith('md5:'):
+ self.assertEqual(value, 'md5:' + md5(info_dict.get(info_field)))
+ else:
+ self.assertEqual(value, info_dict.get(info_field), u'invalid value for field ' + info_field)
+
+ # If checkable fields are missing from the test case, print the info_dict
+ test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value))
+ for key, value in info_dict.items()
+ if value and key in ('title', 'description', 'uploader', 'upload_date', 'uploader_id', 'location'))
+ if not all(key in tc.get('info_dict', {}).keys() for key in test_info_dict.keys()):
+ sys.stderr.write(u'\n"info_dict": ' + json.dumps(test_info_dict, ensure_ascii=False, indent=2) + u'\n')
+
+ # Check for the presence of mandatory fields
+ for key in ('id', 'url', 'title', 'ext'):
+ self.assertTrue(key in info_dict.keys() and info_dict[key])
finally:
for tc in test_cases:
_try_rm(tc['file'])
diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py
index b842e6cc1..e8b49ff8e 100644
--- a/test/test_youtube_lists.py
+++ b/test/test_youtube_lists.py
@@ -104,5 +104,11 @@ class TestYoutubeLists(unittest.TestCase):
result = ie.extract('https://www.youtube.com/user/TheLinuxFoundation')[0]
self.assertTrue(len(result['entries']) >= 320)
+    def test_youtube_safe_search(self):
+        # NOTE(review): presumably this playlist holds an entry that is only
+        # listed when safe search is disabled -- confirm against the playlist.
+        ie = YoutubePlaylistIE(FakeDownloader())
+        playlist = ie.extract('PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl')[0]
+        entries = playlist['entries']
+        self.assertEqual(len(entries), 2)
+
if __name__ == '__main__':
unittest.main()
diff --git a/test/tests.json b/test/tests.json
index 6c2373321..b112e6318 100644
--- a/test/tests.json
+++ b/test/tests.json
@@ -15,43 +15,76 @@
"name": "Dailymotion",
"md5": "392c4b85a60a90dc4792da41ce3144eb",
"url": "http://www.dailymotion.com/video/x33vw9_tutoriel-de-youtubeur-dl-des-video_tech",
- "file": "x33vw9.mp4"
+ "file": "x33vw9.mp4",
+ "info_dict": {
+ "uploader": "Alex and Van .",
+ "title": "Tutoriel de Youtubeur\"DL DES VIDEO DE YOUTUBE\""
+ }
},
{
"name": "Metacafe",
"add_ie": ["Youtube"],
"url": "http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/",
- "file": "_aUehQsCQtM.flv"
+ "file": "_aUehQsCQtM.flv",
+ "info_dict": {
+ "upload_date": "20090102",
+ "title": "The Electric Company | \"Short I\" | PBS KIDS GO!",
+ "description": "md5:2439a8ef6d5a70e380c22f5ad323e5a8",
+ "uploader": "PBS",
+ "uploader_id": "PBS"
+ }
},
{
"name": "BlipTV",
"md5": "b2d849efcf7ee18917e4b4d9ff37cafe",
"url": "http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352",
- "file": "5779306.m4v"
+ "file": "5779306.m4v",
+ "info_dict": {
+ "upload_date": "20111205",
+ "description": "md5:9bc31f227219cde65e47eeec8d2dc596",
+ "uploader": "Comic Book Resources - CBR TV",
+ "title": "CBR EXCLUSIVE: \"Gotham City Imposters\" Bats VS Jokerz Short 3"
+ }
},
{
"name": "XVideos",
"md5": "1d0c835822f0a71a7bf011855db929d0",
"url": "http://www.xvideos.com/video939581/funny_porns_by_s_-1",
- "file": "939581.flv"
+ "file": "939581.flv",
+ "info_dict": {
+ "title": "Funny Porns By >>>>S<<<<<< -1"
+ }
},
{
"name": "YouPorn",
"md5": "c37ddbaaa39058c76a7e86c6813423c1",
"url": "http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/",
- "file": "505835.mp4"
+ "file": "505835.mp4",
+ "info_dict": {
+ "upload_date": "20101221",
+ "description": "Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?",
+ "uploader": "Ask Dan And Jennifer",
+ "title": "Sex Ed: Is It Safe To Masturbate Daily?"
+ }
},
{
"name": "Pornotube",
"md5": "374dd6dcedd24234453b295209aa69b6",
"url": "http://pornotube.com/c/173/m/1689755/Marilyn-Monroe-Bathing",
- "file": "1689755.flv"
+ "file": "1689755.flv",
+ "info_dict": {
+ "upload_date": "20090708",
+ "title": "Marilyn-Monroe-Bathing"
+ }
},
{
"name": "YouJizz",
"md5": "07e15fa469ba384c7693fd246905547c",
"url": "http://www.youjizz.com/videos/zeichentrick-1-2189178.html",
- "file": "2189178.flv"
+ "file": "2189178.flv",
+ "info_dict": {
+ "title": "Zeichentrick 1"
+ }
},
{
"name": "Vimeo",
@@ -70,61 +103,103 @@
"name": "Soundcloud",
"md5": "ebef0a451b909710ed1d7787dddbf0d7",
"url": "http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy",
- "file": "62986583.mp3"
+ "file": "62986583.mp3",
+ "info_dict": {
+ "upload_date": "20121011",
+ "description": "No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd",
+ "uploader": "E.T. ExTerrestrial Music",
+ "title": "Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1"
+ }
},
{
"name": "StanfordOpenClassroom",
"md5": "544a9468546059d4e80d76265b0443b8",
"url": "http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100",
- "file": "PracticalUnix_intro-environment.mp4"
+ "file": "PracticalUnix_intro-environment.mp4",
+ "info_dict": {
+ "title": "Intro Environment"
+ }
},
{
"name": "XNXX",
"md5": "0831677e2b4761795f68d417e0b7b445",
"url": "http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_",
- "file": "1135332.flv"
+ "file": "1135332.flv",
+ "info_dict": {
+ "title": "lida » Naked Funny Actress (5)"
+ }
},
{
"name": "Youku",
"url": "http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html",
"file": "XNDgyMDQ2NTQw_part00.flv",
"md5": "ffe3f2e435663dc2d1eea34faeff5b5b",
- "params": { "test": false }
+ "params": { "test": false },
+ "info_dict": {
+ "title": "youtube-dl test video \"'/\\ä↭𝕐"
+ }
},
{
"name": "NBA",
"url": "http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html",
"file": "0021200253-okc-bkn-recap.nba.mp4",
- "md5": "c0edcfc37607344e2ff8f13c378c88a4"
+ "md5": "c0edcfc37607344e2ff8f13c378c88a4",
+ "info_dict": {
+ "description": "Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.",
+ "title": "Thunder vs. Nets"
+ }
},
{
"name": "JustinTV",
"url": "http://www.twitch.tv/thegamedevhub/b/296128360",
"file": "296128360.flv",
- "md5": "ecaa8a790c22a40770901460af191c9a"
+ "md5": "ecaa8a790c22a40770901460af191c9a",
+ "info_dict": {
+ "upload_date": "20110927",
+ "uploader_id": 25114803,
+ "uploader": "thegamedevhub",
+ "title": "Beginner Series - Scripting With Python Pt.1"
+ }
},
{
"name": "MyVideo",
"url": "http://www.myvideo.de/watch/8229274/bowling_fail_or_win",
"file": "8229274.flv",
- "md5": "2d2753e8130479ba2cb7e0a37002053e"
+ "md5": "2d2753e8130479ba2cb7e0a37002053e",
+ "info_dict": {
+ "title": "bowling-fail-or-win"
+ }
},
{
"name": "Escapist",
"url": "http://www.escapistmagazine.com/videos/view/the-escapist-presents/6618-Breaking-Down-Baldurs-Gate",
"file": "6618-Breaking-Down-Baldurs-Gate.mp4",
- "md5": "c6793dbda81388f4264c1ba18684a74d"
+ "md5": "c6793dbda81388f4264c1ba18684a74d",
+ "info_dict": {
+ "description": "Baldur's Gate: Original, Modded or Enhanced Edition? I'll break down what you can expect from the new Baldur's Gate: Enhanced Edition.",
+ "uploader": "the-escapist-presents",
+ "title": "Breaking Down Baldur's Gate"
+ }
},
{
"name": "GooglePlus",
"url": "https://plus.google.com/u/0/108897254135232129896/posts/ZButuJc6CtH",
- "file": "ZButuJc6CtH.flv"
+ "file": "ZButuJc6CtH.flv",
+ "info_dict": {
+ "upload_date": "20120613",
+ "uploader": "井上ヨシマサ",
+ "title": "嘆きの天使 降臨"
+ }
},
{
"name": "FunnyOrDie",
"url": "http://www.funnyordie.com/videos/0732f586d7/heart-shaped-box-literal-video-version",
"file": "0732f586d7.mp4",
- "md5": "f647e9e90064b53b6e046e75d0241fbd"
+ "md5": "f647e9e90064b53b6e046e75d0241fbd",
+ "info_dict": {
+ "description": "Lyrics changed to match the video. Spoken cameo by Obscurus Lupa (from ThatGuyWithTheGlasses.com). Based on a concept by Dustin McLean (DustFilms.com). Performed, edited, and written by David A. Scott.",
+ "title": "Heart-Shaped Box: Literal Video Version"
+ }
},
{
"name": "Steam",
@@ -161,6 +236,7 @@
"url": "http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things",
"file": "12-jan-pythonthings.mp4",
"info_dict": {
+ "description": "Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.",
"title": "A Few of My Favorite [Python] Things"
},
"params": {
@@ -173,7 +249,10 @@
"file": "422212.mp4",
"md5": "4e2f5cb088a83cd8cdb7756132f9739d",
"info_dict": {
- "title": "thedailyshow-kristen-stewart part 1"
+ "upload_date": "20121214",
+ "description": "Kristen Stewart",
+ "uploader": "thedailyshow",
+ "title": "thedailyshow-kristen-stewart part 1"
}
},
{
@@ -224,42 +303,48 @@
"file": "11885679.m4a",
"md5": "d30b5b5f74217410f4689605c35d1fd7",
"info_dict": {
- "title": "youtube-dl project as well - youtube-dl test track 3 \"'/\\\u00e4\u21ad"
+ "title": "youtube-dl project as well - youtube-dl test track 3 \"'/\\\u00e4\u21ad",
+ "uploader_id": "ytdl"
}
},
{
"file": "11885680.m4a",
"md5": "4eb0a669317cd725f6bbd336a29f923a",
"info_dict": {
- "title": "youtube-dl project as well - youtube-dl test track 4 \"'/\\\u00e4\u21ad"
+ "title": "youtube-dl project as well - youtube-dl test track 4 \"'/\\\u00e4\u21ad",
+ "uploader_id": "ytdl"
}
},
{
"file": "11885682.m4a",
"md5": "1893e872e263a2705558d1d319ad19e8",
"info_dict": {
- "title": "PH - youtube-dl test track 5 \"'/\\\u00e4\u21ad"
+ "title": "PH - youtube-dl test track 5 \"'/\\\u00e4\u21ad",
+ "uploader_id": "ytdl"
}
},
{
"file": "11885683.m4a",
"md5": "b673c46f47a216ab1741ae8836af5899",
"info_dict": {
- "title": "PH - youtube-dl test track 6 \"'/\\\u00e4\u21ad"
+ "title": "PH - youtube-dl test track 6 \"'/\\\u00e4\u21ad",
+ "uploader_id": "ytdl"
}
},
{
"file": "11885684.m4a",
"md5": "1d74534e95df54986da7f5abf7d842b7",
"info_dict": {
- "title": "phihag - youtube-dl test track 7 \"'/\\\u00e4\u21ad"
+ "title": "phihag - youtube-dl test track 7 \"'/\\\u00e4\u21ad",
+ "uploader_id": "ytdl"
}
},
{
"file": "11885685.m4a",
"md5": "f081f47af8f6ae782ed131d38b9cd1c0",
"info_dict": {
- "title": "phihag - youtube-dl test track 8 \"'/\\\u00e4\u21ad"
+ "title": "phihag - youtube-dl test track 8 \"'/\\\u00e4\u21ad",
+ "uploader_id": "ytdl"
}
}
]
@@ -270,18 +355,18 @@
"file": "NODfbab.mp4",
"md5": "9b0636f8c0f7614afa4ea5e4c6e57e83",
"info_dict": {
+ "uploader": "ytdl",
"title": "test chars: \"'/\\ä<>This is a test video for youtube-dl.For more information, contact phihag@phihag.de ."
}
-
},
{
"name": "TED",
"url": "http://www.ted.com/talks/dan_dennett_on_our_consciousness.html",
"file": "102.mp4",
- "md5": "7bc087e71d16f18f9b8ab9fa62a8a031",
+ "md5": "8cd9dfa41ee000ce658fd48fb5d89a61",
"info_dict": {
"title": "Dan Dennett: The illusion of consciousness",
- "thumbnail": "http://images.ted.com/images/ted/488_389x292.jpg"
+ "description": "md5:c6fa72e6eedbd938c9caf6b2702f5922"
}
},
{
@@ -290,14 +375,19 @@
"file": "11741.mp4",
"md5": "0b49f4844a068f8b33f4b7c88405862b",
"info_dict": {
- "title": "Absolute Mehrheit vom 17.02.2013 - Die Highlights, Teil 2"
+ "description": "Wer kann in die Fußstapfen von Wolfgang Kubicki treten und die Mehrheit der Zuschauer hinter sich versammeln? Wird vielleicht sogar die Absolute Mehrheit geknackt und der Jackpot von 200.000 Euro mit nach Hause genommen?",
+ "title": "Absolute Mehrheit vom 17.02.2013 - Die Highlights, Teil 2"
}
},
{
"name": "Generic",
"url": "http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html",
"file": "13601338388002.mp4",
- "md5": "85b90ccc9d73b4acd9138d3af4c27f89"
+ "md5": "85b90ccc9d73b4acd9138d3af4c27f89",
+ "info_dict": {
+ "uploader": "www.hodiho.fr",
+ "title": "Régis plante sa Jeep"
+ }
},
{
"name": "Spiegel",
@@ -325,7 +415,7 @@
"file": "wshh6a7q1ny0G34ZwuIO.mp4",
"md5": "9d04de741161603bf7071bbf4e883186",
"info_dict": {
- "title": "Video: KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick! "
+ "title": "Video: KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!"
}
},
{
@@ -340,11 +430,11 @@
},
{
"name": "Tumblr",
- "url": "http://birthdayproject2012.tumblr.com/post/17258355236/a-sample-video-from-leeann-if-you-need-an-idea",
- "file": "17258355236.mp4",
- "md5": "7c6a514d691b034ccf8567999e9e88a3",
+ "url": "http://resigno.tumblr.com/post/53364321212/e-de-extrema-importancia-que-esse-video-seja",
+ "file": "53364321212.mp4",
+ "md5": "0716d3dd51baf68a28b40fdf1251494e",
"info_dict": {
- "title": "Calling all Pris! - A sample video from LeeAnn. (If you need an idea..."
+ "title": "Rafael Lemos | Tumblr"
}
},
{
@@ -355,42 +445,59 @@
"file":"30510138.mp3",
"md5":"f9136bf103901728f29e419d2c70f55d",
"info_dict": {
- "title":"D-D-Dance"
+ "upload_date": "20111213",
+ "description": "The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com",
+ "uploader": "The Royal Concept",
+ "title": "D-D-Dance"
}
},
{
"file":"47127625.mp3",
"md5":"09b6758a018470570f8fd423c9453dd8",
"info_dict": {
- "title":"The Royal Concept - Gimme Twice"
+ "upload_date": "20120521",
+ "description": "The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com",
+ "uploader": "The Royal Concept",
+ "title": "The Royal Concept - Gimme Twice"
}
},
{
"file":"47127627.mp3",
"md5":"154abd4e418cea19c3b901f1e1306d9c",
"info_dict": {
- "title":"Goldrushed"
+ "upload_date": "20120521",
+ "uploader": "The Royal Concept",
+ "title": "Goldrushed"
}
},
{
"file":"47127629.mp3",
"md5":"2f5471edc79ad3f33a683153e96a79c1",
"info_dict": {
- "title":"In the End"
+ "upload_date": "20120521",
+ "description": "The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com",
+ "uploader": "The Royal Concept",
+ "title": "In the End"
}
},
{
"file":"47127631.mp3",
"md5":"f9ba87aa940af7213f98949254f1c6e2",
"info_dict": {
- "title":"Knocked Up"
+ "upload_date": "20120521",
+ "description": "The Royal Concept from Stockholm\r\nFilip / David / Povel / Magnus\r\nwww.theroyalconceptband.com",
+ "uploader": "The Royal Concept",
+ "title": "Knocked Up"
}
},
{
"file":"75206121.mp3",
"md5":"f9d1fe9406717e302980c30de4af9353",
"info_dict": {
- "title":"World On Fire"
+ "upload_date": "20130116",
+ "description": "The unreleased track World on Fire premiered on the CW's hit show Arrow (8pm/7pm central). \r\nAs a gift to our fans we would like to offer you a free download of the track! ",
+ "uploader": "The Royal Concept",
+ "title": "World On Fire"
}
}
]
@@ -419,8 +526,10 @@
"url": "http://media.photobucket.com/user/rachaneronas/media/TiredofLinkBuildingTryBacklinkMyDomaincom_zpsc0c3b9fa.mp4.html?filters[term]=search&filters[primary]=videos&filters[secondary]=images&sort=1&o=0",
"file": "zpsc0c3b9fa.mp4",
"md5": "7dabfb92b0a31f6c16cebc0f8e60ff99",
- "info_dict":{
- "title":"Tired of Link Building? Try BacklinkMyDomain.com!"
+ "info_dict": {
+ "upload_date": "20130504",
+ "uploader": "rachaneronas",
+ "title": "Tired of Link Building? Try BacklinkMyDomain.com!"
}
},
{
@@ -488,8 +597,10 @@
"url": "http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html",
"file": "1509445.flv",
"md5": "9f48e0e8d58e3076bb236ff412ab62fa",
- "info_dict":{
- "title":"FemaleAgent Shy beauty takes the bait"
+ "info_dict": {
+ "upload_date": "20121014",
+ "uploader_id": "Ruseful2011",
+ "title": "FemaleAgent Shy beauty takes the bait"
}
},
{
@@ -498,7 +609,7 @@
"file": "1v6ga.mp3",
"md5": "b9cc91b5af8995e9f0c1cee04c575828",
"info_dict":{
- "title":"TAME"
+ "title":"Tame"
}
},
{
@@ -509,5 +620,25 @@
"info_dict":{
"title":"Смях! Чудо - чист за секунди - Скрита камера"
}
+ },
+ {
+ "name": "Gametrailers",
+ "url": "http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer",
+ "file": "zbvr8i.flv",
+ "md5": "c3edbc995ab4081976e16779bd96a878",
+ "info_dict": {
+ "title": "E3 2013: Debut Trailer"
+ },
+ "skip": "Requires rtmpdump"
+ },
+ {
+ "name": "Statigram",
+ "url": "http://statigr.am/p/484091715184808010_284179915",
+ "file": "484091715184808010_284179915.mp4",
+ "md5": "deda4ff333abe2e118740321e992605b",
+ "info_dict": {
+ "uploader": "videoseconds",
+ "title": "Instagram photo by @videoseconds (Videos)"
+ }
}
]
diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py
index 72f03c217..f4ce48046 100644
--- a/youtube_dl/FileDownloader.py
+++ b/youtube_dl/FileDownloader.py
@@ -322,6 +322,9 @@ class FileDownloader(object):
filetime = timeconvert(timestr)
if filetime is None:
return filetime
+ # Ignore obviously invalid dates
+ if filetime == 0:
+ return
try:
os.utime(filename, (time.time(), filetime))
except:
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index e27e0cb7c..574d417be 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -191,6 +191,47 @@ class InfoExtractor(object):
video_info['title'] = playlist_title
return video_info
+    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
+        """
+        Perform a regex search on the given string, using a single pattern or
+        a list of patterns, and return the first matching group.
+
+        pattern -- a pattern string, a compiled regex, or an iterable of
+                   those; they are tried in order and the first match wins.
+        string  -- the text to search in.
+        name    -- human readable field name, used in the error/warning text.
+        default -- value returned when nothing matches (ignored when None).
+        fatal   -- when nothing matches and there is no default: raise an
+                   ExtractorError if true, otherwise report a warning
+                   through the downloader and return None.
+        flags   -- flags handed to re.search().
+
+        If a pattern matches but none of its groups took part in the match
+        (or it has no groups at all), the whole match is returned.
+        """
+        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
+            mobj = re.search(pattern, string, flags)
+        else:
+            # Start from None so that an empty pattern list simply counts as
+            # "no match" instead of leaving mobj unbound.
+            mobj = None
+            for p in pattern:
+                mobj = re.search(p, string, flags)
+                if mobj:
+                    break
+
+        if mobj:
+            # Return the first group that matched; fall back to the whole
+            # match so a group-less pattern cannot leak a bare StopIteration.
+            return next((g for g in mobj.groups() if g is not None), mobj.group(0))
+
+        if default is not None:
+            return default
+
+        # Only the failure paths below need the field name, so the terminal
+        # check is not performed for successful lookups.
+        if sys.stderr.isatty() and os.name != 'nt':
+            _name = u'\033[0;34m%s\033[0m' % name
+        else:
+            _name = name
+
+        if fatal:
+            raise ExtractorError(u'Unable to extract %s' % _name)
+
+        self._downloader.report_warning(u'unable to extract %s; '
+            u'please report this issue on GitHub.' % _name)
+        return None
+
+    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
+        """
+        Same lookup as _search_regex, with the same arguments, but a non-empty
+        result is passed through clean_html() (strips HTML tags and unescapes
+        entities) and has surrounding whitespace removed.
+        Empty or None results are handed back unchanged.
+        """
+        found = self._search_regex(pattern, string, name, default, fatal, flags)
+        if not found:
+            return found
+        return clean_html(found).strip()
+
class SearchInfoExtractor(InfoExtractor):
"""
Base class for paged search queries extractors.
@@ -379,7 +420,7 @@ class YoutubeIE(InfoExtractor):
def _request_automatic_caption(self, video_id, webpage):
"""We need the webpage for getting the captions url, pass it as an
argument to speed up the process."""
- sub_lang = self._downloader.params.get('subtitleslang')
+ sub_lang = self._downloader.params.get('subtitleslang') or 'en'
sub_format = self._downloader.params.get('subtitlesformat')
self.to_screen(u'%s: Looking for automatic captions' % video_id)
mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
@@ -658,14 +699,14 @@ class YoutubeIE(InfoExtractor):
pass
else:
# We report the original error
- self._downloader.report_error(sub_error)
+ self._downloader.report_warning(sub_error)
if self._downloader.params.get('allsubtitles', False):
video_subtitles = self._extract_all_subtitles(video_id)
for video_subtitle in video_subtitles:
(sub_error, sub_lang, sub) = video_subtitle
if sub_error:
- self._downloader.report_error(sub_error)
+ self._downloader.report_warning(sub_error)
if self._downloader.params.get('listsubtitles', False):
sub_lang_list = self._list_available_subtitles(video_id)
@@ -691,8 +732,11 @@ class YoutubeIE(InfoExtractor):
for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
url_data = compat_parse_qs(url_data_str)
if 'itag' in url_data and 'url' in url_data:
- url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
- if not 'ratebypass' in url: url += '&ratebypass=yes'
+ url = url_data['url'][0]
+ if 'sig' in url_data:
+ url += '&signature=' + url_data['sig'][0]
+ if 'ratebypass' not in url:
+ url += '&ratebypass=yes'
url_map[url_data['itag'][0]] = url
format_limit = self._downloader.params.get('format_limit', None)
@@ -899,16 +943,10 @@ class DailymotionIE(InfoExtractor):
video_title = unescapeHTML(mobj.group('title'))
video_uploader = None
- mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
- if mobj is None:
- # lookin for official user
- mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
- if mobj_official is None:
- self._downloader.report_warning(u'unable to extract uploader nickname')
- else:
- video_uploader = mobj_official.group(1)
- else:
- video_uploader = mobj.group(1)
+ video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
+ # Looking for official user
+ r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
+ webpage, 'video uploader')
video_upload_date = None
mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
@@ -964,18 +1002,13 @@ class PhotobucketIE(InfoExtractor):
}]
# We try looking in other parts of the webpage
- mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract media URL')
- mediaURL = compat_urllib_parse.unquote(mobj.group(1))
-
- video_url = mediaURL
+ video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
+ webpage, u'video URL')
mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
if mobj is None:
raise ExtractorError(u'Unable to extract title')
video_title = mobj.group(1).decode('utf-8')
-
video_uploader = mobj.group(2).decode('utf-8')
return [{
@@ -1400,6 +1433,13 @@ class GenericIE(InfoExtractor):
# Try to find twitter cards info
mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
if mobj is None:
+ # We look for Open Graph info:
+ # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
+ m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
+ # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
+ if m_video_type is not None:
+ mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
+ if mobj is None:
raise ExtractorError(u'Invalid URL: %s' % url)
# It's possible that one of the regexes
@@ -1420,16 +1460,12 @@ class GenericIE(InfoExtractor):
# Site Name | Video Title
# Video Title - Tagline | Site Name
# and so on and so forth; it's just not practical
- mobj = re.search(r'<title>(.*)</title>', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract title')
- video_title = mobj.group(1)
+ video_title = self._html_search_regex(r'<title>(.*)</title>',
+ webpage, u'video title')
# video uploader is domain name
- mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
- if mobj is None:
- raise ExtractorError(u'Unable to extract title')
- video_uploader = mobj.group(1)
+ video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
+ url, u'video uploader')
return [{
'id': video_id,
@@ -1567,7 +1603,7 @@ class YoutubePlaylistIE(InfoExtractor):
|
((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
)"""
- _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
+ _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
_MAX_RESULTS = 50
IE_NAME = u'youtube:playlist'
@@ -1603,9 +1639,10 @@ class YoutubePlaylistIE(InfoExtractor):
# Number of videos is a multiple of self._MAX_RESULTS
break
- videos += [ (entry['yt$position']['$t'], entry['content']['src'])
- for entry in response['feed']['entry']
- if 'content' in entry ]
+ for entry in response['feed']['entry']:
+ index = entry['yt$position']['$t']
+ if 'media$group' in entry and 'media$player' in entry['media$group']:
+ videos.append((index, entry['media$group']['media$player']['url']))
if len(response['feed']['entry']) < self._MAX_RESULTS:
break
@@ -1828,10 +1865,7 @@ class DepositFilesIE(InfoExtractor):
file_extension = os.path.splitext(file_url)[1][1:]
# Search for file title
- mobj = re.search(r'<b title="(.*?)">', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract title')
- file_title = mobj.group(1).decode('utf-8')
+ file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
return [{
'id': file_id.decode('utf-8'),
@@ -1925,10 +1959,8 @@ class FacebookIE(InfoExtractor):
video_duration = int(video_data['video_duration'])
thumbnail = video_data['thumbnail_src']
- m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
- if not m:
- raise ExtractorError(u'Cannot find title in webpage')
- video_title = unescapeHTML(m.group(1))
+ video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
+ webpage, u'title')
info = {
'id': video_id,
@@ -2090,15 +2122,10 @@ class MyVideoIE(InfoExtractor):
self.report_extraction(video_id)
video_url = mobj.group(1) + '.flv'
- mobj = re.search('<title>([^<]+)</title>', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract title')
- video_title = mobj.group(1)
+ video_title = self._html_search_regex('<title>([^<]+)</title>',
+ webpage, u'title')
- mobj = re.search('[.](.+?)$', video_url)
- if mobj is None:
- raise ExtractorError(u'Unable to extract extention')
- video_ext = mobj.group(1)
+ video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
return [{
'id': video_id,
@@ -2146,25 +2173,23 @@ class MyVideoIE(InfoExtractor):
# extracting infos
self.report_extraction(video_id)
+ video_url = None
mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
- if mobj is None:
- raise ExtractorError(u'unable to extract rtmpurl')
- video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1))
- if 'myvideo2flash' in video_rtmpurl:
- self._downloader.report_warning(u'forcing RTMPT ...')
- video_rtmpurl = video_rtmpurl.replace('rtmpe://', 'rtmpt://')
-
- # extract non rtmp videos
- if (video_rtmpurl is None) or (video_rtmpurl == ''):
+ if mobj:
+ video_url = compat_urllib_parse.unquote(mobj.group(1))
+ if 'myvideo2flash' in video_url:
+ self._downloader.report_warning(u'forcing RTMPT ...')
+ video_url = video_url.replace('rtmpe://', 'rtmpt://')
+
+ if not video_url:
+ # extract non rtmp videos
mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
if mobj is None:
raise ExtractorError(u'unable to extract url')
- video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
+ video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
- mobj = re.search('source=\'(.*?)\'', dec_data)
- if mobj is None:
- raise ExtractorError(u'unable to extract swfobj')
- video_file = compat_urllib_parse.unquote(mobj.group(1))
+ video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
+ video_file = compat_urllib_parse.unquote(video_file)
if not video_file.endswith('f4m'):
ppath, prefix = video_file.split('.')
@@ -2176,20 +2201,16 @@ class MyVideoIE(InfoExtractor):
video_filepath + video_file
).replace('.f4m', '.m3u8')
- mobj = re.search('swfobject.embedSWF\(\'(.+?)\'', webpage)
- if mobj is None:
- raise ExtractorError(u'unable to extract swfobj')
- video_swfobj = compat_urllib_parse.unquote(mobj.group(1))
+ video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
+ video_swfobj = compat_urllib_parse.unquote(video_swfobj)
- mobj = re.search("<h1(?: class='globalHd')?>(.*?)</h1>", webpage)
- if mobj is None:
- raise ExtractorError(u'unable to extract title')
- video_title = mobj.group(1)
+ video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
+ webpage, u'title')
return [{
'id': video_id,
- 'url': video_rtmpurl,
- 'tc_url': video_rtmpurl,
+ 'url': video_url,
+ 'tc_url': video_url,
'uploader': None,
'upload_date': None,
'title': video_title,
@@ -2200,6 +2221,7 @@ class MyVideoIE(InfoExtractor):
'player_url': video_swfobj,
}]
+
class ComedyCentralIE(InfoExtractor):
"""Information extractor for The Daily Show and Colbert Report """
@@ -2381,19 +2403,25 @@ class EscapistIE(InfoExtractor):
showName = mobj.group('showname')
videoId = mobj.group('episode')
- self.report_extraction(showName)
- webPage = self._download_webpage(url, showName)
+ self.report_extraction(videoId)
+ webpage = self._download_webpage(url, videoId)
+
+ videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
+ webpage, u'description', fatal=False)
+
+ imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
+ webpage, u'thumbnail', fatal=False)
+
+ playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
+ webpage, u'player url')
- descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
- description = unescapeHTML(descMatch.group(1))
- imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
- imgUrl = unescapeHTML(imgMatch.group(1))
- playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
- playerUrl = unescapeHTML(playerUrlMatch.group(1))
- configUrlMatch = re.search('config=(.*)$', playerUrl)
- configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
+ title = self._html_search_regex('<meta name="title" content="([^"]*)"',
+ webpage, u'player url').split(' : ')[-1]
- configJSON = self._download_webpage(configUrl, showName,
+ configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
+ configUrl = compat_urllib_parse.unquote(configUrl)
+
+ configJSON = self._download_webpage(configUrl, videoId,
u'Downloading configuration',
u'unable to download configuration')
@@ -2413,10 +2441,10 @@ class EscapistIE(InfoExtractor):
'url': videoUrl,
'uploader': showName,
'upload_date': None,
- 'title': showName,
+ 'title': title,
'ext': 'mp4',
'thumbnail': imgUrl,
- 'description': description,
+ 'description': videoDesc,
'player_url': playerUrl,
}
@@ -2501,26 +2529,17 @@ class XVideosIE(InfoExtractor):
self.report_extraction(video_id)
-
# Extract video URL
- mobj = re.search(r'flv_url=(.+?)&', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract video url')
- video_url = compat_urllib_parse.unquote(mobj.group(1))
-
+ video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
+ webpage, u'video URL'))
# Extract title
- mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract video title')
- video_title = mobj.group(1)
-
+ video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
+ webpage, u'title')
# Extract video thumbnail
- mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract video thumbnail')
- video_thumbnail = mobj.group(0)
+ video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
+ webpage, u'thumbnail', fatal=False)
info = {
'id': video_id,
@@ -2677,16 +2696,12 @@ class InfoQIE(InfoExtractor):
video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
# Extract title
- mobj = re.search(r'contentTitle = "(.*?)";', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract video title')
- video_title = mobj.group(1)
+ video_title = self._search_regex(r'contentTitle = "(.*?)";',
+ webpage, u'title')
# Extract description
- video_description = u'No description available.'
- mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
- if mobj is not None:
- video_description = mobj.group(1)
+ video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
+ webpage, u'description', fatal=False)
video_filename = video_url.split('/')[-1]
video_id, extension = video_filename.split('.')
@@ -2857,15 +2872,10 @@ class StanfordOpenClassroomIE(InfoExtractor):
note='Downloading course info page',
errnote='Unable to download course info page')
- m = re.search('<h1>([^<]+)</h1>', coursepage)
- if m:
- info['title'] = unescapeHTML(m.group(1))
- else:
- info['title'] = info['id']
+ info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
- m = re.search('<description>([^<]+)</description>', coursepage)
- if m:
- info['description'] = unescapeHTML(m.group(1))
+ info['description'] = self._html_search_regex('<description>([^<]+)</description>',
+ coursepage, u'description', fatal=False)
links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
info['list'] = [
@@ -2926,25 +2936,17 @@ class MTVIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
- mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract song name')
- song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
- mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract performer')
- performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
- video_title = performer + ' - ' + song_name
+ song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
+ webpage, u'song name', fatal=False)
- mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to mtvn_uri')
- mtvn_uri = mobj.group(1)
+ video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
+ webpage, u'title')
- mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract content id')
- content_id = mobj.group(1)
+ mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
+ webpage, u'mtvn_uri', fatal=False)
+
+ content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
+ webpage, u'content id', fatal=False)
videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
self.report_extraction(video_id)
@@ -3092,20 +3094,15 @@ class XNXXIE(InfoExtractor):
# Get webpage content
webpage = self._download_webpage(url, video_id)
- result = re.search(self.VIDEO_URL_RE, webpage)
- if result is None:
- raise ExtractorError(u'Unable to extract video url')
- video_url = compat_urllib_parse.unquote(result.group(1))
+ video_url = self._search_regex(self.VIDEO_URL_RE,
+ webpage, u'video URL')
+ video_url = compat_urllib_parse.unquote(video_url)
- result = re.search(self.VIDEO_TITLE_RE, webpage)
- if result is None:
- raise ExtractorError(u'Unable to extract video title')
- video_title = result.group(1)
+ video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
+ webpage, u'title')
- result = re.search(self.VIDEO_THUMB_RE, webpage)
- if result is None:
- raise ExtractorError(u'Unable to extract video thumbnail')
- video_thumbnail = result.group(1)
+ video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
+ webpage, u'thumbnail', fatal=False)
return [{
'id': video_id,
@@ -3125,26 +3122,6 @@ class GooglePlusIE(InfoExtractor):
_VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
IE_NAME = u'plus.google'
- def report_extract_entry(self, url):
- """Report downloading extry"""
- self.to_screen(u'Downloading entry: %s' % url)
-
- def report_date(self, upload_date):
- """Report downloading extry"""
- self.to_screen(u'Entry date: %s' % upload_date)
-
- def report_uploader(self, uploader):
- """Report downloading extry"""
- self.to_screen(u'Uploader: %s' % uploader)
-
- def report_title(self, video_title):
- """Report downloading extry"""
- self.to_screen(u'Title: %s' % video_title)
-
- def report_extract_vid_page(self, video_page):
- """Report information extraction."""
- self.to_screen(u'Extracting video page: %s' % video_page)
-
def _real_extract(self, url):
# Extract id from URL
mobj = re.match(self._VALID_URL, url)
@@ -3157,47 +3134,31 @@ class GooglePlusIE(InfoExtractor):
video_extension = 'flv'
# Step 1, Retrieve post webpage to extract further information
- self.report_extract_entry(post_url)
webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
+ self.report_extraction(video_id)
+
# Extract update date
- upload_date = None
- pattern = 'title="Timestamp">(.*?)</a>'
- mobj = re.search(pattern, webpage)
- if mobj:
- upload_date = mobj.group(1)
+ upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
+ webpage, u'upload date', fatal=False)
+ if upload_date:
# Convert timestring to a format suitable for filename
upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
upload_date = upload_date.strftime('%Y%m%d')
- self.report_date(upload_date)
# Extract uploader
- uploader = None
- pattern = r'rel\="author".*?>(.*?)</a>'
- mobj = re.search(pattern, webpage)
- if mobj:
- uploader = mobj.group(1)
- self.report_uploader(uploader)
+ uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
+ webpage, u'uploader', fatal=False)
# Extract title
# Get the first line for title
- video_title = u'NA'
- pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
- mobj = re.search(pattern, webpage)
- if mobj:
- video_title = mobj.group(1)
- self.report_title(video_title)
+ video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
+ webpage, 'title', default=u'NA')
# Step 2, Stimulate clicking the image box to launch video
- pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
- mobj = re.search(pattern, webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract video page URL')
-
- video_page = mobj.group(1)
+ video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
+ webpage, u'video page URL')
webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
- self.report_extract_vid_page(video_page)
-
# Extract video links on video page
"""Extract video links of all sizes"""
@@ -3230,7 +3191,7 @@ class GooglePlusIE(InfoExtractor):
}]
class NBAIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
+ _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
IE_NAME = u'nba'
def _real_extract(self, url):
@@ -3239,28 +3200,27 @@ class NBAIE(InfoExtractor):
raise ExtractorError(u'Invalid URL: %s' % url)
video_id = mobj.group(1)
- if video_id.endswith('/index.html'):
- video_id = video_id[:-len('/index.html')]
webpage = self._download_webpage(url, video_id)
video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
- def _findProp(rexp, default=None):
- m = re.search(rexp, webpage)
- if m:
- return unescapeHTML(m.group(1))
- else:
- return default
shortened_video_id = video_id.rpartition('/')[2]
- title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
+ title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
+ webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
+
+ # It isn't there in the HTML it returns to us
+ # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
+
+ description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
+
info = {
'id': shortened_video_id,
'url': video_url,
'ext': 'mp4',
'title': title,
- 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
- 'description': _findProp(r'<div class="description">(.*?)</h1>'),
+ # 'uploader_date': uploader_date,
+ 'description': description,
}
return [info]
@@ -3408,30 +3368,21 @@ class FunnyOrDieIE(InfoExtractor):
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
- m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
- if not m:
- raise ExtractorError(u'Unable to find video information')
- video_url = unescapeHTML(m.group('url'))
+ video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
+ webpage, u'video URL', flags=re.DOTALL)
- m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
- if not m:
- m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
- if not m:
- raise ExtractorError(u'Cannot find video title')
- title = clean_html(m.group('title'))
+ title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
+ r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
- m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
- if m:
- desc = unescapeHTML(m.group('desc'))
- else:
- desc = None
+ video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
+ webpage, u'description', fatal=False, flags=re.DOTALL)
info = {
'id': video_id,
'url': video_url,
'ext': 'mp4',
'title': title,
- 'description': desc,
+ 'description': video_description,
}
return [info]
@@ -3442,6 +3393,8 @@ class SteamIE(InfoExtractor):
(?P<gameID>\d+)/?
(?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
"""
+ _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
+ _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
@classmethod
def suitable(cls, url):
@@ -3451,11 +3404,19 @@ class SteamIE(InfoExtractor):
def _real_extract(self, url):
m = re.match(self._VALID_URL, url, re.VERBOSE)
gameID = m.group('gameID')
- videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
- self.report_age_confirmation()
+
+ videourl = self._VIDEO_PAGE_TEMPLATE % gameID
webpage = self._download_webpage(videourl, gameID)
- game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
-
+
+ if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
+ videourl = self._AGECHECK_TEMPLATE % gameID
+ self.report_age_confirmation()
+ webpage = self._download_webpage(videourl, gameID)
+
+ self.report_extraction(gameID)
+ game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
+ webpage, 'game title')
+
urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
mweb = re.finditer(urlRE, webpage)
namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
@@ -3487,27 +3448,29 @@ class UstreamIE(InfoExtractor):
def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
video_id = m.group('videoID')
+
video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
webpage = self._download_webpage(url, video_id)
+
self.report_extraction(video_id)
- try:
- m = re.search(r'data-title="(?P<title>.+)"',webpage)
- title = m.group('title')
- m = re.search(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
- webpage, re.DOTALL)
- uploader = unescapeHTML(m.group('uploader').strip())
- m = re.search(r'<link rel="image_src" href="(?P<thumb>.*?)"', webpage)
- thumb = m.group('thumb')
- except AttributeError:
- raise ExtractorError(u'Unable to extract info')
+
+ video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
+ webpage, u'title')
+
+ uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
+ webpage, u'uploader', fatal=False, flags=re.DOTALL)
+
+ thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
+ webpage, u'thumbnail', fatal=False)
+
info = {
- 'id':video_id,
- 'url':video_url,
+ 'id': video_id,
+ 'url': video_url,
'ext': 'flv',
- 'title': title,
+ 'title': video_title,
'uploader': uploader,
- 'thumbnail': thumb,
- }
+ 'thumbnail': thumbnail,
+ }
return info
class WorldStarHipHopIE(InfoExtractor):
@@ -3515,45 +3478,36 @@ class WorldStarHipHopIE(InfoExtractor):
IE_NAME = u'WorldStarHipHop'
def _real_extract(self, url):
- _src_url = r'so\.addVariable\("file","(.*?)"\)'
-
m = re.match(self._VALID_URL, url)
video_id = m.group('id')
- webpage_src = self._download_webpage(url, video_id)
+ webpage_src = self._download_webpage(url, video_id)
- mobj = re.search(_src_url, webpage_src)
+ video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
+ webpage_src, u'video URL')
- if mobj is not None:
- video_url = mobj.group(1)
- if 'mp4' in video_url:
- ext = 'mp4'
- else:
- ext = 'flv'
+ if 'mp4' in video_url:
+ ext = 'mp4'
else:
- raise ExtractorError(u'Cannot find video url for %s' % video_id)
+ ext = 'flv'
- mobj = re.search(r"<title>(.*)</title>", webpage_src)
-
- if mobj is None:
- raise ExtractorError(u'Cannot determine title')
- title = mobj.group(1)
+ video_title = self._html_search_regex(r"<title>(.*)</title>",
+ webpage_src, u'title')
- mobj = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
# Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
- if mobj is not None:
- thumbnail = mobj.group(1)
- else:
+ thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
+ webpage_src, u'thumbnail', fatal=False)
+
+ if not thumbnail:
_title = r"""candytitles.*>(.*)</span>"""
mobj = re.search(_title, webpage_src)
if mobj is not None:
- title = mobj.group(1)
- thumbnail = None
+ video_title = mobj.group(1)
results = [{
'id': video_id,
'url' : video_url,
- 'title' : title,
+ 'title' : video_title,
'thumbnail' : thumbnail,
'ext' : ext,
}]
@@ -3567,10 +3521,9 @@ class RBMARadioIE(InfoExtractor):
video_id = m.group('videoID')
webpage = self._download_webpage(url, video_id)
- m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
- if not m:
- raise ExtractorError(u'Cannot find metadata')
- json_data = m.group(1)
+
+ json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
+ webpage, u'json data', flags=re.MULTILINE)
try:
data = json.loads(json_data)
@@ -3617,42 +3570,33 @@ class YouPornIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
if mobj is None:
raise ExtractorError(u'Invalid URL: %s' % url)
-
video_id = mobj.group('videoid')
req = compat_urllib_request.Request(url)
req.add_header('Cookie', 'age_verified=1')
webpage = self._download_webpage(req, video_id)
- # Get the video title
- result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
- if result is None:
- raise ExtractorError(u'Unable to extract video title')
- video_title = result.group('title').strip()
-
- # Get the video date
- result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
- if result is None:
- self._downloader.report_warning(u'unable to extract video date')
- upload_date = None
- else:
- upload_date = unified_strdate(result.group('date').strip())
+ # Get JSON parameters
+ json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
+ try:
+ params = json.loads(json_params)
+ except:
+ raise ExtractorError(u'Invalid JSON')
- # Get the video uploader
- result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
- if result is None:
- self._downloader.report_warning(u'unable to extract uploader')
- video_uploader = None
- else:
- video_uploader = result.group('uploader').strip()
- video_uploader = clean_html( video_uploader )
+ self.report_extraction(video_id)
+ try:
+ video_title = params['title']
+ upload_date = unified_strdate(params['release_date_f'])
+ video_description = params['description']
+ video_uploader = params['submitted_by']
+ thumbnail = params['thumbnails'][0]['image']
+ except KeyError:
+ raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
# Get all of the formats available
DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
- result = re.search(DOWNLOAD_LIST_RE, webpage)
- if result is None:
- raise ExtractorError(u'Unable to extract download list')
- download_list_html = result.group('download_list').strip()
+ download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
+ webpage, u'download list').strip()
# Get all of the links from the page
LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
@@ -3676,19 +3620,18 @@ class YouPornIE(InfoExtractor):
size = format[0]
bitrate = format[1]
format = "-".join( format )
- title = u'%s-%s-%s' % (video_title, size, bitrate)
+ # title = u'%s-%s-%s' % (video_title, size, bitrate)
formats.append({
'id': video_id,
'url': video_url,
'uploader': video_uploader,
'upload_date': upload_date,
- 'title': title,
+ 'title': video_title,
'ext': extension,
'format': format,
- 'thumbnail': None,
- 'description': None,
- 'player_url': None
+ 'thumbnail': thumbnail,
+ 'description': video_description
})
if self._downloader.params.get('listformats', None):
@@ -3729,17 +3672,13 @@ class PornotubeIE(InfoExtractor):
# Get the video URL
VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
- result = re.search(VIDEO_URL_RE, webpage)
- if result is None:
- raise ExtractorError(u'Unable to extract video url')
- video_url = compat_urllib_parse.unquote(result.group('url'))
+ video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
+ video_url = compat_urllib_parse.unquote(video_url)
#Get the uploaded date
VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
- result = re.search(VIDEO_UPLOADED_RE, webpage)
- if result is None:
- raise ExtractorError(u'Unable to extract video title')
- upload_date = unified_strdate(result.group('date'))
+ upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
+ if upload_date: upload_date = unified_strdate(upload_date)
info = {'id': video_id,
'url': video_url,
@@ -3766,10 +3705,8 @@ class YouJizzIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
# Get the video title
- result = re.search(r'<title>(?P<title>.*)</title>', webpage)
- if result is None:
- raise ExtractorError(u'ERROR: unable to extract video title')
- video_title = result.group('title').strip()
+ video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
+ webpage, u'title').strip()
# Get the embed page
result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
@@ -3782,10 +3719,8 @@ class YouJizzIE(InfoExtractor):
webpage = self._download_webpage(embed_page_url, video_id)
# Get the video URL
- result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
- if result is None:
- raise ExtractorError(u'ERROR: unable to extract video url')
- video_url = result.group('source')
+ video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
+ webpage, u'video URL')
info = {'id': video_id,
'url': video_url,
@@ -3808,10 +3743,7 @@ class EightTracksIE(InfoExtractor):
webpage = self._download_webpage(url, playlist_id)
- m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
- if not m:
- raise ExtractorError(u'Cannot find trax information')
- json_like = m.group(1)
+ json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
data = json.loads(json_like)
session = str(random.randint(0, 1000000000))
@@ -3847,18 +3779,22 @@ class KeekIE(InfoExtractor):
def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
video_id = m.group('videoID')
+
video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
webpage = self._download_webpage(url, video_id)
- m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
- title = unescapeHTML(m.group('title'))
- m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
- uploader = clean_html(m.group('uploader'))
+
+ video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
+ webpage, u'title')
+
+ uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
+ webpage, u'uploader', fatal=False)
+
info = {
'id': video_id,
'url': video_url,
'ext': 'mp4',
- 'title': title,
+ 'title': video_title,
'thumbnail': thumbnail,
'uploader': uploader
}
@@ -3890,10 +3826,6 @@ class TEDIE(InfoExtractor):
self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
return [self._playlist_videos_info(url,name,playlist_id)]
- def _talk_video_link(self,mediaSlug):
- '''Returns the video link for that mediaSlug'''
- return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
-
def _playlist_videos_info(self,url,name,playlist_id=0):
'''Returns the videos of the playlist'''
video_RE=r'''
@@ -3906,9 +3838,8 @@ class TEDIE(InfoExtractor):
m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
m_names=re.finditer(video_name_RE,webpage)
- playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
- m_playlist = re.search(playlist_RE, webpage)
- playlist_title = m_playlist.group('playlist_title')
+ playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
+ webpage, 'playlist title')
playlist_entries = []
for m_video, m_name in zip(m_videos,m_names):
@@ -3919,27 +3850,28 @@ class TEDIE(InfoExtractor):
def _talk_info(self, url, video_id=0):
"""Return the video for the talk in the url"""
- m=re.match(self._VALID_URL, url,re.VERBOSE)
- videoName=m.group('name')
- webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
+ m = re.match(self._VALID_URL, url,re.VERBOSE)
+ video_name = m.group('name')
+ webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
+ self.report_extraction(video_name)
# If the url includes the language we get the title translated
- title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
- title=re.search(title_RE, webpage).group('title')
- info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
- "id":(?P<videoID>[\d]+).*?
- "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
- thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
- thumb_match=re.search(thumb_RE,webpage)
- info_match=re.search(info_RE,webpage,re.VERBOSE)
- video_id=info_match.group('videoID')
- mediaSlug=info_match.group('mediaSlug')
- video_url=self._talk_video_link(mediaSlug)
+ title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
+ webpage, 'title')
+ json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
+ webpage, 'json data')
+ info = json.loads(json_data)
+ desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
+ webpage, 'description', flags = re.DOTALL)
+
+ thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
+ webpage, 'thumbnail')
info = {
- 'id': video_id,
- 'url': video_url,
+ 'id': info['id'],
+ 'url': info['htmlStreams'][-1]['file'],
'ext': 'mp4',
'title': title,
- 'thumbnail': thumb_match.group('thumbnail')
+ 'thumbnail': thumbnail,
+ 'description': desc,
}
return info
@@ -4005,10 +3937,9 @@ class SpiegelIE(InfoExtractor):
video_id = m.group('videoID')
webpage = self._download_webpage(url, video_id)
- m = re.search(r'<div class="module-title">(.*?)</div>', webpage)
- if not m:
- raise ExtractorError(u'Cannot find title')
- video_title = unescapeHTML(m.group(1))
+
+ video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
+ webpage, u'title')
xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
xml_code = self._download_webpage(xml_url, video_id,
@@ -4044,35 +3975,25 @@ class LiveLeakIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
- m = re.search(r'file: "(.*?)",', webpage)
- if not m:
- raise ExtractorError(u'Unable to find video url')
- video_url = m.group(1)
+ video_url = self._search_regex(r'file: "(.*?)",',
+ webpage, u'video URL')
- m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
- if not m:
- raise ExtractorError(u'Cannot find video title')
- title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
+ video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
+ webpage, u'title').replace('LiveLeak.com -', '').strip()
- m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
- if m:
- desc = unescapeHTML(m.group('desc'))
- else:
- desc = None
+ video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
+ webpage, u'description', fatal=False)
- m = re.search(r'By:.*?(\w+)</a>', webpage)
- if m:
- uploader = clean_html(m.group(1))
- else:
- uploader = None
+ video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
+ webpage, u'uploader', fatal=False)
info = {
'id': video_id,
'url': video_url,
'ext': 'mp4',
- 'title': title,
- 'description': desc,
- 'uploader': uploader
+ 'title': video_title,
+ 'description': video_description,
+ 'uploader': video_uploader
}
return [info]
@@ -4188,23 +4109,23 @@ class TumblrIE(InfoExtractor):
re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
video = re.search(re_video, webpage)
if video is None:
- self.to_screen("No video found")
- return []
+ raise ExtractorError(u'Unable to extract video')
video_url = video.group('video_url')
ext = video.group('ext')
- re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster
- thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')
+ video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
+ webpage, u'thumbnail', fatal=False) # We pick the first poster
+ if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
# The only place where you can get a title, it's not complete,
# but searching in other places doesn't work for all videos
- re_title = r'<title>(?P<title>.*?)</title>'
- title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))
+ video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
+ webpage, u'title', flags=re.DOTALL)
return [{'id': video_id,
'url': video_url,
- 'title': title,
- 'thumbnail': thumb,
+ 'title': video_title,
+ 'thumbnail': video_thumbnail,
'ext': ext
}]
@@ -4218,7 +4139,7 @@ class BandcampIE(InfoExtractor):
# We get the link to the free download page
m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
if m_download is None:
- raise ExtractorError(u'No free songs founded')
+ raise ExtractorError(u'No free songs found')
download_link = m_download.group(1)
id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
@@ -4246,10 +4167,10 @@ class BandcampIE(InfoExtractor):
track_info = {'id':id,
'title' : info[u'title'],
- 'ext' : 'mp3',
- 'url' : final_url,
+ 'ext' : 'mp3',
+ 'url' : final_url,
'thumbnail' : info[u'thumb_url'],
- 'uploader' : info[u'artist']
+ 'uploader' : info[u'artist']
}
return [track_info]
@@ -4266,17 +4187,14 @@ class RedTubeIE(InfoExtractor):
video_id = mobj.group('id')
video_extension = 'mp4'
webpage = self._download_webpage(url, video_id)
+
self.report_extraction(video_id)
- mobj = re.search(r'<source src="'+'(.+)'+'" type="video/mp4">',webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract media URL')
+ video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
+ webpage, u'video URL')
- video_url = mobj.group(1)
- mobj = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>',webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract title')
- video_title = mobj.group(1)
+ video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
+ webpage, u'title')
return [{
'id': video_id,
@@ -4297,15 +4215,13 @@ class InaIE(InfoExtractor):
video_extension = 'mp4'
webpage = self._download_webpage(mrss_url, video_id)
- mobj = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract media URL')
- video_url = mobj.group(1)
+ self.report_extraction(video_id)
- mobj = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract title')
- video_title = mobj.group(1)
+ video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
+ webpage, u'video URL')
+
+ video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
+ webpage, u'title')
return [{
'id': video_id,
@@ -4327,27 +4243,17 @@ class HowcastIE(InfoExtractor):
self.report_extraction(video_id)
- mobj = re.search(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)"', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract video URL')
- video_url = mobj.group(1)
+ video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
+ webpage, u'video URL')
- mobj = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract title')
- video_title = mobj.group(1) or mobj.group(2)
+ video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
+ webpage, u'title')
- mobj = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'', webpage)
- if mobj is None:
- self._downloader.report_warning(u'unable to extract description')
- video_description = None
- else:
- video_description = mobj.group(1) or mobj.group(2)
+ video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
+ webpage, u'description', fatal=False)
- mobj = re.search(r'<meta content=\'(.+?)\' property=\'og:image\'', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract thumbnail')
- thumbnail = mobj.group(1)
+ thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
+ webpage, u'thumbnail', fatal=False)
return [{
'id': video_id,
@@ -4363,7 +4269,6 @@ class VineIE(InfoExtractor):
_VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
def _real_extract(self, url):
-
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
@@ -4372,25 +4277,17 @@ class VineIE(InfoExtractor):
self.report_extraction(video_id)
- mobj = re.search(r'<meta property="twitter:player:stream" content="(.+?)"', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract video URL')
- video_url = mobj.group(1)
+ video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
+ webpage, u'video URL')
- mobj = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract title')
- video_title = mobj.group(1)
+ video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
+ webpage, u'title')
- mobj = re.search(r'<meta property="og:image" content="(.+?)(\?.*?)?"', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract thumbnail')
- thumbnail = mobj.group(1)
+ thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
+ webpage, u'thumbnail', fatal=False)
- mobj = re.search(r'<div class="user">.*?<h2>(.+?)</h2>', webpage, re.DOTALL)
- if mobj is None:
- raise ExtractorError(u'Unable to extract uploader')
- uploader = mobj.group(1)
+ uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
+ webpage, u'uploader', fatal=False, flags=re.DOTALL)
return [{
'id': video_id,
@@ -4413,18 +4310,13 @@ class FlickrIE(InfoExtractor):
webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
webpage = self._download_webpage(webpage_url, video_id)
- mobj = re.search(r"photo_secret: '(\w+)'", webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract video secret')
- secret = mobj.group(1)
+ secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
- mobj = re.search(r'<Item id="id">(\d+-\d+)</Item>', first_xml)
- if mobj is None:
- raise ExtractorError(u'Unable to extract node_id')
- node_id = mobj.group(1)
+ node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
+ first_xml, u'node_id')
second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
@@ -4436,22 +4328,14 @@ class FlickrIE(InfoExtractor):
raise ExtractorError(u'Unable to extract video url')
video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
- mobj = re.search(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract title')
- video_title = mobj.group(1) or mobj.group(2)
+ video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
+ webpage, u'video title')
- mobj = re.search(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
- if mobj is None:
- self._downloader.report_warning(u'unable to extract description')
- video_description = None
- else:
- video_description = mobj.group(1) or mobj.group(2)
+ video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
+ webpage, u'description', fatal=False)
- mobj = re.search(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract thumbnail')
- thumbnail = mobj.group(1) or mobj.group(2)
+ thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
+ webpage, u'thumbnail', fatal=False)
return [{
'id': video_id,
@@ -4473,32 +4357,25 @@ class TeamcocoIE(InfoExtractor):
url_title = mobj.group('url_title')
webpage = self._download_webpage(url, url_title)
- mobj = re.search(r'<article class="video" data-id="(\d+?)"', webpage)
- video_id = mobj.group(1)
+ video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
+ webpage, u'video id')
self.report_extraction(video_id)
- mobj = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract title')
- video_title = mobj.group(1)
+ video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
+ webpage, u'title')
- mobj = re.search(r'<meta property="og:image" content="(.+?)"', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract thumbnail')
- thumbnail = mobj.group(1)
+ thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
+ webpage, u'thumbnail', fatal=False)
- mobj = re.search(r'<meta property="og:description" content="(.*?)"', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract description')
- description = mobj.group(1)
+ video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
+ webpage, u'description', fatal=False)
data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
- mobj = re.search(r'<file type="high".*?>(.*?)</file>', data)
- if mobj is None:
- raise ExtractorError(u'Unable to extract video url')
- video_url = mobj.group(1)
+
+ video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
+ data, u'video URL')
return [{
'id': video_id,
@@ -4506,9 +4383,9 @@ class TeamcocoIE(InfoExtractor):
'ext': 'mp4',
'title': video_title,
'thumbnail': thumbnail,
- 'description': description,
+ 'description': video_description,
}]
-
+
class XHamsterIE(InfoExtractor):
"""Information Extractor for xHamster"""
_VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
@@ -4517,8 +4394,9 @@ class XHamsterIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
- mrss_url='http://xhamster.com/movies/%s/.html' % video_id
+ mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
webpage = self._download_webpage(mrss_url, video_id)
+
mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
if mobj is None:
raise ExtractorError(u'Unable to extract media URL')
@@ -4528,39 +4406,33 @@ class XHamsterIE(InfoExtractor):
video_url = mobj.group('server')+'/key='+mobj.group('file')
video_extension = video_url.split('.')[-1]
- mobj = re.search(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract title')
- video_title = unescapeHTML(mobj.group('title'))
+ video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
+ webpage, u'title')
- mobj = re.search(r'<span>Description: </span>(?P<description>[^<]+)', webpage)
- if mobj is None:
- video_description = u''
- else:
- video_description = unescapeHTML(mobj.group('description'))
+ # Can't see the description anywhere in the UI
+ # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
+ # webpage, u'description', fatal=False)
+ # if video_description: video_description = unescapeHTML(video_description)
mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract upload date')
- video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
-
- mobj = re.search(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)', webpage)
- if mobj is None:
- video_uploader_id = u'anonymous'
+ if mobj:
+ video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
else:
- video_uploader_id = mobj.group('uploader_id')
+ video_upload_date = None
+ self._downloader.report_warning(u'Unable to extract upload date')
- mobj = re.search(r'\'image\':\'(?P<thumbnail>[^\']+)\'', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract thumbnail URL')
- video_thumbnail = mobj.group('thumbnail')
+ video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
+ webpage, u'uploader id', default=u'anonymous')
+
+ video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
+ webpage, u'thumbnail', fatal=False)
return [{
'id': video_id,
'url': video_url,
'ext': video_extension,
'title': video_title,
- 'description': video_description,
+ # 'description': video_description,
'upload_date': video_upload_date,
'uploader_id': video_uploader_id,
'thumbnail': video_thumbnail
@@ -4584,10 +4456,9 @@ class HypemIE(InfoExtractor):
cookie = urlh.headers.get('Set-Cookie', '')
self.report_extraction(track_id)
- mobj = re.search(r'<script type="application/json" id="displayList-data">(.*?)</script>', response, flags=re.MULTILINE|re.DOTALL)
- if mobj is None:
- raise ExtractorError(u'Unable to extrack tracks')
- html_tracks = mobj.group(1).strip()
+
+ html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
+ response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
try:
track_list = json.loads(html_tracks)
track = track_list[u'tracks'][0]
@@ -4628,11 +4499,12 @@ class Vbox7IE(InfoExtractor):
video_id = mobj.group(1)
redirect_page, urlh = self._download_webpage_handle(url, video_id)
- redirect_url = urlh.geturl() + re.search(r'window\.location = \'(.*)\';', redirect_page).group(1)
+ new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
+ redirect_url = urlh.geturl() + new_location
webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
- title = re.search(r'<title>(.*)</title>', webpage)
- title = (title.group(1)).split('/')[0].strip()
+ title = self._html_search_regex(r'<title>(.*)</title>',
+ webpage, u'title').split('/')[0].strip()
ext = "flv"
info_url = "http://vbox7.com/play/magare.do"
@@ -4652,6 +4524,88 @@ class Vbox7IE(InfoExtractor):
'thumbnail': thumbnail_url,
}]
+class GametrailersIE(InfoExtractor):
+ _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ if mobj is None:
+ raise ExtractorError(u'Invalid URL: %s' % url)
+ video_id = mobj.group('id')
+ video_type = mobj.group('type')
+ webpage = self._download_webpage(url, video_id)
+ if video_type == 'full-episodes':
+ mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
+ else:
+ mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
+ mgid = self._search_regex(mgid_re, webpage, u'mgid')
+ data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})
+
+ info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
+ video_id, u'Downloading video info')
+ links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
+ video_id, u'Downloading video urls info')
+
+ self.report_extraction(video_id)
+ info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
+ <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
+ <image>.*
+ <url>(?P<thumb>.*?)</url>.*
+ </image>'''
+
+ m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
+ if m_info is None:
+ raise ExtractorError(u'Unable to extract video info')
+ video_title = m_info.group('title')
+ video_description = m_info.group('description')
+ video_thumb = m_info.group('thumb')
+
+ m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
+ if m_urls is None or len(m_urls) == 0:
+ raise ExtractError(u'Unable to extrat video url')
+ # They are sorted from worst to best quality
+ video_url = m_urls[-1].group('url')
+
+ return {'url': video_url,
+ 'id': video_id,
+ 'title': video_title,
+ # Videos are actually flv not mp4
+ 'ext': 'flv',
+ 'thumbnail': video_thumb,
+ 'description': video_description,
+ }
+
+class StatigramIE(InfoExtractor):
+ _VALID_URL = r'(?:http://)?(?:www\.)?statigr\.am/p/([^/]+)'
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+
+ video_id = mobj.group(1)
+ webpage = self._download_webpage(url, video_id)
+ video_url = self._html_search_regex(
+ r'<meta property="og:video:secure_url" content="(.+?)">',
+ webpage, u'video URL')
+ thumbnail_url = self._html_search_regex(
+ r'<meta property="og:image" content="(.+?)" />',
+ webpage, u'thumbnail URL', fatal=False)
+ html_title = self._html_search_regex(
+ r'<title>(.+?)</title>',
+ webpage, u'title')
+ title = html_title.rpartition(u' | Statigram')[0]
+ uploader = self._html_search_regex(
+ r'@(.+) \(Videos\)', title, u'uploader name', fatal=False)
+ ext = 'mp4'
+
+ return [{
+ 'id': video_id,
+ 'url': video_url,
+ 'ext': ext,
+ 'title': title,
+ 'thumbnail': thumbnail_url,
+ 'uploader' : uploader
+ }]
+
def gen_extractors():
""" Return a list of an instance of every supported extractor.
The order does matter; the first extractor matched is the one handling the URL.
@@ -4717,6 +4671,8 @@ def gen_extractors():
XHamsterIE(),
HypemIE(),
Vbox7IE(),
+ GametrailersIE(),
+ StatigramIE(),
GenericIE()
]
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py
index 18be9f156..6f9ffba1e 100644
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -200,7 +200,7 @@ def parseOpts(overrideArguments=None):
action='store_true', dest='listsubtitles',
help='lists all available subtitles for the video (currently youtube only)', default=False)
video_format.add_option('--sub-format',
- action='store', dest='subtitlesformat', metavar='LANG',
+ action='store', dest='subtitlesformat', metavar='FORMAT',
help='subtitle format [srt/sbv] (default=srt) (currently youtube only)', default='srt')
video_format.add_option('--sub-lang', '--srt-lang',
action='store', dest='subtitleslang', metavar='LANG',
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 63d9d0ae5..66ae41e31 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -12,7 +12,7 @@ import sys
import traceback
import zlib
import email.utils
-import json
+import socket
import datetime
try:
@@ -154,6 +154,9 @@ def compat_ord(c):
if type(c) is int: return c
else: return ord(c)
+# This is not clearly defined otherwise
+compiled_regex_type = type(re.compile(''))
+
std_headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
@@ -469,7 +472,11 @@ class ExtractorError(Exception):
"""Error during info extraction."""
def __init__(self, msg, tb=None):
""" tb, if given, is the original traceback (so that it can be printed out). """
+
+ if not sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
+ msg = msg + u'; please report this issue on GitHub.'
super(ExtractorError, self).__init__(msg)
+
self.traceback = tb
self.exc_info = sys.exc_info() # preserve original exception
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index 1cda7fa74..7c6757efe 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,2 +1,2 @@
-__version__ = '2013.05.23'
+__version__ = '2013.06.21'