aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore2
-rw-r--r--.travis.yml1
-rw-r--r--AUTHORS106
-rw-r--r--CONTRIBUTING.md136
-rw-r--r--Makefile23
-rw-r--r--README.md227
-rwxr-xr-xdevscripts/bash-completion.py7
-rw-r--r--devscripts/buildserver.py5
-rw-r--r--devscripts/check-porn.py1
-rwxr-xr-xdevscripts/fish-completion.py2
-rwxr-xr-xdevscripts/gh-pages/add-version.py1
-rwxr-xr-xdevscripts/gh-pages/generate-download.py5
-rwxr-xr-xdevscripts/gh-pages/sign-versions.py20
-rwxr-xr-xdevscripts/gh-pages/update-copyright.py6
-rwxr-xr-xdevscripts/gh-pages/update-feed.py2
-rwxr-xr-xdevscripts/gh-pages/update-sites.py6
-rwxr-xr-xdevscripts/make_contributing.py32
-rwxr-xr-xdevscripts/make_readme.py2
-rw-r--r--devscripts/make_supportedsites.py45
-rw-r--r--devscripts/prepare_manpage.py16
-rw-r--r--devscripts/transition_helper.py40
-rw-r--r--devscripts/transition_helper_exe/setup.py12
-rw-r--r--devscripts/transition_helper_exe/youtube-dl.py102
-rwxr-xr-xdevscripts/zsh-completion.py2
-rw-r--r--docs/conf.py4
-rw-r--r--docs/supportedsites.md500
-rw-r--r--setup.cfg4
-rw-r--r--setup.py5
-rw-r--r--test/helper.py65
-rw-r--r--test/swftests/ConstArrayAccess.as18
-rw-r--r--test/swftests/ConstantInt.as12
-rw-r--r--test/swftests/DictCall.as10
-rw-r--r--test/swftests/EqualsOperator.as10
-rw-r--r--test/swftests/MemberAssignment.as22
-rw-r--r--test/swftests/NeOperator.as24
-rw-r--r--test/swftests/PrivateVoidCall.as22
-rw-r--r--test/swftests/StringBasics.as11
-rw-r--r--test/swftests/StringCharCodeAt.as11
-rw-r--r--test/swftests/StringConversion.as11
-rw-r--r--test/test_InfoExtractor.py18
-rw-r--r--test/test_YoutubeDL.py91
-rw-r--r--test/test_age_restriction.py8
-rw-r--r--test/test_all_urls.py21
-rw-r--r--test/test_compat.py46
-rw-r--r--test/test_download.py39
-rw-r--r--test/test_execution.py7
-rw-r--r--test/test_subtitles.py78
-rw-r--r--test/test_swfinterp.py5
-rw-r--r--test/test_unicode_literals.py25
-rw-r--r--test/test_utils.py140
-rw-r--r--test/test_write_annotations.py27
-rw-r--r--test/test_write_info_json.py75
-rw-r--r--test/test_youtube_lists.py7
-rw-r--r--test/test_youtube_signature.py2
-rwxr-xr-xyoutube_dl/YoutubeDL.py601
-rw-r--r--youtube_dl/__init__.py288
-rwxr-xr-xyoutube_dl/__main__.py1
-rw-r--r--youtube_dl/aes.py141
-rw-r--r--youtube_dl/cache.py9
-rw-r--r--youtube_dl/compat.py386
-rw-r--r--youtube_dl/downloader/__init__.py47
-rw-r--r--youtube_dl/downloader/common.py114
-rw-r--r--youtube_dl/downloader/external.py117
-rw-r--r--youtube_dl/downloader/f4m.py82
-rw-r--r--youtube_dl/downloader/hls.py23
-rw-r--r--youtube_dl/downloader/http.py58
-rw-r--r--youtube_dl/downloader/mplayer.py19
-rw-r--r--youtube_dl/downloader/rtmp.py36
-rw-r--r--youtube_dl/extractor/__init__.py125
-rw-r--r--youtube_dl/extractor/abc.py10
-rw-r--r--youtube_dl/extractor/abc7news.py68
-rw-r--r--youtube_dl/extractor/academicearth.py8
-rw-r--r--youtube_dl/extractor/addanime.py14
-rw-r--r--youtube_dl/extractor/adobetv.py70
-rw-r--r--youtube_dl/extractor/adultswim.py202
-rw-r--r--youtube_dl/extractor/aljazeera.py35
-rw-r--r--youtube_dl/extractor/allocine.py12
-rw-r--r--youtube_dl/extractor/alphaporno.py77
-rw-r--r--youtube_dl/extractor/aol.py48
-rw-r--r--youtube_dl/extractor/aparat.py14
-rw-r--r--youtube_dl/extractor/appletrailers.py12
-rw-r--r--youtube_dl/extractor/archiveorg.py52
-rw-r--r--youtube_dl/extractor/ard.py8
-rw-r--r--youtube_dl/extractor/arte.py126
-rw-r--r--youtube_dl/extractor/atresplayer.py163
-rw-r--r--youtube_dl/extractor/atttechchannel.py55
-rw-r--r--youtube_dl/extractor/audiomack.py139
-rw-r--r--youtube_dl/extractor/auengine.py55
-rw-r--r--youtube_dl/extractor/azubu.py93
-rw-r--r--youtube_dl/extractor/bambuser.py11
-rw-r--r--youtube_dl/extractor/bandcamp.py28
-rw-r--r--youtube_dl/extractor/bbccouk.py178
-rw-r--r--youtube_dl/extractor/beeg.py2
-rw-r--r--youtube_dl/extractor/behindkink.py29
-rw-r--r--youtube_dl/extractor/bet.py107
-rw-r--r--youtube_dl/extractor/bild.py39
-rw-r--r--youtube_dl/extractor/bilibili.py72
-rw-r--r--youtube_dl/extractor/bliptv.py85
-rw-r--r--youtube_dl/extractor/bpb.py37
-rw-r--r--youtube_dl/extractor/br.py2
-rw-r--r--youtube_dl/extractor/breakcom.py1
-rw-r--r--youtube_dl/extractor/brightcove.py43
-rw-r--r--youtube_dl/extractor/buzzfeed.py74
-rw-r--r--youtube_dl/extractor/byutv.py8
-rw-r--r--youtube_dl/extractor/canalplus.py89
-rw-r--r--youtube_dl/extractor/cbs.py2
-rw-r--r--youtube_dl/extractor/cbsnews.py2
-rw-r--r--youtube_dl/extractor/ceskatelevize.py139
-rw-r--r--youtube_dl/extractor/channel9.py82
-rw-r--r--youtube_dl/extractor/cinchcast.py52
-rw-r--r--youtube_dl/extractor/cinemassacre.py100
-rw-r--r--youtube_dl/extractor/clipfish.py4
-rw-r--r--youtube_dl/extractor/cliphunter.py65
-rw-r--r--youtube_dl/extractor/clipsyndicate.py1
-rw-r--r--youtube_dl/extractor/cloudy.py10
-rw-r--r--youtube_dl/extractor/cnet.py35
-rw-r--r--youtube_dl/extractor/cnn.py50
-rw-r--r--youtube_dl/extractor/collegehumor.py79
-rw-r--r--youtube_dl/extractor/collegerama.py92
-rw-r--r--youtube_dl/extractor/comcarcoff.py57
-rw-r--r--youtube_dl/extractor/comedycentral.py31
-rw-r--r--youtube_dl/extractor/common.py237
-rw-r--r--youtube_dl/extractor/commonmistakes.py29
-rw-r--r--youtube_dl/extractor/condenast.py6
-rw-r--r--youtube_dl/extractor/cracked.py4
-rw-r--r--youtube_dl/extractor/crunchyroll.py120
-rw-r--r--youtube_dl/extractor/cspan.py1
-rw-r--r--youtube_dl/extractor/d8.py25
-rw-r--r--youtube_dl/extractor/dailymotion.py20
-rw-r--r--youtube_dl/extractor/daum.py4
-rw-r--r--youtube_dl/extractor/dbtv.py3
-rw-r--r--youtube_dl/extractor/defense.py10
-rw-r--r--youtube_dl/extractor/discovery.py52
-rw-r--r--youtube_dl/extractor/dotsub.py2
-rw-r--r--youtube_dl/extractor/drbonanza.py131
-rw-r--r--youtube_dl/extractor/dropbox.py27
-rw-r--r--youtube_dl/extractor/drtv.py9
-rw-r--r--youtube_dl/extractor/dvtv.py125
-rw-r--r--youtube_dl/extractor/ebaumsworld.py5
-rw-r--r--youtube_dl/extractor/echomsk.py46
-rw-r--r--youtube_dl/extractor/ehow.py11
-rw-r--r--youtube_dl/extractor/eighttracks.py31
-rw-r--r--youtube_dl/extractor/ellentv.py35
-rw-r--r--youtube_dl/extractor/elpais.py6
-rw-r--r--youtube_dl/extractor/engadget.py8
-rw-r--r--youtube_dl/extractor/eporner.py6
-rw-r--r--youtube_dl/extractor/eroprofile.py45
-rw-r--r--youtube_dl/extractor/escapist.py5
-rw-r--r--youtube_dl/extractor/everyonesmixtape.py4
-rw-r--r--youtube_dl/extractor/extremetube.py8
-rw-r--r--youtube_dl/extractor/facebook.py26
-rw-r--r--youtube_dl/extractor/faz.py39
-rw-r--r--youtube_dl/extractor/fc2.py62
-rw-r--r--youtube_dl/extractor/firedrive.py11
-rw-r--r--youtube_dl/extractor/firsttv.py6
-rw-r--r--youtube_dl/extractor/fivemin.py17
-rw-r--r--youtube_dl/extractor/fktv.py64
-rw-r--r--youtube_dl/extractor/flickr.py6
-rw-r--r--youtube_dl/extractor/folketinget.py75
-rw-r--r--youtube_dl/extractor/fourtube.py96
-rw-r--r--youtube_dl/extractor/foxgay.py48
-rw-r--r--youtube_dl/extractor/foxnews.py94
-rw-r--r--youtube_dl/extractor/franceculture.py2
-rw-r--r--youtube_dl/extractor/francetv.py34
-rw-r--r--youtube_dl/extractor/freevideo.py38
-rw-r--r--youtube_dl/extractor/funnyordie.py8
-rw-r--r--youtube_dl/extractor/gamekings.py2
-rw-r--r--youtube_dl/extractor/gameone.py63
-rw-r--r--youtube_dl/extractor/gamespot.py19
-rw-r--r--youtube_dl/extractor/gamestar.py6
-rw-r--r--youtube_dl/extractor/gdcvault.py5
-rw-r--r--youtube_dl/extractor/generic.py263
-rw-r--r--youtube_dl/extractor/giantbomb.py81
-rw-r--r--youtube_dl/extractor/giga.py101
-rw-r--r--youtube_dl/extractor/glide.py40
-rw-r--r--youtube_dl/extractor/globo.py10
-rw-r--r--youtube_dl/extractor/goldenmoustache.py48
-rw-r--r--youtube_dl/extractor/golem.py4
-rw-r--r--youtube_dl/extractor/googlesearch.py2
-rw-r--r--youtube_dl/extractor/gorillavid.py43
-rw-r--r--youtube_dl/extractor/goshgay.py72
-rw-r--r--youtube_dl/extractor/grooveshark.py7
-rw-r--r--youtube_dl/extractor/groupon.py50
-rw-r--r--youtube_dl/extractor/hark.py48
-rw-r--r--youtube_dl/extractor/hearthisat.py117
-rw-r--r--youtube_dl/extractor/heise.py60
-rw-r--r--youtube_dl/extractor/hellporno.py71
-rw-r--r--youtube_dl/extractor/helsinki.py37
-rw-r--r--youtube_dl/extractor/hitbox.py166
-rw-r--r--youtube_dl/extractor/hornbunny.py2
-rw-r--r--youtube_dl/extractor/hostingbulk.py8
-rw-r--r--youtube_dl/extractor/hotnewhiphop.py22
-rw-r--r--youtube_dl/extractor/howcast.py6
-rw-r--r--youtube_dl/extractor/howstuffworks.py127
-rw-r--r--youtube_dl/extractor/huffpost.py20
-rw-r--r--youtube_dl/extractor/hypem.py11
-rw-r--r--youtube_dl/extractor/iconosquare.py10
-rw-r--r--youtube_dl/extractor/ign.py8
-rw-r--r--youtube_dl/extractor/imdb.py17
-rw-r--r--youtube_dl/extractor/infoq.py7
-rw-r--r--youtube_dl/extractor/instagram.py4
-rw-r--r--youtube_dl/extractor/internetvideoarchive.py26
-rw-r--r--youtube_dl/extractor/iprima.py6
-rw-r--r--youtube_dl/extractor/ivi.py16
-rw-r--r--youtube_dl/extractor/izlesene.py15
-rw-r--r--youtube_dl/extractor/jadorecettepub.py1
-rw-r--r--youtube_dl/extractor/jeuxvideo.py4
-rw-r--r--youtube_dl/extractor/jukebox.py6
-rw-r--r--youtube_dl/extractor/kankan.py2
-rw-r--r--youtube_dl/extractor/karaoketv.py40
-rw-r--r--youtube_dl/extractor/keek.py30
-rw-r--r--youtube_dl/extractor/keezmovies.py7
-rw-r--r--youtube_dl/extractor/khanacademy.py4
-rw-r--r--youtube_dl/extractor/kickstarter.py29
-rw-r--r--youtube_dl/extractor/kontrtube.py25
-rw-r--r--youtube_dl/extractor/krasview.py22
-rw-r--r--youtube_dl/extractor/ku6.py11
-rw-r--r--youtube_dl/extractor/laola1tv.py77
-rw-r--r--youtube_dl/extractor/lifenews.py4
-rw-r--r--youtube_dl/extractor/liveleak.py29
-rw-r--r--youtube_dl/extractor/livestream.py12
-rw-r--r--youtube_dl/extractor/lnkgo.py124
-rw-r--r--youtube_dl/extractor/lrt.py12
-rw-r--r--youtube_dl/extractor/lynda.py16
-rw-r--r--youtube_dl/extractor/m6.py4
-rw-r--r--youtube_dl/extractor/mailru.py29
-rw-r--r--youtube_dl/extractor/malemotion.py39
-rw-r--r--youtube_dl/extractor/mdr.py2
-rw-r--r--youtube_dl/extractor/metacafe.py10
-rw-r--r--youtube_dl/extractor/metacritic.py4
-rw-r--r--youtube_dl/extractor/minhateca.py72
-rw-r--r--youtube_dl/extractor/mit.py19
-rw-r--r--youtube_dl/extractor/mitele.py26
-rw-r--r--youtube_dl/extractor/mixcloud.py6
-rw-r--r--youtube_dl/extractor/mlb.py10
-rw-r--r--youtube_dl/extractor/moevideo.py9
-rw-r--r--youtube_dl/extractor/mofosex.py6
-rw-r--r--youtube_dl/extractor/mojvideo.py2
-rw-r--r--youtube_dl/extractor/moniker.py9
-rw-r--r--youtube_dl/extractor/mooshare.py18
-rw-r--r--youtube_dl/extractor/motherless.py56
-rw-r--r--youtube_dl/extractor/motorsport.py64
-rw-r--r--youtube_dl/extractor/movieclips.py4
-rw-r--r--youtube_dl/extractor/moviezine.py2
-rw-r--r--youtube_dl/extractor/movshare.py2
-rw-r--r--youtube_dl/extractor/mpora.py2
-rw-r--r--youtube_dl/extractor/mtv.py34
-rw-r--r--youtube_dl/extractor/muenchentv.py1
-rw-r--r--youtube_dl/extractor/musicplayon.py2
-rw-r--r--youtube_dl/extractor/muzu.py87
-rw-r--r--youtube_dl/extractor/myspace.py128
-rw-r--r--youtube_dl/extractor/myspass.py8
-rw-r--r--youtube_dl/extractor/myvideo.py22
-rw-r--r--youtube_dl/extractor/myvidster.py29
-rw-r--r--youtube_dl/extractor/naver.py20
-rw-r--r--youtube_dl/extractor/nba.py18
-rw-r--r--youtube_dl/extractor/nbc.py73
-rw-r--r--youtube_dl/extractor/ndr.py4
-rw-r--r--youtube_dl/extractor/ndtv.py4
-rw-r--r--youtube_dl/extractor/nerdcubed.py35
-rw-r--r--youtube_dl/extractor/netzkino.py86
-rw-r--r--youtube_dl/extractor/newgrounds.py4
-rw-r--r--youtube_dl/extractor/newstube.py2
-rw-r--r--youtube_dl/extractor/nfb.py22
-rw-r--r--youtube_dl/extractor/nfl.py4
-rw-r--r--youtube_dl/extractor/nhl.py61
-rw-r--r--youtube_dl/extractor/niconico.py47
-rw-r--r--youtube_dl/extractor/ninegag.py9
-rw-r--r--youtube_dl/extractor/noco.py70
-rw-r--r--youtube_dl/extractor/normalboots.py10
-rw-r--r--youtube_dl/extractor/nosvideo.py7
-rw-r--r--youtube_dl/extractor/novamov.py6
-rw-r--r--youtube_dl/extractor/nowvideo.py4
-rw-r--r--youtube_dl/extractor/npo.py210
-rw-r--r--youtube_dl/extractor/nrk.py166
-rw-r--r--youtube_dl/extractor/ntv.py4
-rw-r--r--youtube_dl/extractor/nuvid.py11
-rw-r--r--youtube_dl/extractor/nytimes.py2
-rw-r--r--youtube_dl/extractor/ooyala.py5
-rw-r--r--youtube_dl/extractor/openfilm.py70
-rw-r--r--youtube_dl/extractor/orf.py70
-rw-r--r--youtube_dl/extractor/pbs.py36
-rw-r--r--youtube_dl/extractor/phoenix.py31
-rw-r--r--youtube_dl/extractor/photobucket.py7
-rw-r--r--youtube_dl/extractor/played.py13
-rw-r--r--youtube_dl/extractor/playfm.py4
-rw-r--r--youtube_dl/extractor/playvid.py27
-rw-r--r--youtube_dl/extractor/podomatic.py1
-rw-r--r--youtube_dl/extractor/pornhd.py20
-rw-r--r--youtube_dl/extractor/pornhub.py31
-rw-r--r--youtube_dl/extractor/pornotube.py102
-rw-r--r--youtube_dl/extractor/promptfile.py14
-rw-r--r--youtube_dl/extractor/prosiebensat1.py64
-rw-r--r--youtube_dl/extractor/quickvid.py53
-rw-r--r--youtube_dl/extractor/radiobremen.py63
-rw-r--r--youtube_dl/extractor/radiode.py55
-rw-r--r--youtube_dl/extractor/rai.py6
-rw-r--r--youtube_dl/extractor/rbmaradio.py2
-rw-r--r--youtube_dl/extractor/redtube.py23
-rw-r--r--youtube_dl/extractor/restudy.py40
-rw-r--r--youtube_dl/extractor/ringtv.py1
-rw-r--r--youtube_dl/extractor/ro220.py46
-rw-r--r--youtube_dl/extractor/rte.py62
-rw-r--r--youtube_dl/extractor/rtlnl.py14
-rw-r--r--youtube_dl/extractor/rtlnow.py6
-rw-r--r--youtube_dl/extractor/rtp.py60
-rw-r--r--youtube_dl/extractor/rts.py52
-rw-r--r--youtube_dl/extractor/rtve.py1
-rw-r--r--youtube_dl/extractor/ruhd.py15
-rw-r--r--youtube_dl/extractor/rutube.py48
-rw-r--r--youtube_dl/extractor/rutv.py2
-rw-r--r--youtube_dl/extractor/sbs.py3
-rw-r--r--youtube_dl/extractor/scivee.py2
-rw-r--r--youtube_dl/extractor/screencast.py13
-rw-r--r--youtube_dl/extractor/screencastomatic.py49
-rw-r--r--youtube_dl/extractor/screenwavemedia.py178
-rw-r--r--youtube_dl/extractor/servingsys.py2
-rw-r--r--youtube_dl/extractor/sexu.py61
-rw-r--r--youtube_dl/extractor/sexykarma.py5
-rw-r--r--youtube_dl/extractor/shared.py40
-rw-r--r--youtube_dl/extractor/sharesix.py4
-rw-r--r--youtube_dl/extractor/sina.py4
-rw-r--r--youtube_dl/extractor/slideshare.py8
-rw-r--r--youtube_dl/extractor/slutload.py2
-rw-r--r--youtube_dl/extractor/smotri.py219
-rw-r--r--youtube_dl/extractor/sockshare.py13
-rw-r--r--youtube_dl/extractor/sohu.py112
-rw-r--r--youtube_dl/extractor/soulanime.py80
-rw-r--r--youtube_dl/extractor/soundcloud.py33
-rw-r--r--youtube_dl/extractor/space.py3
-rw-r--r--youtube_dl/extractor/spankwire.py8
-rw-r--r--youtube_dl/extractor/spiegel.py114
-rw-r--r--youtube_dl/extractor/spiegeltv.py27
-rw-r--r--youtube_dl/extractor/sport5.py2
-rw-r--r--youtube_dl/extractor/sportbox.py5
-rw-r--r--youtube_dl/extractor/sportdeutschland.py8
-rw-r--r--youtube_dl/extractor/srmediathek.py43
-rw-r--r--youtube_dl/extractor/stanfordoc.py88
-rw-r--r--youtube_dl/extractor/streamcloud.py12
-rw-r--r--youtube_dl/extractor/streamcz.py78
-rw-r--r--youtube_dl/extractor/streetvoice.py51
-rw-r--r--youtube_dl/extractor/subtitles.py17
-rw-r--r--youtube_dl/extractor/sunporno.py18
-rw-r--r--youtube_dl/extractor/swrmediathek.py4
-rw-r--r--youtube_dl/extractor/syfy.py1
-rw-r--r--youtube_dl/extractor/sztvhu.py24
-rw-r--r--youtube_dl/extractor/tagesschau.py111
-rw-r--r--youtube_dl/extractor/tapely.py7
-rw-r--r--youtube_dl/extractor/tass.py62
-rw-r--r--youtube_dl/extractor/teachertube.py6
-rw-r--r--youtube_dl/extractor/teamcoco.py35
-rw-r--r--youtube_dl/extractor/ted.py24
-rw-r--r--youtube_dl/extractor/telebruxelles.py60
-rw-r--r--youtube_dl/extractor/telecinco.py19
-rw-r--r--youtube_dl/extractor/teletask.py53
-rw-r--r--youtube_dl/extractor/tenplay.py1
-rw-r--r--youtube_dl/extractor/testtube.py72
-rw-r--r--youtube_dl/extractor/tf1.py26
-rw-r--r--youtube_dl/extractor/theplatform.py59
-rw-r--r--youtube_dl/extractor/thisav.py14
-rw-r--r--youtube_dl/extractor/tinypic.py32
-rw-r--r--youtube_dl/extractor/tlc.py7
-rw-r--r--youtube_dl/extractor/tmz.py32
-rw-r--r--youtube_dl/extractor/tnaflix.py8
-rw-r--r--youtube_dl/extractor/traileraddict.py8
-rw-r--r--youtube_dl/extractor/trilulilu.py43
-rw-r--r--youtube_dl/extractor/trutube.py38
-rw-r--r--youtube_dl/extractor/tube8.py4
-rw-r--r--youtube_dl/extractor/tudou.py27
-rw-r--r--youtube_dl/extractor/tumblr.py35
-rw-r--r--youtube_dl/extractor/tunein.py106
-rw-r--r--youtube_dl/extractor/tutv.py8
-rw-r--r--youtube_dl/extractor/tvigle.py33
-rw-r--r--youtube_dl/extractor/tvp.py155
-rw-r--r--youtube_dl/extractor/tvplay.py14
-rw-r--r--youtube_dl/extractor/twentyfourvideo.py109
-rw-r--r--youtube_dl/extractor/twitch.py429
-rw-r--r--youtube_dl/extractor/udemy.py75
-rw-r--r--youtube_dl/extractor/urort.py47
-rw-r--r--youtube_dl/extractor/ustream.py13
-rw-r--r--youtube_dl/extractor/vbox7.py18
-rw-r--r--youtube_dl/extractor/veehd.py53
-rw-r--r--youtube_dl/extractor/veoh.py4
-rw-r--r--youtube_dl/extractor/vesti.py4
-rw-r--r--youtube_dl/extractor/vevo.py6
-rw-r--r--youtube_dl/extractor/vgtv.py10
-rw-r--r--youtube_dl/extractor/vh1.py5
-rw-r--r--youtube_dl/extractor/vice.py37
-rw-r--r--youtube_dl/extractor/viddler.py108
-rw-r--r--youtube_dl/extractor/videobam.py2
-rw-r--r--youtube_dl/extractor/videodetective.py9
-rw-r--r--youtube_dl/extractor/videofyme.py60
-rw-r--r--youtube_dl/extractor/videomega.py37
-rw-r--r--youtube_dl/extractor/videopremium.py41
-rw-r--r--youtube_dl/extractor/videott.py11
-rw-r--r--youtube_dl/extractor/videoweed.py2
-rw-r--r--youtube_dl/extractor/vidzi.py32
-rw-r--r--youtube_dl/extractor/vier.py118
-rw-r--r--youtube_dl/extractor/viki.py4
-rw-r--r--youtube_dl/extractor/vimeo.py10
-rw-r--r--youtube_dl/extractor/vimple.py23
-rw-r--r--youtube_dl/extractor/vine.py54
-rw-r--r--youtube_dl/extractor/vk.py109
-rw-r--r--youtube_dl/extractor/vodlocker.py6
-rw-r--r--youtube_dl/extractor/vrt.py95
-rw-r--r--youtube_dl/extractor/vube.py4
-rw-r--r--youtube_dl/extractor/vuclip.py9
-rw-r--r--youtube_dl/extractor/washingtonpost.py11
-rw-r--r--youtube_dl/extractor/wdr.py40
-rw-r--r--youtube_dl/extractor/webofstories.py102
-rw-r--r--youtube_dl/extractor/weibo.py2
-rw-r--r--youtube_dl/extractor/wimp.py2
-rw-r--r--youtube_dl/extractor/wistia.py8
-rw-r--r--youtube_dl/extractor/worldstarhiphop.py1
-rw-r--r--youtube_dl/extractor/wrzuta.py21
-rw-r--r--youtube_dl/extractor/xbef.py10
-rw-r--r--youtube_dl/extractor/xboxclips.py26
-rw-r--r--youtube_dl/extractor/xhamster.py19
-rw-r--r--youtube_dl/extractor/xminus.py76
-rw-r--r--youtube_dl/extractor/xnxx.py15
-rw-r--r--youtube_dl/extractor/xtube.py74
-rw-r--r--youtube_dl/extractor/xvideos.py22
-rw-r--r--youtube_dl/extractor/xxxymovies.py81
-rw-r--r--youtube_dl/extractor/yahoo.py61
-rw-r--r--youtube_dl/extractor/yesjapan.py62
-rw-r--r--youtube_dl/extractor/ynet.py6
-rw-r--r--youtube_dl/extractor/youjizz.py34
-rw-r--r--youtube_dl/extractor/youku.py20
-rw-r--r--youtube_dl/extractor/youporn.py19
-rw-r--r--youtube_dl/extractor/youtube.py673
-rw-r--r--youtube_dl/extractor/zdf.py190
-rw-r--r--youtube_dl/extractor/zingmp3.py107
-rw-r--r--youtube_dl/jsinterp.py2
-rw-r--r--youtube_dl/options.py173
-rw-r--r--youtube_dl/postprocessor/__init__.py26
-rw-r--r--youtube_dl/postprocessor/atomicparsley.py5
-rw-r--r--youtube_dl/postprocessor/common.py2
-rw-r--r--youtube_dl/postprocessor/execafterdownload.py9
-rw-r--r--youtube_dl/postprocessor/ffmpeg.py195
-rw-r--r--youtube_dl/postprocessor/xattrpp.py7
-rw-r--r--youtube_dl/swfinterp.py264
-rw-r--r--youtube_dl/update.py113
-rw-r--r--youtube_dl/utils.py873
-rw-r--r--youtube_dl/version.py3
444 files changed, 15219 insertions, 5487 deletions
diff --git a/.gitignore b/.gitignore
index 86312d4e4..0422adf44 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,3 +31,5 @@ updates_key.pem
test/testdata
.tox
youtube-dl.zsh
+.idea
+.idea/*
\ No newline at end of file
diff --git a/.travis.yml b/.travis.yml
index c6cc7a994..f14014414 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -9,7 +9,6 @@ notifications:
email:
- filippo.valsorda@gmail.com
- phihag@phihag.de
- - jaime.marquinez.ferrandiz+travis@gmail.com
- yasoob.khld@gmail.com
# irc:
# channels:
diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 000000000..b8bf3cb6f
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,106 @@
+Ricardo Garcia Gonzalez
+Danny Colligan
+Benjamin Johnson
+Vasyl' Vavrychuk
+Witold Baryluk
+Paweł Paprota
+Gergely Imreh
+Rogério Brito
+Philipp Hagemeister
+Sören Schulze
+Kevin Ngo
+Ori Avtalion
+shizeeg
+Filippo Valsorda
+Christian Albrecht
+Dave Vasilevsky
+Jaime Marquínez Ferrándiz
+Jeff Crouse
+Osama Khalid
+Michael Walter
+M. Yasoob Ullah Khalid
+Julien Fraichard
+Johny Mo Swag
+Axel Noack
+Albert Kim
+Pierre Rudloff
+Huarong Huo
+Ismael Mejía
+Steffan 'Ruirize' James
+Andras Elso
+Jelle van der Waa
+Marcin Cieślak
+Anton Larionov
+Takuya Tsuchida
+Sergey M.
+Michael Orlitzky
+Chris Gahan
+Saimadhav Heblikar
+Mike Col
+Oleg Prutz
+pulpe
+Andreas Schmitz
+Michael Kaiser
+Niklas Laxström
+David Triendl
+Anthony Weems
+David Wagner
+Juan C. Olivares
+Mattias Harrysson
+phaer
+Sainyam Kapoor
+Nicolas Évrard
+Jason Normore
+Hoje Lee
+Adam Thalhammer
+Georg Jähnig
+Ralf Haring
+Koki Takahashi
+Ariset Llerena
+Adam Malcontenti-Wilson
+Tobias Bell
+Naglis Jonaitis
+Charles Chen
+Hassaan Ali
+Dobrosław Żybort
+David Fabijan
+Sebastian Haas
+Alexander Kirk
+Erik Johnson
+Keith Beckman
+Ole Ernst
+Aaron McDaniel (mcd1992)
+Magnus Kolstad
+Hari Padmanaban
+Carlos Ramos
+5moufl
+lenaten
+Dennis Scheiba
+Damon Timm
+winwon
+Xavier Beynon
+Gabriel Schubiner
+xantares
+Jan Matějka
+Mauroy Sébastien
+William Sewell
+Dao Hoang Son
+Oskar Jauch
+Matthew Rayfield
+t0mm0
+Tithen-Firion
+Zack Fernandes
+cryptonaut
+Adrian Kretz
+Mathias Rav
+Petr Kutalek
+Will Glynn
+Max Reimann
+Cédric Luthi
+Thijs Vermeir
+Joel Leclerc
+Christopher Krooss
+Ondřej Caletka
+Dinesh S
+Johan K. Jensen
+Yen Chi Hsuan
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 000000000..7917abfc6
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,136 @@
+Please include the full output of the command when run with `--verbose`. The output (including the first lines) contains important debugging information. Issues without the full output are often not reproducible and therefore do not get solved in short order, if ever.
+
+Please re-read your issue once again to avoid a couple of common mistakes (you can and should use this as a checklist):
+
+### Is the description of the issue itself sufficient?
+
+We often get issue reports that we cannot really decipher. While in most cases we eventually get the required information after asking back multiple times, this poses an unnecessary drain on our resources. Many contributors, including myself, are also not native speakers, so we may misread some parts.
+
+So please elaborate on what feature you are requesting, or what bug you want to be fixed. Make sure that it's obvious
+
+- What the problem is
+- How it could be fixed
+- What your proposed solution would look like
+
+If your report is shorter than two lines, it is almost certainly missing some of these, which makes it hard for us to respond to it. We're often too polite to close the issue outright, but the missing info makes misinterpretation likely. As a committer myself, I often get frustrated by these issues, since the only possible way for me to move forward on them is to ask for clarification over and over.
+
+For bug reports, this means that your report should contain the *complete* output of youtube-dl when called with the -v flag. The error message you get for (most) bugs even says so, but you would not believe how many of our bug reports do not contain this information.
+
+Site support requests **must contain an example URL**. An example URL is a URL you might want to download, like http://www.youtube.com/watch?v=BaW_jenozKc . There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. http://www.youtube.com/ ) is *not* an example URL.
+
+### Are you using the latest version?
+
+Before reporting any issue, type youtube-dl -U. This should report that you're up-to-date. About 20% of the reports we receive are already fixed, but people are using outdated versions. This goes for feature requests as well.
+
+### Is the issue already documented?
+
+Make sure that someone has not already opened the issue you're trying to open. Search at the top of the window or at https://github.com/rg3/youtube-dl/search?type=Issues . If there is an issue, feel free to write something along the lines of "This affects me as well, with version 2015.01.01. Here is some more information on the issue: ...". While some issues may be old, a new post into them often spurs rapid activity.
+
+### Why are existing options not enough?
+
+Before requesting a new feature, please have a quick peek at [the list of supported options](https://github.com/rg3/youtube-dl/blob/master/README.md#synopsis). Many feature requests are for features that actually exist already! Please, absolutely do show off your work in the issue report and detail how the existing similar options do *not* solve your problem.
+
+### Is there enough context in your bug report?
+
+People want to solve problems, and often think they do us a favor by breaking down their larger problems (e.g. wanting to skip already downloaded files) to a specific request (e.g. requesting us to look whether the file exists before downloading the info page). However, what often happens is that they break down the problem into two steps: one simple, and one impossible (or extremely complicated).
+
+We are then presented with a very complicated request when the original problem could be solved far easier, e.g. by recording the downloaded video IDs in a separate file. To avoid this, you must include the greater context where it is non-obvious. In particular, every feature request that does not consist of adding support for a new site should contain a use case scenario that explains in what situation the missing feature would be useful.
+
+### Does the issue involve one problem, and one problem only?
+
+Some of our users seem to think there is a limit of issues they can or should open. There is no limit of issues they can or should open. While it may seem appealing to be able to dump all your issues into one ticket, that means that someone who solves one of your issues cannot mark the issue as closed. Typically, reporting a bunch of issues leads to the ticket lingering since nobody wants to attack that behemoth, until someone mercifully splits the issue into multiple ones.
+
+In particular, every site support request issue should only pertain to services at one site (generally under a common domain, but always using the same backend technology). Do not request support for vimeo user videos, Whitehouse podcasts, and Google Plus pages in the same issue. Also, make sure that you don't post bug reports alongside feature requests. As a rule of thumb, a feature request does not include outputs of youtube-dl that are not immediately related to the feature at hand. Do not post reports of a network error alongside the request for a new video service.
+
+### Is anyone going to need the feature?
+
+Only post features that you (or an incapacitated friend you can personally talk to) require. Do not post features because they seem like a good idea. If they are really useful, they will be requested by someone who requires them.
+
+### Is your question about youtube-dl?
+
+It may sound strange, but some bug reports we receive are completely unrelated to youtube-dl and relate to a different or even the reporter's own application. Please make sure that you are actually using youtube-dl. If you are using a UI for youtube-dl, report the bug to the maintainer of the actual application providing the UI. On the other hand, if your UI for youtube-dl fails in some way you believe is related to youtube-dl, by all means, go ahead and report the bug.
+
+# DEVELOPER INSTRUCTIONS
+
+Most users do not need to build youtube-dl and can [download the builds](http://rg3.github.io/youtube-dl/download.html) or get them from their distribution.
+
+To run youtube-dl as a developer, you don't need to build anything either. Simply execute
+
+ python -m youtube_dl
+
+To run the tests, simply invoke your favorite test runner, or execute a test file directly; any of the following work:
+
+ python -m unittest discover
+ python test/test_download.py
+ nosetests
+
+If you want to create a build of youtube-dl yourself, you'll need
+
+* python
+* make
+* pandoc
+* zip
+* nosetests
+
+### Adding support for a new site
+
+If you want to add support for a new site, you can follow this quick list (assuming your service is called `yourextractor`):
+
+1. [Fork this repository](https://github.com/rg3/youtube-dl/fork)
+2. Check out the source code with `git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git`
+3. Start a new git branch with `cd youtube-dl; git checkout -b yourextractor`
+4. Start with this simple template and save it to `youtube_dl/extractor/yourextractor.py`:
+ ```python
+ # coding: utf-8
+ from __future__ import unicode_literals
+
+ from .common import InfoExtractor
+
+
+ class YourExtractorIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://yourextractor.com/watch/42',
+ 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)',
+ 'info_dict': {
+ 'id': '42',
+ 'ext': 'mp4',
+ 'title': 'Video title goes here',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ # TODO more properties, either as:
+ # * A value
+ # * MD5 checksum; start the string with md5:
+ # * A regular expression; start the string with re:
+ # * Any Python type (for example int or float)
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ # TODO more code goes here, for example ...
+ title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': self._og_search_description(webpage),
+ # TODO more properties (see youtube_dl/extractor/common.py)
+ }
+ ```
+5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py).
+6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc.
+7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38). Add tests and code for as many as you want.
+8. If you can, check the code with [pyflakes](https://pypi.python.org/pypi/pyflakes) (a good idea) and [pep8](https://pypi.python.org/pypi/pep8) (optional, ignore E501).
+9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this:
+
+ $ git add youtube_dl/extractor/__init__.py
+ $ git add youtube_dl/extractor/yourextractor.py
+ $ git commit -m '[yourextractor] Add new extractor'
+ $ git push origin yourextractor
+
+10. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it.
+
+In any case, thank you very much for your contributions!
+
diff --git a/Makefile b/Makefile
index 2da92863f..578079879 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
-all: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish
+all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish supportedsites
clean:
- rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish
+ rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish *.dump *.part *.info.json CONTRIBUTING.md.tmp
cleanall: clean
rm -f youtube-dl youtube-dl.exe
@@ -35,13 +35,22 @@ install: youtube-dl youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtu
install -d $(DESTDIR)$(SYSCONFDIR)/fish/completions
install -m 644 youtube-dl.fish $(DESTDIR)$(SYSCONFDIR)/fish/completions/youtube-dl.fish
+codetest:
+ flake8 .
+
test:
#nosetests --with-coverage --cover-package=youtube_dl --cover-html --verbose --processes 4 test
nosetests --verbose test
+ $(MAKE) codetest
+
+ot: offlinetest
+
+offlinetest: codetest
+ nosetests --verbose test --exclude test_download --exclude test_age_restriction --exclude test_subtitles --exclude test_write_annotations --exclude test_youtube_lists
tar: youtube-dl.tar.gz
-.PHONY: all clean install test tar bash-completion pypi-files zsh-completion fish-completion
+.PHONY: all clean install test tar bash-completion pypi-files zsh-completion fish-completion ot offlinetest codetest supportedsites
pypi-files: youtube-dl.bash-completion README.txt youtube-dl.1 youtube-dl.fish
@@ -54,7 +63,13 @@ youtube-dl: youtube_dl/*.py youtube_dl/*/*.py
chmod a+x youtube-dl
README.md: youtube_dl/*.py youtube_dl/*/*.py
- COLUMNS=80 python -m youtube_dl --help | python devscripts/make_readme.py
+ COLUMNS=80 python youtube_dl/__main__.py --help | python devscripts/make_readme.py
+
+CONTRIBUTING.md: README.md
+ python devscripts/make_contributing.py README.md CONTRIBUTING.md
+
+supportedsites:
+ python devscripts/make_supportedsites.py docs/supportedsites.md
README.txt: README.md
pandoc -f markdown -t plain README.md -o README.txt
diff --git a/README.md b/README.md
index 90ba928c3..36b87444e 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,15 @@
youtube-dl - download videos from youtube.com or other video platforms
-# SYNOPSIS
-**youtube-dl** [OPTIONS] URL [URL...]
+- [INSTALLATION](#installation)
+- [DESCRIPTION](#description)
+- [OPTIONS](#options)
+- [CONFIGURATION](#configuration)
+- [OUTPUT TEMPLATE](#output-template)
+- [VIDEO SELECTION](#video-selection)
+- [FAQ](#faq)
+- [DEVELOPER INSTRUCTIONS](#developer-instructions)
+- [BUGS](#bugs)
+- [COPYRIGHT](#copyright)
# INSTALLATION
@@ -30,10 +38,12 @@ Alternatively, refer to the developer instructions below for how to check out an
# DESCRIPTION
**youtube-dl** is a small command-line program to download videos from
YouTube.com and a few more sites. It requires the Python interpreter, version
-2.6, 2.7, or 3.3+, and it is not platform specific. It should work on
+2.6, 2.7, or 3.2+, and it is not platform specific. It should work on
your Unix box, on Windows or on Mac OS X. It is released to the public domain,
which means you can modify it, redistribute it or use it however you like.
+ youtube-dl [OPTIONS] URL [URL...]
+
# OPTIONS
-h, --help print this help text and exit
--version print program version and exit
@@ -50,10 +60,6 @@ which means you can modify it, redistribute it or use it however you like.
they would handle
--extractor-descriptions Output descriptions of all supported
extractors
- --proxy URL Use the specified HTTP/HTTPS proxy. Pass in
- an empty string (--proxy "") for direct
- connection
- --socket-timeout None Time to wait before giving up, in seconds
--default-search PREFIX Use this prefix for unqualified URLs. For
example "gvsearch2:" downloads two videos
from google videos for youtube-dl "large
@@ -65,10 +71,24 @@ which means you can modify it, redistribute it or use it however you like.
this is not possible instead of searching.
--ignore-config Do not read configuration files. When given
in the global configuration file /etc
- /youtube-dl.conf: do not read the user
- configuration in ~/.config/youtube-dl.conf
- (%APPDATA%/youtube-dl/config.txt on
- Windows)
+ /youtube-dl.conf: Do not read the user
+ configuration in ~/.config/youtube-
+ dl/config (%APPDATA%/youtube-dl/config.txt
+ on Windows)
+ --flat-playlist Do not extract the videos of a playlist,
+ only list them.
+
+## Network Options:
+ --proxy URL Use the specified HTTP/HTTPS proxy. Pass in
+ an empty string (--proxy "") for direct
+ connection
+ --socket-timeout SECONDS Time to wait before giving up, in seconds
+ --source-address IP Client-side IP address to bind to
+ (experimental)
+ -4, --force-ipv4 Make all connections via IPv4
+ (experimental)
+ -6, --force-ipv6 Make all connections via IPv6
+ (experimental)
## Video Selection:
--playlist-start NUMBER playlist video to start at (default is 1)
@@ -91,7 +111,8 @@ which means you can modify it, redistribute it or use it however you like.
COUNT views
--max-views COUNT Do not download any videos with more than
COUNT views
- --no-playlist download only the currently playing video
+ --no-playlist If the URL refers to a video and a
+ playlist, download only the video.
--age-limit YEARS download only videos suitable for the given
age
--download-archive FILE Download only videos not listed in the
@@ -110,12 +131,12 @@ which means you can modify it, redistribute it or use it however you like.
size. By default, the buffer size is
automatically resized from an initial value
of SIZE.
+ --playlist-reverse Download playlist videos in reverse order
## Filesystem Options:
-a, --batch-file FILE file containing URLs to download ('-' for
stdin)
--id use only video ID in file name
- -A, --auto-number number downloaded files starting from 00000
-o, --output TEMPLATE output filename template. Use %(title)s to
get the title, %(uploader)s for the
uploader name, %(uploader_id)s for the
@@ -129,17 +150,19 @@ which means you can modify it, redistribute it or use it however you like.
%(upload_date)s for the upload date
(YYYYMMDD), %(extractor)s for the provider
(youtube, metacafe, etc), %(id)s for the
- video id, %(playlist)s for the playlist the
+ video id, %(playlist_title)s,
+ %(playlist_id)s, or %(playlist)s (=title if
+ present, ID otherwise) for the playlist the
video is in, %(playlist_index)s for the
- position in the playlist and %% for a
- literal percent. %(height)s and %(width)s
- for the width and height of the video
- format. %(resolution)s for a textual
+ position in the playlist. %(height)s and
+ %(width)s for the width and height of the
+ video format. %(resolution)s for a textual
description of the resolution of the video
- format. Use - to output to stdout. Can also
- be used to download to a different
- directory, for example with -o '/my/downloa
- ds/%(uploader)s/%(title)s-%(id)s.%(ext)s' .
+ format. %% for a literal percent. Use - to
+ output to stdout. Can also be used to
+ download to a different directory, for
+ example with -o '/my/downloads/%(uploader)s
+ /%(title)s-%(id)s.%(ext)s' .
--autonumber-size NUMBER Specifies the number of digits in
%(autonumber)s when it is present in output
filename template or --auto-number option
@@ -147,6 +170,9 @@ which means you can modify it, redistribute it or use it however you like.
--restrict-filenames Restrict filenames to only ASCII
characters, and avoid "&" and spaces in
filenames
+ -A, --auto-number [deprecated; use -o
+ "%(autonumber)s-%(title)s.%(ext)s" ] number
+ downloaded files starting from 00000
-t, --title [deprecated] use title in file name
(default)
-l, --literal [deprecated] alias of --title
@@ -197,6 +223,12 @@ which means you can modify it, redistribute it or use it however you like.
-j, --dump-json simulate, quiet but print JSON information.
See --output for a description of available
keys.
+ -J, --dump-single-json simulate, quiet but print JSON information
+ for each command-line argument. If the URL
+ refers to a playlist, dump the whole
+ playlist information in a single line.
+ --print-json Be quiet and print the video information as
+ JSON (video is still being downloaded).
--newline output progress bar as new lines
--no-progress do not print progress bar
--console-title display progress in console titlebar
@@ -207,6 +239,10 @@ which means you can modify it, redistribute it or use it however you like.
files in the current directory to debug
problems
--print-traffic Display sent and read HTTP traffic
+ -C, --call-home Contact the youtube-dl server for
+ debugging.
+ --no-call-home Do NOT contact the youtube-dl server for
+ debugging.
## Workarounds:
--encoding ENCODING Force the specified encoding (experimental)
@@ -223,18 +259,38 @@ which means you can modify it, redistribute it or use it however you like.
--bidi-workaround Work around terminals that lack
bidirectional text support. Requires bidiv
or fribidi executable in PATH
+ --sleep-interval SECONDS Number of seconds to sleep before each
+ download.
## Video Format Options:
-f, --format FORMAT video format code, specify the order of
- preference using slashes: -f 22/17/18 . -f
- mp4 , -f m4a and -f flv are also
- supported. You can also use the special
- names "best", "bestvideo", "bestaudio",
- "worst", "worstvideo" and "worstaudio". By
+ preference using slashes, as in -f 22/17/18
+ . Instead of format codes, you can select
+ by extension for the extensions aac, m4a,
+ mp3, mp4, ogg, wav, webm. You can also use
+ the special names "best", "bestvideo",
+ "bestaudio", "worst". You can filter the
+ video results by putting a condition in
+ brackets, as in -f "best[height=720]" (or
+ -f "[filesize>10M]"). This works for
+ filesize, height, width, tbr, abr, and vbr
+ and the comparisons <, <=, >, >=, =, != .
+ Formats for which the value is not known
+ are excluded unless you put a question mark
+ (?) after the operator. You can combine
+ format filters, so -f "[height <=?
+ 720][tbr>500]" selects up to 720p videos
+ (or videos where the height is not known)
+ with a bitrate of at least 500 KBit/s. By
default, youtube-dl will pick the best
quality. Use commas to download multiple
- audio formats, such as -f
- 136/137/mp4/bestvideo,140/m4a/bestaudio
+ audio formats, such as -f
+ 136/137/mp4/bestvideo,140/m4a/bestaudio.
+ You can merge the video and audio of two
+ formats into a single file using -f <video-
+ format>+<audio-format> (requires ffmpeg or
+ avconv), for example -f
+ bestvideo+bestaudio.
--all-formats download all available video formats
--prefer-free-formats prefer free video formats unless a specific
one is requested
@@ -242,6 +298,10 @@ which means you can modify it, redistribute it or use it however you like.
-F, --list-formats list all available formats
--youtube-skip-dash-manifest Do not download the DASH manifest on
YouTube videos
+ --merge-output-format FORMAT If a merge is required (e.g.
+ bestvideo+bestaudio), output to given
+ container format. One of mkv, mp4, ogg,
+ webm, flv. Ignored if no merge is required
## Subtitle Options:
--write-sub write subtitle file
@@ -258,7 +318,8 @@ which means you can modify it, redistribute it or use it however you like.
## Authentication Options:
-u, --username USERNAME login with this account ID
- -p, --password PASSWORD account password
+ -p, --password PASSWORD account password. If this option is left
+ out, youtube-dl will ask interactively.
-2, --twofactor TWOFACTOR two-factor auth code
-n, --netrc use .netrc authentication data
--video-password PASSWORD video password (vimeo, smotri)
@@ -288,6 +349,11 @@ which means you can modify it, redistribute it or use it however you like.
--add-metadata write metadata to the video file
--xattrs write metadata to the video file's xattrs
(using dublin core and xdg standards)
+ --fixup POLICY (experimental) Automatically correct known
+ faults of the file. One of never (do
+ nothing), warn (only emit a warning),
+ detect_or_warn (check whether we can do
+ anything about it, warn otherwise)
--prefer-avconv Prefer avconv over ffmpeg for running the
postprocessors (default)
--prefer-ffmpeg Prefer ffmpeg over avconv for running the
@@ -299,7 +365,7 @@ which means you can modify it, redistribute it or use it however you like.
# CONFIGURATION
-You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl/config`. On Windows, the configuration file locations are `%APPDATA%\youtube-dl\config.txt` and `C:\Users\<Yourname>\youtube-dl.conf`.
+You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl/config`. On Windows, the configuration file locations are `%APPDATA%\youtube-dl\config.txt` and `C:\Users\<user name>\youtube-dl.conf`.
# OUTPUT TEMPLATE
@@ -375,7 +441,7 @@ Again, from then on you'll be able to update with `sudo youtube-dl -U`.
YouTube changed their playlist format in March 2014 and later on, so you'll need at least youtube-dl 2014.07.25 to download all YouTube videos.
-If you have installed youtube-dl with a package manager, pip, setup.py or a tarball, please use that to update. Note that Ubuntu packages do not seem to get updated anymore. Since we are not affiliated with Ubuntu, there is little we can do. Feel free to report bugs to the Ubuntu packaging guys - all they have to do is update the package to a somewhat recent version. See above for a way to update.
+If you have installed youtube-dl with a package manager, pip, setup.py or a tarball, please use that to update. Note that Ubuntu packages do not seem to get updated anymore. Since we are not affiliated with Ubuntu, there is little we can do. Feel free to [report bugs](https://bugs.launchpad.net/ubuntu/+source/youtube-dl/+filebug) to the [Ubuntu packaging guys](mailto:ubuntu-motu@lists.ubuntu.com?subject=outdated%20version%20of%20youtube-dl) - all they have to do is update the package to a somewhat recent version. See above for a way to update.
### Do I always have to pass in `--max-quality FORMAT`, or `-citw`?
@@ -393,9 +459,15 @@ Apparently YouTube requires you to pass a CAPTCHA test if you download too much.
Once the video is fully downloaded, use any video player, such as [vlc](http://www.videolan.org) or [mplayer](http://www.mplayerhq.hu/).
-### The links provided by youtube-dl -g are not working anymore
+### I extracted a video URL with -g, but it does not play on another machine / in my webbrowser.
+
+It depends a lot on the service. In many cases, requests for the video (to download/play it) must come from the same IP address and with the same cookies. Use the `--cookies` option to write the required cookies into a file, and advise your downloader to read cookies from that file. Some sites also require a common user agent to be used, use `--dump-user-agent` to see the one in use by youtube-dl.
-The URLs youtube-dl outputs require the downloader to have the correct cookies. Use the `--cookies` option to write the required cookies into a file, and advise your downloader to read cookies from that file. Some sites also require a common user agent to be used, use `--dump-user-agent` to see the one in use by youtube-dl.
+It may be beneficial to use IPv6; in some cases, the restrictions are only applied to IPv4. Some services (sometimes only for a subset of videos) do not restrict the video URL by IP address, cookie, or user-agent, but these are the exception rather than the rule.
+
+Please bear in mind that some URL protocols are **not** supported by browsers out of the box, including RTMP. If you are using -g, your own downloader must support these as well.
+
+If you want to play the video on a machine that is not running youtube-dl, you can relay the video content from the machine that runs youtube-dl. You can use `-o -` to let youtube-dl stream a video to stdout, or simply allow the player to download the files written by youtube-dl in turn.
### ERROR: no fmt_url_map or conn information found in video info
@@ -422,6 +494,26 @@ Since June 2012 (#342) youtube-dl is packed as an executable zipfile, simply unz
To run the exe you need to install first the [Microsoft Visual C++ 2008 Redistributable Package](http://www.microsoft.com/en-us/download/details.aspx?id=29).
+### On Windows, how should I set up ffmpeg and youtube-dl? Where should I put the exe files?
+
+If you put youtube-dl and ffmpeg in the same directory that you're running the command from, it will work, but that's rather cumbersome.
+
+To make a different directory work - either for ffmpeg, or for youtube-dl, or for both - simply create the directory (say, `C:\bin`, or `C:\Users\<User name>\bin`), put all the executables directly in there, and then [set your PATH environment variable](https://www.java.com/en/download/help/path.xml) to include that directory.
+
+From then on, after restarting your shell, you will be able to access both youtube-dl and ffmpeg (and youtube-dl will be able to find ffmpeg) by simply typing `youtube-dl` or `ffmpeg`, no matter what directory you're in.
+
+### How do I put downloads into a specific folder?
+
+Use the `-o` option to specify an [output template](#output-template), for example `-o "/home/user/videos/%(title)s-%(id)s.%(ext)s"`. If you want this for all of your downloads, put the option into your [configuration file](#configuration).
+
+### How can I detect whether a given URL is supported by youtube-dl?
+
+For one, have a look at the [list of supported sites](docs/supportedsites.md). Note that it can sometimes happen that the site changes its URL scheme (say, from http://example.com/video/1234567 to http://example.com/v/1234567 ) and youtube-dl reports a URL of a service in that list as unsupported. In that case, simply report a bug.
+
+It is *not* possible to detect whether a URL is supported or not. That's because youtube-dl contains a generic extractor which matches **all** URLs. You may be tempted to disable, exclude, or remove the generic extractor, but the generic extractor not only allows users to extract videos from lots of websites that embed a video from another service, but may also be used to extract video from a service that it's hosting itself. Therefore, we neither recommend nor support disabling, excluding, or removing the generic extractor.
+
+If you want to find out whether a given URL is supported, simply call youtube-dl with it. If you get no videos back, chances are the URL is either not referring to a video or unsupported. You can find out which by examining the output (if you run youtube-dl on the console) or catching an `UnsupportedError` exception if you run it from a Python program.
+
# DEVELOPER INSTRUCTIONS
Most users do not need to build youtube-dl and can [download the builds](http://rg3.github.io/youtube-dl/download.html) or get them from their distribution.
@@ -479,22 +571,23 @@ If you want to add support for a new site, you can follow this quick list (assum
def _real_extract(self, url):
video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
# TODO more code goes here, for example ...
- webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title')
return {
'id': video_id,
'title': title,
+ 'description': self._og_search_description(webpage),
# TODO more properties (see youtube_dl/extractor/common.py)
}
```
5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py).
6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc.
7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38). Add tests and code for as many as you want.
-8. If you can, check the code with [pyflakes](https://pypi.python.org/pypi/pyflakes) (a good idea) and [pep8](https://pypi.python.org/pypi/pep8) (optional, ignore E501).
-9. When the tests pass, [add](https://www.kernel.org/pub/software/scm/git/docs/git-add.html) the new files and [commit](https://www.kernel.org/pub/software/scm/git/docs/git-commit.html) them and [push](https://www.kernel.org/pub/software/scm/git/docs/git-push.html) the result, like this:
+8. If you can, check the code with [flake8](https://pypi.python.org/pypi/flake8).
+9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this:
$ git add youtube_dl/extractor/__init__.py
$ git add youtube_dl/extractor/yourextractor.py
@@ -505,15 +598,67 @@ If you want to add support for a new site, you can follow this quick list (assum
In any case, thank you very much for your contributions!
+# EMBEDDING YOUTUBE-DL
+
+youtube-dl makes the best effort to be a good command-line program, and thus should be callable from any programming language. If you encounter any problems parsing its output, feel free to [create a report](https://github.com/rg3/youtube-dl/issues/new).
+
+From a Python program, you can embed youtube-dl in a more powerful fashion, like this:
+
+```python
+import youtube_dl
+
+ydl_opts = {}
+with youtube_dl.YoutubeDL(ydl_opts) as ydl:
+ ydl.download(['http://www.youtube.com/watch?v=BaW_jenozKc'])
+```
+
+Most likely, you'll want to use various options. For a list of what can be done, have a look at [youtube_dl/YoutubeDL.py](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/YoutubeDL.py#L69). For a start, if you want to intercept youtube-dl's output, set a `logger` object.
+
+Here's a more complete example of a program that outputs only errors (and a short message after the download is finished), and downloads/converts the video to an mp3 file:
+
+```python
+import youtube_dl
+
+
+class MyLogger(object):
+ def debug(self, msg):
+ pass
+
+ def warning(self, msg):
+ pass
+
+ def error(self, msg):
+ print(msg)
+
+
+def my_hook(d):
+ if d['status'] == 'finished':
+ print('Done downloading, now converting ...')
+
+
+ydl_opts = {
+ 'format': 'bestaudio/best',
+ 'postprocessors': [{
+ 'key': 'FFmpegExtractAudio',
+ 'preferredcodec': 'mp3',
+ 'preferredquality': '192',
+ }],
+ 'logger': MyLogger(),
+ 'progress_hooks': [my_hook],
+}
+with youtube_dl.YoutubeDL(ydl_opts) as ydl:
+ ydl.download(['http://www.youtube.com/watch?v=BaW_jenozKc'])
+```
+
# BUGS
-Bugs and suggestions should be reported at: <https://github.com/rg3/youtube-dl/issues> . Unless you were prompted so or there is another pertinent reason (e.g. GitHub fails to accept the bug report), please do not send bug reports via personal email.
+Bugs and suggestions should be reported at: <https://github.com/rg3/youtube-dl/issues> . Unless you were prompted so or there is another pertinent reason (e.g. GitHub fails to accept the bug report), please do not send bug reports via personal email. For discussions, join us in the IRC channel #youtube-dl on freenode.
-Please include the full output of the command when run with `--verbose`. The output (including the first lines) contain important debugging information. Issues without the full output are often not reproducible and therefore do not get solved in short order, if ever.
+**Please include the full output of youtube-dl when run with `-v`**.
-For discussions, join us in the irc channel #youtube-dl on freenode.
+The output (including the first lines) contains important debugging information. Issues without the full output are often not reproducible and therefore do not get solved in short order, if ever.
-When you submit a request, please re-read it once to avoid a couple of mistakes (you can and should use this as a checklist):
+Please re-read your issue once again to avoid a couple of common mistakes (you can and should use this as a checklist):
### Is the description of the issue itself sufficient?
diff --git a/devscripts/bash-completion.py b/devscripts/bash-completion.py
index 49287724d..cd26cc089 100755
--- a/devscripts/bash-completion.py
+++ b/devscripts/bash-completion.py
@@ -1,4 +1,6 @@
#!/usr/bin/env python
+from __future__ import unicode_literals
+
import os
from os.path import dirname as dirn
import sys
@@ -9,16 +11,17 @@ import youtube_dl
BASH_COMPLETION_FILE = "youtube-dl.bash-completion"
BASH_COMPLETION_TEMPLATE = "devscripts/bash-completion.in"
+
def build_completion(opt_parser):
opts_flag = []
for group in opt_parser.option_groups:
for option in group.option_list:
- #for every long flag
+ # for every long flag
opts_flag.append(option.get_opt_string())
with open(BASH_COMPLETION_TEMPLATE) as f:
template = f.read()
with open(BASH_COMPLETION_FILE, "w") as f:
- #just using the special char
+ # just using the special char
filled_template = template.replace("{{flags}}", " ".join(opts_flag))
f.write(filled_template)
diff --git a/devscripts/buildserver.py b/devscripts/buildserver.py
index e0c3cc83e..7c2f49f8b 100644
--- a/devscripts/buildserver.py
+++ b/devscripts/buildserver.py
@@ -142,7 +142,7 @@ def win_service_set_status(handle, status_code):
def win_service_main(service_name, real_main, argc, argv_raw):
try:
- #args = [argv_raw[i].value for i in range(argc)]
+ # args = [argv_raw[i].value for i in range(argc)]
stop_event = threading.Event()
handler = HandlerEx(functools.partial(stop_event, win_service_handler))
h = advapi32.RegisterServiceCtrlHandlerExW(service_name, handler, None)
@@ -233,6 +233,7 @@ def rmtree(path):
#==============================================================================
+
class BuildError(Exception):
def __init__(self, output, code=500):
self.output = output
@@ -369,7 +370,7 @@ class Builder(PythonBuilder, GITBuilder, YoutubeDLBuilder, DownloadBuilder, Clea
class BuildHTTPRequestHandler(BaseHTTPRequestHandler):
- actionDict = { 'build': Builder, 'download': Builder } # They're the same, no more caching.
+ actionDict = {'build': Builder, 'download': Builder} # They're the same, no more caching.
def do_GET(self):
path = urlparse.urlparse(self.path)
diff --git a/devscripts/check-porn.py b/devscripts/check-porn.py
index 86aa37b5f..216282712 100644
--- a/devscripts/check-porn.py
+++ b/devscripts/check-porn.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python
+from __future__ import unicode_literals
"""
This script employs a VERY basic heuristic ('porn' in webpage.lower()) to check
diff --git a/devscripts/fish-completion.py b/devscripts/fish-completion.py
index f4aaf0201..c2f238798 100755
--- a/devscripts/fish-completion.py
+++ b/devscripts/fish-completion.py
@@ -23,13 +23,13 @@ EXTRA_ARGS = {
'batch-file': ['--require-parameter'],
}
+
def build_completion(opt_parser):
commands = []
for group in opt_parser.option_groups:
for option in group.option_list:
long_option = option.get_opt_string().strip('-')
- help_msg = shell_quote([option.help])
complete_cmd = ['complete', '--command', 'youtube-dl', '--long-option', long_option]
if option._short_opts:
complete_cmd += ['--short-option', option._short_opts[0].strip('-')]
diff --git a/devscripts/gh-pages/add-version.py b/devscripts/gh-pages/add-version.py
index 35865b2f3..867ea0048 100755
--- a/devscripts/gh-pages/add-version.py
+++ b/devscripts/gh-pages/add-version.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python3
+from __future__ import unicode_literals
import json
import sys
diff --git a/devscripts/gh-pages/generate-download.py b/devscripts/gh-pages/generate-download.py
index 55912e12c..392e3ba21 100755
--- a/devscripts/gh-pages/generate-download.py
+++ b/devscripts/gh-pages/generate-download.py
@@ -1,8 +1,7 @@
#!/usr/bin/env python3
+from __future__ import unicode_literals
+
import hashlib
-import shutil
-import subprocess
-import tempfile
import urllib.request
import json
diff --git a/devscripts/gh-pages/sign-versions.py b/devscripts/gh-pages/sign-versions.py
index 8a824df56..fa389c358 100755
--- a/devscripts/gh-pages/sign-versions.py
+++ b/devscripts/gh-pages/sign-versions.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python3
+from __future__ import unicode_literals, with_statement
import rsa
import json
@@ -11,22 +12,23 @@ except NameError:
versions_info = json.load(open('update/versions.json'))
if 'signature' in versions_info:
- del versions_info['signature']
+ del versions_info['signature']
print('Enter the PKCS1 private key, followed by a blank line:')
privkey = b''
while True:
- try:
- line = input()
- except EOFError:
- break
- if line == '':
- break
- privkey += line.encode('ascii') + b'\n'
+ try:
+ line = input()
+ except EOFError:
+ break
+ if line == '':
+ break
+ privkey += line.encode('ascii') + b'\n'
privkey = rsa.PrivateKey.load_pkcs1(privkey)
signature = hexlify(rsa.pkcs1.sign(json.dumps(versions_info, sort_keys=True).encode('utf-8'), privkey, 'SHA-256')).decode()
print('signature: ' + signature)
versions_info['signature'] = signature
-json.dump(versions_info, open('update/versions.json', 'w'), indent=4, sort_keys=True) \ No newline at end of file
+with open('update/versions.json', 'w') as versionsf:
+ json.dump(versions_info, versionsf, indent=4, sort_keys=True)
diff --git a/devscripts/gh-pages/update-copyright.py b/devscripts/gh-pages/update-copyright.py
index 12c2a9194..3663c8afe 100755
--- a/devscripts/gh-pages/update-copyright.py
+++ b/devscripts/gh-pages/update-copyright.py
@@ -1,11 +1,11 @@
#!/usr/bin/env python
# coding: utf-8
-from __future__ import with_statement
+from __future__ import with_statement, unicode_literals
import datetime
import glob
-import io # For Python 2 compatibilty
+import io # For Python 2 compatibility
import os
import re
@@ -13,7 +13,7 @@ year = str(datetime.datetime.now().year)
for fn in glob.glob('*.html*'):
with io.open(fn, encoding='utf-8') as f:
content = f.read()
- newc = re.sub(u'(?P<copyright>Copyright © 2006-)(?P<year>[0-9]{4})', u'Copyright © 2006-' + year, content)
+ newc = re.sub(r'(?P<copyright>Copyright © 2006-)(?P<year>[0-9]{4})', 'Copyright © 2006-' + year, content)
if content != newc:
tmpFn = fn + '.part'
with io.open(tmpFn, 'wt', encoding='utf-8') as outf:
diff --git a/devscripts/gh-pages/update-feed.py b/devscripts/gh-pages/update-feed.py
index 0ba15ae0f..e93eb60fb 100755
--- a/devscripts/gh-pages/update-feed.py
+++ b/devscripts/gh-pages/update-feed.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python3
+from __future__ import unicode_literals
import datetime
import io
@@ -73,4 +74,3 @@ atom_template = atom_template.replace('@ENTRIES@', entries_str)
with io.open('update/releases.atom', 'w', encoding='utf-8') as atom_file:
atom_file.write(atom_template)
-
diff --git a/devscripts/gh-pages/update-sites.py b/devscripts/gh-pages/update-sites.py
index 153e15c8a..d3ef5f0b5 100755
--- a/devscripts/gh-pages/update-sites.py
+++ b/devscripts/gh-pages/update-sites.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python3
+from __future__ import unicode_literals
import sys
import os
@@ -9,19 +10,20 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(
import youtube_dl
+
def main():
with open('supportedsites.html.in', 'r', encoding='utf-8') as tmplf:
template = tmplf.read()
ie_htmls = []
- for ie in sorted(youtube_dl.gen_extractors(), key=lambda i: i.IE_NAME.lower()):
+ for ie in youtube_dl.list_extractors(age_limit=None):
ie_html = '<b>{}</b>'.format(ie.IE_NAME)
ie_desc = getattr(ie, 'IE_DESC', None)
if ie_desc is False:
continue
elif ie_desc is not None:
ie_html += ': {}'.format(ie.IE_DESC)
- if ie.working() == False:
+ if not ie.working():
ie_html += ' (Currently broken)'
ie_htmls.append('<li>{}</li>'.format(ie_html))
diff --git a/devscripts/make_contributing.py b/devscripts/make_contributing.py
new file mode 100755
index 000000000..5e454a429
--- /dev/null
+++ b/devscripts/make_contributing.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python
+from __future__ import unicode_literals
+
+import io
+import optparse
+import re
+
+
+def main():
+ parser = optparse.OptionParser(usage='%prog INFILE OUTFILE')
+ options, args = parser.parse_args()
+ if len(args) != 2:
+ parser.error('Expected an input and an output filename')
+
+ infile, outfile = args
+
+ with io.open(infile, encoding='utf-8') as inf:
+ readme = inf.read()
+
+ bug_text = re.search(
+ r'(?s)#\s*BUGS\s*[^\n]*\s*(.*?)#\s*COPYRIGHT', readme).group(1)
+ dev_text = re.search(
+ r'(?s)(#\s*DEVELOPER INSTRUCTIONS.*?)#\s*EMBEDDING YOUTUBE-DL',
+ readme).group(1)
+
+ out = bug_text + dev_text
+
+ with io.open(outfile, 'w', encoding='utf-8') as outf:
+ outf.write(out)
+
+if __name__ == '__main__':
+ main()
diff --git a/devscripts/make_readme.py b/devscripts/make_readme.py
index 70fa942dd..8fbce0796 100755
--- a/devscripts/make_readme.py
+++ b/devscripts/make_readme.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
import io
import sys
import re
diff --git a/devscripts/make_supportedsites.py b/devscripts/make_supportedsites.py
new file mode 100644
index 000000000..3df4385a6
--- /dev/null
+++ b/devscripts/make_supportedsites.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+from __future__ import unicode_literals
+
+import io
+import optparse
+import os
+import sys
+
+
+# Import youtube_dl
+ROOT_DIR = os.path.join(os.path.dirname(__file__), '..')
+sys.path.append(ROOT_DIR)
+import youtube_dl
+
+
+def main():
+ parser = optparse.OptionParser(usage='%prog OUTFILE.md')
+ options, args = parser.parse_args()
+ if len(args) != 1:
+ parser.error('Expected an output filename')
+
+ outfile, = args
+
+ def gen_ies_md(ies):
+ for ie in ies:
+ ie_md = '**{0}**'.format(ie.IE_NAME)
+ ie_desc = getattr(ie, 'IE_DESC', None)
+ if ie_desc is False:
+ continue
+ if ie_desc is not None:
+ ie_md += ': {0}'.format(ie.IE_DESC)
+ if not ie.working():
+ ie_md += ' (Currently broken)'
+ yield ie_md
+
+ ies = sorted(youtube_dl.gen_extractors(), key=lambda i: i.IE_NAME.lower())
+ out = '# Supported sites\n' + ''.join(
+ ' - ' + md + '\n'
+ for md in gen_ies_md(ies))
+
+ with io.open(outfile, 'w', encoding='utf-8') as outf:
+ outf.write(out)
+
+if __name__ == '__main__':
+ main()
diff --git a/devscripts/prepare_manpage.py b/devscripts/prepare_manpage.py
index d9c857015..7ece37754 100644
--- a/devscripts/prepare_manpage.py
+++ b/devscripts/prepare_manpage.py
@@ -1,3 +1,4 @@
+from __future__ import unicode_literals
import io
import os.path
@@ -10,8 +11,19 @@ README_FILE = os.path.join(ROOT_DIR, 'README.md')
with io.open(README_FILE, encoding='utf-8') as f:
readme = f.read()
-PREFIX = '%YOUTUBE-DL(1)\n\n# NAME\n'
-readme = re.sub(r'(?s)# INSTALLATION.*?(?=# DESCRIPTION)', '', readme)
+PREFIX = '''%YOUTUBE-DL(1)
+
+# NAME
+
+youtube\-dl \- download videos from youtube.com or other video platforms
+
+# SYNOPSIS
+
+**youtube-dl** \[OPTIONS\] URL [URL...]
+
+'''
+readme = re.sub(r'(?s)^.*?(?=# DESCRIPTION)', '', readme)
+readme = re.sub(r'\s+youtube-dl \[OPTIONS\] URL \[URL\.\.\.\]', '', readme)
readme = PREFIX + readme
if sys.version_info < (3, 0):
diff --git a/devscripts/transition_helper.py b/devscripts/transition_helper.py
deleted file mode 100644
index d5ca2d4ba..000000000
--- a/devscripts/transition_helper.py
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/usr/bin/env python
-
-import sys, os
-
-try:
- import urllib.request as compat_urllib_request
-except ImportError: # Python 2
- import urllib2 as compat_urllib_request
-
-sys.stderr.write(u'Hi! We changed distribution method and now youtube-dl needs to update itself one more time.\n')
-sys.stderr.write(u'This will only happen once. Simply press enter to go on. Sorry for the trouble!\n')
-sys.stderr.write(u'The new location of the binaries is https://github.com/rg3/youtube-dl/downloads, not the git repository.\n\n')
-
-try:
- raw_input()
-except NameError: # Python 3
- input()
-
-filename = sys.argv[0]
-
-API_URL = "https://api.github.com/repos/rg3/youtube-dl/downloads"
-BIN_URL = "https://github.com/downloads/rg3/youtube-dl/youtube-dl"
-
-if not os.access(filename, os.W_OK):
- sys.exit('ERROR: no write permissions on %s' % filename)
-
-try:
- urlh = compat_urllib_request.urlopen(BIN_URL)
- newcontent = urlh.read()
- urlh.close()
-except (IOError, OSError) as err:
- sys.exit('ERROR: unable to download latest version')
-
-try:
- with open(filename, 'wb') as outf:
- outf.write(newcontent)
-except (IOError, OSError) as err:
- sys.exit('ERROR: unable to overwrite current version')
-
-sys.stderr.write(u'Done! Now you can run youtube-dl.\n')
diff --git a/devscripts/transition_helper_exe/setup.py b/devscripts/transition_helper_exe/setup.py
deleted file mode 100644
index aaf5c2983..000000000
--- a/devscripts/transition_helper_exe/setup.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from distutils.core import setup
-import py2exe
-
-py2exe_options = {
- "bundle_files": 1,
- "compressed": 1,
- "optimize": 2,
- "dist_dir": '.',
- "dll_excludes": ['w9xpopen.exe']
-}
-
-setup(console=['youtube-dl.py'], options={ "py2exe": py2exe_options }, zipfile=None) \ No newline at end of file
diff --git a/devscripts/transition_helper_exe/youtube-dl.py b/devscripts/transition_helper_exe/youtube-dl.py
deleted file mode 100644
index 6297dfd40..000000000
--- a/devscripts/transition_helper_exe/youtube-dl.py
+++ /dev/null
@@ -1,102 +0,0 @@
-#!/usr/bin/env python
-
-import sys, os
-import urllib2
-import json, hashlib
-
-def rsa_verify(message, signature, key):
- from struct import pack
- from hashlib import sha256
- from sys import version_info
- def b(x):
- if version_info[0] == 2: return x
- else: return x.encode('latin1')
- assert(type(message) == type(b('')))
- block_size = 0
- n = key[0]
- while n:
- block_size += 1
- n >>= 8
- signature = pow(int(signature, 16), key[1], key[0])
- raw_bytes = []
- while signature:
- raw_bytes.insert(0, pack("B", signature & 0xFF))
- signature >>= 8
- signature = (block_size - len(raw_bytes)) * b('\x00') + b('').join(raw_bytes)
- if signature[0:2] != b('\x00\x01'): return False
- signature = signature[2:]
- if not b('\x00') in signature: return False
- signature = signature[signature.index(b('\x00'))+1:]
- if not signature.startswith(b('\x30\x31\x30\x0D\x06\x09\x60\x86\x48\x01\x65\x03\x04\x02\x01\x05\x00\x04\x20')): return False
- signature = signature[19:]
- if signature != sha256(message).digest(): return False
- return True
-
-sys.stderr.write(u'Hi! We changed distribution method and now youtube-dl needs to update itself one more time.\n')
-sys.stderr.write(u'This will only happen once. Simply press enter to go on. Sorry for the trouble!\n')
-sys.stderr.write(u'From now on, get the binaries from http://rg3.github.com/youtube-dl/download.html, not from the git repository.\n\n')
-
-raw_input()
-
-filename = sys.argv[0]
-
-UPDATE_URL = "http://rg3.github.io/youtube-dl/update/"
-VERSION_URL = UPDATE_URL + 'LATEST_VERSION'
-JSON_URL = UPDATE_URL + 'versions.json'
-UPDATES_RSA_KEY = (0x9d60ee4d8f805312fdb15a62f87b95bd66177b91df176765d13514a0f1754bcd2057295c5b6f1d35daa6742c3ffc9a82d3e118861c207995a8031e151d863c9927e304576bc80692bc8e094896fcf11b66f3e29e04e3a71e9a11558558acea1840aec37fc396fb6b65dc81a1c4144e03bd1c011de62e3f1357b327d08426fe93, 65537)
-
-if not os.access(filename, os.W_OK):
- sys.exit('ERROR: no write permissions on %s' % filename)
-
-exe = os.path.abspath(filename)
-directory = os.path.dirname(exe)
-if not os.access(directory, os.W_OK):
- sys.exit('ERROR: no write permissions on %s' % directory)
-
-try:
- versions_info = urllib2.urlopen(JSON_URL).read().decode('utf-8')
- versions_info = json.loads(versions_info)
-except:
- sys.exit(u'ERROR: can\'t obtain versions info. Please try again later.')
-if not 'signature' in versions_info:
- sys.exit(u'ERROR: the versions file is not signed or corrupted. Aborting.')
-signature = versions_info['signature']
-del versions_info['signature']
-if not rsa_verify(json.dumps(versions_info, sort_keys=True), signature, UPDATES_RSA_KEY):
- sys.exit(u'ERROR: the versions file signature is invalid. Aborting.')
-
-version = versions_info['versions'][versions_info['latest']]
-
-try:
- urlh = urllib2.urlopen(version['exe'][0])
- newcontent = urlh.read()
- urlh.close()
-except (IOError, OSError) as err:
- sys.exit('ERROR: unable to download latest version')
-
-newcontent_hash = hashlib.sha256(newcontent).hexdigest()
-if newcontent_hash != version['exe'][1]:
- sys.exit(u'ERROR: the downloaded file hash does not match. Aborting.')
-
-try:
- with open(exe + '.new', 'wb') as outf:
- outf.write(newcontent)
-except (IOError, OSError) as err:
- sys.exit(u'ERROR: unable to write the new version')
-
-try:
- bat = os.path.join(directory, 'youtube-dl-updater.bat')
- b = open(bat, 'w')
- b.write("""
-echo Updating youtube-dl...
-ping 127.0.0.1 -n 5 -w 1000 > NUL
-move /Y "%s.new" "%s"
-del "%s"
- \n""" %(exe, exe, bat))
- b.close()
-
- os.startfile(bat)
-except (IOError, OSError) as err:
- sys.exit('ERROR: unable to overwrite current version')
-
-sys.stderr.write(u'Done! Now you can run youtube-dl.\n')
diff --git a/devscripts/zsh-completion.py b/devscripts/zsh-completion.py
index e8d71928a..f200f2c80 100755
--- a/devscripts/zsh-completion.py
+++ b/devscripts/zsh-completion.py
@@ -1,4 +1,6 @@
#!/usr/bin/env python
+from __future__ import unicode_literals
+
import os
from os.path import dirname as dirn
import sys
diff --git a/docs/conf.py b/docs/conf.py
index 4a04ad779..594ca61a6 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -44,8 +44,8 @@ copyright = u'2014, Ricardo Garcia Gonzalez'
# built documents.
#
# The short X.Y version.
-import youtube_dl
-version = youtube_dl.__version__
+from youtube_dl.version import __version__
+version = __version__
# The full version, including alpha/beta/rc tags.
release = version
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
new file mode 100644
index 000000000..dbbf4a797
--- /dev/null
+++ b/docs/supportedsites.md
@@ -0,0 +1,500 @@
+# Supported sites
+ - **1up.com**
+ - **220.ro**
+ - **24video**
+ - **3sat**
+ - **4tube**
+ - **56.com**
+ - **5min**
+ - **8tracks**
+ - **9gag**
+ - **abc.net.au**
+ - **AcademicEarth:Course**
+ - **AddAnime**
+ - **AdobeTV**
+ - **AdultSwim**
+ - **Aftonbladet**
+ - **AlJazeera**
+ - **Allocine**
+ - **anitube.se**
+ - **AnySex**
+ - **Aparat**
+ - **AppleTrailers**
+ - **archive.org**: archive.org videos
+ - **ARD**
+ - **ARD:mediathek**
+ - **arte.tv**
+ - **arte.tv:+7**
+ - **arte.tv:concert**
+ - **arte.tv:creative**
+ - **arte.tv:ddc**
+ - **arte.tv:embed**
+ - **arte.tv:future**
+ - **audiomack**
+ - **AUEngine**
+ - **Azubu**
+ - **bambuser**
+ - **bambuser:channel**
+ - **Bandcamp**
+ - **Bandcamp:album**
+ - **bbc.co.uk**: BBC iPlayer
+ - **Beeg**
+ - **BehindKink**
+ - **Bet**
+ - **Bild**: Bild.de
+ - **BiliBili**
+ - **blinkx**
+ - **blip.tv:user**
+ - **BlipTV**
+ - **Bloomberg**
+ - **Bpb**: Bundeszentrale für politische Bildung
+ - **BR**: Bayerischer Rundfunk Mediathek
+ - **Break**
+ - **Brightcove**
+ - **BuzzFeed**
+ - **BYUtv**
+ - **Canal13cl**
+ - **canalc2.tv**
+ - **Canalplus**: canalplus.fr, piwiplus.fr and d8.tv
+ - **CBS**
+ - **CBSNews**: CBS News
+ - **CeskaTelevize**
+ - **channel9**: Channel 9
+ - **Chilloutzone**
+ - **Cinchcast**
+ - **Cinemassacre**
+ - **clipfish**
+ - **cliphunter**
+ - **Clipsyndicate**
+ - **Cloudy**
+ - **Clubic**
+ - **cmt.com**
+ - **CNET**
+ - **CNN**
+ - **CNNBlogs**
+ - **CollegeHumor**
+ - **ComCarCoff**
+ - **ComedyCentral**
+ - **ComedyCentralShows**: The Daily Show / The Colbert Report
+ - **CondeNast**: Condé Nast media group: Condé Nast, GQ, Glamour, Vanity Fair, Vogue, W Magazine, WIRED
+ - **Cracked**
+ - **Criterion**
+ - **Crunchyroll**
+ - **crunchyroll:playlist**
+ - **CSpan**: C-SPAN
+ - **culturebox.francetvinfo.fr**
+ - **dailymotion**
+ - **dailymotion:playlist**
+ - **dailymotion:user**
+ - **daum.net**
+ - **DBTV**
+ - **DeezerPlaylist**
+ - **defense.gouv.fr**
+ - **Discovery**
+ - **divxstage**: DivxStage
+ - **Dotsub**
+ - **Dropbox**
+ - **DrTuber**
+ - **DRTV**
+ - **Dump**
+ - **dvtv**: http://video.aktualne.cz/
+ - **EbaumsWorld**
+ - **eHow**
+ - **Einthusan**
+ - **eitb.tv**
+ - **EllenTV**
+ - **EllenTV:clips**
+ - **ElPais**: El País
+ - **EMPFlix**
+ - **Engadget**
+ - **Eporner**
+ - **Escapist**
+ - **EveryonesMixtape**
+ - **exfm**: ex.fm
+ - **ExpoTV**
+ - **ExtremeTube**
+ - **facebook**
+ - **faz.net**
+ - **fc2**
+ - **fernsehkritik.tv**
+ - **fernsehkritik.tv:postecke**
+ - **Firedrive**
+ - **Firstpost**
+ - **firsttv**: Видеоархив - Первый канал
+ - **Flickr**
+ - **Folketinget**: Folketinget (ft.dk; Danish parliament)
+ - **Foxgay**
+ - **FoxNews**
+ - **france2.fr:generation-quoi**
+ - **FranceCulture**
+ - **FranceInter**
+ - **francetv**: France 2, 3, 4, 5 and Ô
+ - **francetvinfo.fr**
+ - **Freesound**
+ - **freespeech.org**
+ - **FreeVideo**
+ - **FunnyOrDie**
+ - **Gamekings**
+ - **GameOne**
+ - **gameone:playlist**
+ - **GameSpot**
+ - **GameStar**
+ - **Gametrailers**
+ - **GDCVault**
+ - **generic**: Generic downloader that works on some sites
+ - **GiantBomb**
+ - **Glide**: Glide mobile video messages (glide.me)
+ - **Globo**
+ - **GodTube**
+ - **GoldenMoustache**
+ - **Golem**
+ - **GorillaVid**: GorillaVid.in, daclips.in, movpod.in and fastvideo.in
+ - **Goshgay**
+ - **Grooveshark**
+ - **Groupon**
+ - **Hark**
+ - **Heise**
+ - **Helsinki**: helsinki.fi
+ - **HentaiStigma**
+ - **HornBunny**
+ - **HostingBulk**
+ - **HotNewHipHop**
+ - **Howcast**
+ - **HowStuffWorks**
+ - **HuffPost**: Huffington Post
+ - **Hypem**
+ - **Iconosquare**
+ - **ign.com**
+ - **imdb**: Internet Movie Database trailers
+ - **imdb:list**: Internet Movie Database lists
+ - **Ina**
+ - **InfoQ**
+ - **Instagram**
+ - **instagram:user**: Instagram user profile
+ - **InternetVideoArchive**
+ - **IPrima**
+ - **ivi**: ivi.ru
+ - **ivi:compilation**: ivi.ru compilations
+ - **Izlesene**
+ - **JadoreCettePub**
+ - **JeuxVideo**
+ - **Jove**
+ - **jpopsuki.tv**
+ - **Jukebox**
+ - **Kankan**
+ - **keek**
+ - **KeezMovies**
+ - **KhanAcademy**
+ - **KickStarter**
+ - **kontrtube**: KontrTube.ru - Труба зовёт
+ - **KrasView**: Красвью
+ - **Ku6**
+ - **la7.tv**
+ - **Laola1Tv**
+ - **lifenews**: LIFE | NEWS
+ - **LiveLeak**
+ - **livestream**
+ - **livestream:original**
+ - **lrt.lt**
+ - **lynda**: lynda.com videos
+ - **lynda:course**: lynda.com online courses
+ - **m6**
+ - **macgamestore**: MacGameStore trailers
+ - **mailru**: Видео@Mail.Ru
+ - **Malemotion**
+ - **MDR**
+ - **metacafe**
+ - **Metacritic**
+ - **Mgoon**
+ - **Minhateca**
+ - **MinistryGrid**
+ - **mitele.es**
+ - **mixcloud**
+ - **MLB**
+ - **MoeVideo**: LetitBit video services: moevideo.net, playreplay.net and videochart.net
+ - **Mofosex**
+ - **Mojvideo**
+ - **Moniker**: allmyvideos.net and vidspot.net
+ - **mooshare**: Mooshare.biz
+ - **Morningstar**: morningstar.com
+ - **Motherless**
+ - **Motorsport**: motorsport.com
+ - **MovieClips**
+ - **Moviezine**
+ - **movshare**: MovShare
+ - **MPORA**
+ - **MTV**
+ - **mtviggy.com**
+ - **mtvservices:embedded**
+ - **MuenchenTV**: münchen.tv
+ - **MusicPlayOn**
+ - **MusicVault**
+ - **muzu.tv**
+ - **MySpace**
+ - **MySpace:album**
+ - **MySpass**
+ - **myvideo**
+ - **MyVidster**
+ - **Naver**
+ - **NBA**
+ - **NBC**
+ - **NBCNews**
+ - **ndr**: NDR.de - Mediathek
+ - **NDTV**
+ - **NerdCubedFeed**
+ - **Newgrounds**
+ - **Newstube**
+ - **nfb**: National Film Board of Canada
+ - **nfl.com**
+ - **nhl.com**
+ - **nhl.com:videocenter**: NHL videocenter category
+ - **niconico**: ニコニコ動画
+ - **NiconicoPlaylist**
+ - **Noco**
+ - **Normalboots**
+ - **NosVideo**
+ - **novamov**: NovaMov
+ - **Nowness**
+ - **nowvideo**: NowVideo
+ - **npo.nl**
+ - **NRK**
+ - **NRKTV**
+ - **NTV**
+ - **Nuvid**
+ - **NYTimes**
+ - **ocw.mit.edu**
+ - **OktoberfestTV**
+ - **on.aol.com**
+ - **Ooyala**
+ - **orf:oe1**: Radio Österreich 1
+ - **orf:tvthek**: ORF TVthek
+ - **ORFFM4**: radio FM4
+ - **parliamentlive.tv**: UK parliament videos
+ - **Patreon**
+ - **PBS**
+ - **Phoenix**
+ - **Photobucket**
+ - **PlanetaPlay**
+ - **play.fm**
+ - **played.to**
+ - **Playvid**
+ - **plus.google**: Google Plus
+ - **pluzz.francetv.fr**
+ - **podomatic**
+ - **PornHd**
+ - **PornHub**
+ - **Pornotube**
+ - **PornoXO**
+ - **PromptFile**
+ - **prosiebensat1**: ProSiebenSat.1 Digital
+ - **Pyvideo**
+ - **QuickVid**
+ - **radio.de**
+ - **radiofrance**
+ - **Rai**
+ - **RBMARadio**
+ - **RedTube**
+ - **Restudy**
+ - **ReverbNation**
+ - **RingTV**
+ - **RottenTomatoes**
+ - **Roxwel**
+ - **RTBF**
+ - **RTLnow**
+ - **rtlxl.nl**
+ - **RTP**
+ - **RTS**: RTS.ch
+ - **rtve.es:alacarta**: RTVE a la carta
+ - **rtve.es:live**: RTVE.es live streams
+ - **RUHD**
+ - **rutube**: Rutube videos
+ - **rutube:channel**: Rutube channels
+ - **rutube:movie**: Rutube movies
+ - **rutube:person**: Rutube person videos
+ - **RUTV**: RUTV.RU
+ - **Sapo**: SAPO Vídeos
+ - **savefrom.net**
+ - **SBS**: sbs.com.au
+ - **SciVee**
+ - **screen.yahoo:search**: Yahoo screen search
+ - **Screencast**
+ - **ScreencastOMatic**
+ - **ScreenwaveMedia**
+ - **ServingSys**
+ - **Sexu**
+ - **SexyKarma**: Sexy Karma and Watch Indian Porn
+ - **Shared**
+ - **ShareSix**
+ - **Sina**
+ - **Slideshare**
+ - **Slutload**
+ - **smotri**: Smotri.com
+ - **smotri:broadcast**: Smotri.com broadcasts
+ - **smotri:community**: Smotri.com community videos
+ - **smotri:user**: Smotri.com user videos
+ - **Snotr**
+ - **Sockshare**
+ - **Sohu**
+ - **soundcloud**
+ - **soundcloud:playlist**
+ - **soundcloud:set**
+ - **soundcloud:user**
+ - **Soundgasm**
+ - **southpark.cc.com**
+ - **southpark.de**
+ - **Space**
+ - **Spankwire**
+ - **Spiegel**
+ - **Spiegel:Article**: Articles on spiegel.de
+ - **Spiegeltv**
+ - **Spike**
+ - **Sport5**
+ - **SportBox**
+ - **SportDeutschland**
+ - **SRMediathek**: Süddeutscher Rundfunk
+ - **stanfordoc**: Stanford Open ClassRoom
+ - **Steam**
+ - **streamcloud.eu**
+ - **StreamCZ**
+ - **SunPorno**
+ - **SWRMediathek**
+ - **Syfy**
+ - **SztvHu**
+ - **Tagesschau**
+ - **Tapely**
+ - **Tass**
+ - **teachertube**: teachertube.com videos
+ - **teachertube:user:collection**: teachertube.com user and collection videos
+ - **TeachingChannel**
+ - **Teamcoco**
+ - **TeamFour**
+ - **TechTalks**
+ - **techtv.mit.edu**
+ - **TED**
+ - **tegenlicht.vpro.nl**
+ - **TeleBruxelles**
+ - **telecinco.es**
+ - **TeleMB**
+ - **TenPlay**
+ - **TF1**
+ - **TheOnion**
+ - **ThePlatform**
+ - **TheSixtyOne**
+ - **ThisAV**
+ - **THVideo**
+ - **THVideoPlaylist**
+ - **tinypic**: tinypic.com videos
+ - **tlc.com**
+ - **tlc.de**
+ - **TMZ**
+ - **TNAFlix**
+ - **tou.tv**
+ - **Toypics**: Toypics user profile
+ - **ToypicsUser**: Toypics user profile
+ - **TrailerAddict** (Currently broken)
+ - **Trilulilu**
+ - **TruTube**
+ - **Tube8**
+ - **Tudou**
+ - **Tumblr**
+ - **TuneIn**
+ - **Turbo**
+ - **Tutv**
+ - **tv.dfb.de**
+ - **tvigle**: Интернет-телевидение Tvigle.ru
+ - **tvp.pl**
+ - **TVPlay**: TV3Play and related services
+ - **Twitch**
+ - **Ubu**
+ - **udemy**
+ - **udemy:course**
+ - **Unistra**
+ - **Urort**: NRK P3 Urørt
+ - **ustream**
+ - **ustream:channel**
+ - **Vbox7**
+ - **VeeHD**
+ - **Veoh**
+ - **Vesti**: Вести.Ru
+ - **Vevo**
+ - **VGTV**
+ - **vh1.com**
+ - **Vice**
+ - **Viddler**
+ - **video.google:search**: Google Video search
+ - **video.mit.edu**
+ - **VideoBam**
+ - **VideoDetective**
+ - **videofy.me**
+ - **videolectures.net**
+ - **VideoMega**
+ - **VideoPremium**
+ - **VideoTt**: video.tt - Your True Tube
+ - **videoweed**: VideoWeed
+ - **Vidme**
+ - **Vidzi**
+ - **viki**
+ - **vimeo**
+ - **vimeo:album**
+ - **vimeo:channel**
+ - **vimeo:group**
+ - **vimeo:likes**: Vimeo user likes
+ - **vimeo:review**: Review pages on vimeo
+ - **vimeo:user**
+ - **vimeo:watchlater**: Vimeo watch later list, "vimeowatchlater" keyword (requires authentication)
+ - **Vimple**: Vimple.ru
+ - **Vine**
+ - **vine:user**
+ - **vk.com**
+ - **vk.com:user-videos**: vk.com:All of a user's videos
+ - **Vodlocker**
+ - **Vporn**
+ - **VRT**
+ - **vube**: Vube.com
+ - **VuClip**
+ - **vulture.com**
+ - **Walla**
+ - **WashingtonPost**
+ - **wat.tv**
+ - **WayOfTheMaster**
+ - **WDR**
+ - **wdr:mobile**
+ - **WDRMaus**: Sendung mit der Maus
+ - **Weibo**
+ - **Wimp**
+ - **Wistia**
+ - **WorldStarHipHop**
+ - **wrzuta.pl**
+ - **XBef**
+ - **XboxClips**
+ - **XHamster**
+ - **XMinus**
+ - **XNXX**
+ - **XTube**
+ - **XTubeUser**: XTube user profile
+ - **XVideos**
+ - **Yahoo**: Yahoo screen and movies
+ - **YesJapan**
+ - **Ynet**
+ - **YouJizz**
+ - **Youku**
+ - **YouPorn**
+ - **YourUpload**
+ - **youtube**: YouTube.com
+ - **youtube:channel**: YouTube.com channels
+ - **youtube:favorites**: YouTube.com favourite videos, ":ytfav" for short (requires authentication)
+ - **youtube:history**: Youtube watch history, ":ythistory" for short (requires authentication)
+ - **youtube:playlist**: YouTube.com playlists
+ - **youtube:recommended**: YouTube.com recommended videos, ":ytrec" for short (requires authentication)
+ - **youtube:search**: YouTube.com searches
+ - **youtube:search:date**: YouTube.com searches, newest videos first
+ - **youtube:search_url**: YouTube.com search URLs
+ - **youtube:show**: YouTube.com (multi-season) shows
+ - **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)
+ - **youtube:toplist**: YouTube.com top lists, "yttoplist:{channel}:{list title}" (Example: "yttoplist:music:Top Tracks")
+ - **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword)
+ - **youtube:watch_later**: Youtube watch later list, ":ytwatchlater" for short (requires authentication)
+ - **ZDF**
+ - **ZDFChannel**
+ - **zingmp3:album**: mp3.zing.vn albums
+ - **zingmp3:song**: mp3.zing.vn songs
diff --git a/setup.cfg b/setup.cfg
index e57d130e3..13dcd8af6 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,2 +1,6 @@
[wheel]
universal = True
+
+[flake8]
+exclude = youtube_dl/extractor/__init__.py,devscripts/buildserver.py,setup.py,build
+ignore = E501
diff --git a/setup.py b/setup.py
index cf6b92b0f..4686260e0 100644
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,6 @@
from __future__ import print_function
import os.path
-import pkg_resources
import warnings
import sys
@@ -103,7 +102,9 @@ setup(
"Programming Language :: Python :: 2.6",
"Programming Language :: Python :: 2.7",
"Programming Language :: Python :: 3",
- "Programming Language :: Python :: 3.3"
+ "Programming Language :: Python :: 3.2",
+ "Programming Language :: Python :: 3.3",
+ "Programming Language :: Python :: 3.4",
],
**params
diff --git a/test/helper.py b/test/helper.py
index 62cb3ce02..c416f388c 100644
--- a/test/helper.py
+++ b/test/helper.py
@@ -57,9 +57,9 @@ class FakeYDL(YoutubeDL):
# Different instances of the downloader can't share the same dictionary
# some test set the "sublang" parameter, which would break the md5 checks.
params = get_params(override=override)
- super(FakeYDL, self).__init__(params)
+ super(FakeYDL, self).__init__(params, auto_init=False)
self.result = []
-
+
def to_screen(self, s, skip_eol=None):
print(s)
@@ -72,32 +72,24 @@ class FakeYDL(YoutubeDL):
def expect_warning(self, regex):
# Silence an expected warning matching a regex
old_report_warning = self.report_warning
+
def report_warning(self, message):
- if re.match(regex, message): return
+ if re.match(regex, message):
+ return
old_report_warning(message)
self.report_warning = types.MethodType(report_warning, self)
def gettestcases(include_onlymatching=False):
for ie in youtube_dl.extractor.gen_extractors():
- t = getattr(ie, '_TEST', None)
- if t:
- assert not hasattr(ie, '_TESTS'), \
- '%s has _TEST and _TESTS' % type(ie).__name__
- tests = [t]
- else:
- tests = getattr(ie, '_TESTS', [])
- for t in tests:
- if not include_onlymatching and t.get('only_matching', False):
- continue
- t['name'] = type(ie).__name__[:-len('IE')]
- yield t
+ for tc in ie.get_testcases(include_onlymatching):
+ yield tc
md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest()
-def expect_info_dict(self, expected_dict, got_dict):
+def expect_info_dict(self, got_dict, expected_dict):
for info_field, expected in expected_dict.items():
if isinstance(expected, compat_str) and expected.startswith('re:'):
got = got_dict.get(info_field)
@@ -114,14 +106,28 @@ def expect_info_dict(self, expected_dict, got_dict):
elif isinstance(expected, type):
got = got_dict.get(info_field)
self.assertTrue(isinstance(got, expected),
- 'Expected type %r for field %s, but got value %r of type %r' % (expected, info_field, got, type(got)))
+ 'Expected type %r for field %s, but got value %r of type %r' % (expected, info_field, got, type(got)))
else:
if isinstance(expected, compat_str) and expected.startswith('md5:'):
got = 'md5:' + md5(got_dict.get(info_field))
+ elif isinstance(expected, compat_str) and expected.startswith('mincount:'):
+ got = got_dict.get(info_field)
+ self.assertTrue(
+ isinstance(got, list),
+ 'Expected field %s to be a list, but it is of type %s' % (
+ info_field, type(got).__name__))
+ expected_num = int(expected.partition(':')[2])
+ assertGreaterEqual(
+ self, len(got), expected_num,
+ 'Expected %d items in field %s, but only got %d' % (
+ expected_num, info_field, len(got)
+ )
+ )
+ continue
else:
got = got_dict.get(info_field)
self.assertEqual(expected, got,
- 'invalid value for field %s, expected %r, got %r' % (info_field, expected, got))
+ 'invalid value for field %s, expected %r, got %r' % (info_field, expected, got))
# Check for the presence of mandatory fields
if got_dict.get('_type') != 'playlist':
@@ -133,19 +139,20 @@ def expect_info_dict(self, expected_dict, got_dict):
# Are checkable fields missing from the test case definition?
test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value))
- for key, value in got_dict.items()
- if value and key in ('title', 'description', 'uploader', 'upload_date', 'timestamp', 'uploader_id', 'location'))
+ for key, value in got_dict.items()
+ if value and key in ('title', 'description', 'uploader', 'upload_date', 'timestamp', 'uploader_id', 'location'))
missing_keys = set(test_info_dict.keys()) - set(expected_dict.keys())
if missing_keys:
def _repr(v):
if isinstance(v, compat_str):
- return "'%s'" % v.replace('\\', '\\\\').replace("'", "\\'")
+ return "'%s'" % v.replace('\\', '\\\\').replace("'", "\\'").replace('\n', '\\n')
else:
return repr(v)
info_dict_str = ''.join(
' %s: %s,\n' % (_repr(k), _repr(v))
for k, v in test_info_dict.items())
- write_string('\n"info_dict": {' + info_dict_str + '}\n', out=sys.stderr)
+ write_string(
+ '\n\'info_dict\': {\n' + info_dict_str + '}\n', out=sys.stderr)
self.assertFalse(
missing_keys,
'Missing keys in test definition: %s' % (
@@ -158,7 +165,9 @@ def assertRegexpMatches(self, text, regexp, msg=None):
else:
m = re.match(regexp, text)
if not m:
- note = 'Regexp didn\'t match: %r not found in %r' % (regexp, text)
+ note = 'Regexp didn\'t match: %r not found' % (regexp)
+ if len(text) < 1000:
+ note += ' in %r' % text
if msg is None:
msg = note
else:
@@ -171,3 +180,13 @@ def assertGreaterEqual(self, got, expected, msg=None):
if msg is None:
msg = '%r not greater than or equal to %r' % (got, expected)
self.assertTrue(got >= expected, msg)
+
+
+def expect_warnings(ydl, warnings_re):
+ real_warning = ydl.report_warning
+
+ def _report_warning(w):
+ if not any(re.search(w_re, w) for w_re in warnings_re):
+ real_warning(w)
+
+ ydl.report_warning = _report_warning
diff --git a/test/swftests/ConstArrayAccess.as b/test/swftests/ConstArrayAccess.as
new file mode 100644
index 000000000..07dc3f460
--- /dev/null
+++ b/test/swftests/ConstArrayAccess.as
@@ -0,0 +1,18 @@
+// input: []
+// output: 4
+
+package {
+public class ConstArrayAccess {
+ private static const x:int = 2;
+ private static const ar:Array = ["42", "3411"];
+
+ public static function main():int{
+ var c:ConstArrayAccess = new ConstArrayAccess();
+ return c.f();
+ }
+
+ public function f(): int {
+ return ar[1].length;
+ }
+}
+}
diff --git a/test/swftests/ConstantInt.as b/test/swftests/ConstantInt.as
new file mode 100644
index 000000000..e0bbb6166
--- /dev/null
+++ b/test/swftests/ConstantInt.as
@@ -0,0 +1,12 @@
+// input: []
+// output: 2
+
+package {
+public class ConstantInt {
+ private static const x:int = 2;
+
+ public static function main():int{
+ return x;
+ }
+}
+}
diff --git a/test/swftests/DictCall.as b/test/swftests/DictCall.as
new file mode 100644
index 000000000..c2d174cc2
--- /dev/null
+++ b/test/swftests/DictCall.as
@@ -0,0 +1,10 @@
+// input: [{"x": 1, "y": 2}]
+// output: 3
+
+package {
+public class DictCall {
+ public static function main(d:Object):int{
+ return d.x + d.y;
+ }
+}
+}
diff --git a/test/swftests/EqualsOperator.as b/test/swftests/EqualsOperator.as
new file mode 100644
index 000000000..837a69a46
--- /dev/null
+++ b/test/swftests/EqualsOperator.as
@@ -0,0 +1,10 @@
+// input: []
+// output: false
+
+package {
+public class EqualsOperator {
+ public static function main():Boolean{
+ return 1 == 2;
+ }
+}
+}
diff --git a/test/swftests/MemberAssignment.as b/test/swftests/MemberAssignment.as
new file mode 100644
index 000000000..dcba5e3ff
--- /dev/null
+++ b/test/swftests/MemberAssignment.as
@@ -0,0 +1,22 @@
+// input: [1]
+// output: 2
+
+package {
+public class MemberAssignment {
+ public var v:int;
+
+ public function g():int {
+ return this.v;
+ }
+
+ public function f(a:int):int{
+ this.v = a;
+ return this.v + this.g();
+ }
+
+ public static function main(a:int): int {
+ var v:MemberAssignment = new MemberAssignment();
+ return v.f(a);
+ }
+}
+}
diff --git a/test/swftests/NeOperator.as b/test/swftests/NeOperator.as
new file mode 100644
index 000000000..61dcbc4e9
--- /dev/null
+++ b/test/swftests/NeOperator.as
@@ -0,0 +1,24 @@
+// input: []
+// output: 123
+
+package {
+public class NeOperator {
+ public static function main(): int {
+ var res:int = 0;
+ if (1 != 2) {
+ res += 3;
+ } else {
+ res += 4;
+ }
+ if (2 != 2) {
+ res += 10;
+ } else {
+ res += 20;
+ }
+ if (9 == 9) {
+ res += 100;
+ }
+ return res;
+ }
+}
+}
diff --git a/test/swftests/PrivateVoidCall.as b/test/swftests/PrivateVoidCall.as
new file mode 100644
index 000000000..2cc016797
--- /dev/null
+++ b/test/swftests/PrivateVoidCall.as
@@ -0,0 +1,22 @@
+// input: []
+// output: 9
+
+package {
+public class PrivateVoidCall {
+ public static function main():int{
+ var f:OtherClass = new OtherClass();
+ f.func();
+ return 9;
+ }
+}
+}
+
+class OtherClass {
+ private function pf():void {
+ ;
+ }
+
+ public function func():void {
+ this.pf();
+ }
+}
diff --git a/test/swftests/StringBasics.as b/test/swftests/StringBasics.as
new file mode 100644
index 000000000..d27430b13
--- /dev/null
+++ b/test/swftests/StringBasics.as
@@ -0,0 +1,11 @@
+// input: []
+// output: 3
+
+package {
+public class StringBasics {
+ public static function main():int{
+ var s:String = "abc";
+ return s.length;
+ }
+}
+}
diff --git a/test/swftests/StringCharCodeAt.as b/test/swftests/StringCharCodeAt.as
new file mode 100644
index 000000000..c20d74d65
--- /dev/null
+++ b/test/swftests/StringCharCodeAt.as
@@ -0,0 +1,11 @@
+// input: []
+// output: 9897
+
+package {
+public class StringCharCodeAt {
+ public static function main():int{
+ var s:String = "abc";
+ return s.charCodeAt(1) * 100 + s.charCodeAt();
+ }
+}
+}
diff --git a/test/swftests/StringConversion.as b/test/swftests/StringConversion.as
new file mode 100644
index 000000000..c976f5042
--- /dev/null
+++ b/test/swftests/StringConversion.as
@@ -0,0 +1,11 @@
+// input: []
+// output: 2
+
+package {
+public class StringConversion {
+ public static function main():int{
+ var s:String = String(99);
+ return s.length;
+ }
+}
+}
diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py
index 13c18ed95..be8d12997 100644
--- a/test/test_InfoExtractor.py
+++ b/test/test_InfoExtractor.py
@@ -40,5 +40,23 @@ class TestInfoExtractor(unittest.TestCase):
self.assertEqual(ie._og_search_description(html), 'Some video\'s description ')
self.assertEqual(ie._og_search_thumbnail(html), 'http://domain.com/pic.jpg?key1=val1&key2=val2')
+ def test_html_search_meta(self):
+ ie = self.ie
+ html = '''
+ <meta name="a" content="1" />
+ <meta name='b' content='2'>
+ <meta name="c" content='3'>
+ <meta name=d content='4'>
+ <meta property="e" content='5' >
+ <meta content="6" name="f">
+ '''
+
+ self.assertEqual(ie._html_search_meta('a', html), '1')
+ self.assertEqual(ie._html_search_meta('b', html), '2')
+ self.assertEqual(ie._html_search_meta('c', html), '3')
+ self.assertEqual(ie._html_search_meta('d', html), '4')
+ self.assertEqual(ie._html_search_meta('e', html), '5')
+ self.assertEqual(ie._html_search_meta('f', html), '6')
+
if __name__ == '__main__':
unittest.main()
diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py
index ab61e1976..678b9f7d1 100644
--- a/test/test_YoutubeDL.py
+++ b/test/test_YoutubeDL.py
@@ -8,6 +8,8 @@ import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import copy
+
from test.helper import FakeYDL, assertRegexpMatches
from youtube_dl import YoutubeDL
from youtube_dl.extractor import YoutubeIE
@@ -192,6 +194,37 @@ class TestFormatSelection(unittest.TestCase):
downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['format_id'], 'vid-high')
+ def test_format_selection_audio_exts(self):
+ formats = [
+ {'format_id': 'mp3-64', 'ext': 'mp3', 'abr': 64, 'url': 'http://_', 'vcodec': 'none'},
+ {'format_id': 'ogg-64', 'ext': 'ogg', 'abr': 64, 'url': 'http://_', 'vcodec': 'none'},
+ {'format_id': 'aac-64', 'ext': 'aac', 'abr': 64, 'url': 'http://_', 'vcodec': 'none'},
+ {'format_id': 'mp3-32', 'ext': 'mp3', 'abr': 32, 'url': 'http://_', 'vcodec': 'none'},
+ {'format_id': 'aac-32', 'ext': 'aac', 'abr': 32, 'url': 'http://_', 'vcodec': 'none'},
+ ]
+
+ info_dict = _make_result(formats)
+ ydl = YDL({'format': 'best'})
+ ie = YoutubeIE(ydl)
+ ie._sort_formats(info_dict['formats'])
+ ydl.process_ie_result(copy.deepcopy(info_dict))
+ downloaded = ydl.downloaded_info_dicts[0]
+ self.assertEqual(downloaded['format_id'], 'aac-64')
+
+ ydl = YDL({'format': 'mp3'})
+ ie = YoutubeIE(ydl)
+ ie._sort_formats(info_dict['formats'])
+ ydl.process_ie_result(copy.deepcopy(info_dict))
+ downloaded = ydl.downloaded_info_dicts[0]
+ self.assertEqual(downloaded['format_id'], 'mp3-64')
+
+ ydl = YDL({'prefer_free_formats': True})
+ ie = YoutubeIE(ydl)
+ ie._sort_formats(info_dict['formats'])
+ ydl.process_ie_result(copy.deepcopy(info_dict))
+ downloaded = ydl.downloaded_info_dicts[0]
+ self.assertEqual(downloaded['format_id'], 'ogg-64')
+
def test_format_selection_video(self):
formats = [
{'format_id': 'dash-video-low', 'ext': 'mp4', 'preference': 1, 'acodec': 'none', 'url': '_'},
@@ -218,7 +251,7 @@ class TestFormatSelection(unittest.TestCase):
# 3D
'85', '84', '102', '83', '101', '82', '100',
# Dash video
- '138', '137', '248', '136', '247', '135', '246',
+ '137', '248', '136', '247', '135', '246',
'245', '244', '134', '243', '133', '242', '160',
# Dash audio
'141', '172', '140', '171', '139',
@@ -248,6 +281,61 @@ class TestFormatSelection(unittest.TestCase):
downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['format_id'], f1id)
+ def test_format_filtering(self):
+ formats = [
+ {'format_id': 'A', 'filesize': 500, 'width': 1000},
+ {'format_id': 'B', 'filesize': 1000, 'width': 500},
+ {'format_id': 'C', 'filesize': 1000, 'width': 400},
+ {'format_id': 'D', 'filesize': 2000, 'width': 600},
+ {'format_id': 'E', 'filesize': 3000},
+ {'format_id': 'F'},
+ {'format_id': 'G', 'filesize': 1000000},
+ ]
+ for f in formats:
+ f['url'] = 'http://_/'
+ f['ext'] = 'unknown'
+ info_dict = _make_result(formats)
+
+ ydl = YDL({'format': 'best[filesize<3000]'})
+ ydl.process_ie_result(info_dict)
+ downloaded = ydl.downloaded_info_dicts[0]
+ self.assertEqual(downloaded['format_id'], 'D')
+
+ ydl = YDL({'format': 'best[filesize<=3000]'})
+ ydl.process_ie_result(info_dict)
+ downloaded = ydl.downloaded_info_dicts[0]
+ self.assertEqual(downloaded['format_id'], 'E')
+
+ ydl = YDL({'format': 'best[filesize <= ? 3000]'})
+ ydl.process_ie_result(info_dict)
+ downloaded = ydl.downloaded_info_dicts[0]
+ self.assertEqual(downloaded['format_id'], 'F')
+
+ ydl = YDL({'format': 'best [filesize = 1000] [width>450]'})
+ ydl.process_ie_result(info_dict)
+ downloaded = ydl.downloaded_info_dicts[0]
+ self.assertEqual(downloaded['format_id'], 'B')
+
+ ydl = YDL({'format': 'best [filesize = 1000] [width!=450]'})
+ ydl.process_ie_result(info_dict)
+ downloaded = ydl.downloaded_info_dicts[0]
+ self.assertEqual(downloaded['format_id'], 'C')
+
+ ydl = YDL({'format': '[filesize>?1]'})
+ ydl.process_ie_result(info_dict)
+ downloaded = ydl.downloaded_info_dicts[0]
+ self.assertEqual(downloaded['format_id'], 'G')
+
+ ydl = YDL({'format': '[filesize<1M]'})
+ ydl.process_ie_result(info_dict)
+ downloaded = ydl.downloaded_info_dicts[0]
+ self.assertEqual(downloaded['format_id'], 'E')
+
+ ydl = YDL({'format': '[filesize<1MiB]'})
+ ydl.process_ie_result(info_dict)
+ downloaded = ydl.downloaded_info_dicts[0]
+ self.assertEqual(downloaded['format_id'], 'G')
+
def test_add_extra_info(self):
test_dict = {
'extractor': 'Foo',
@@ -266,6 +354,7 @@ class TestFormatSelection(unittest.TestCase):
'ext': 'mp4',
'width': None,
}
+
def fname(templ):
ydl = YoutubeDL({'outtmpl': templ})
return ydl.prepare_filename(info)
diff --git a/test/test_age_restriction.py b/test/test_age_restriction.py
index 71e80b037..6f5513faa 100644
--- a/test/test_age_restriction.py
+++ b/test/test_age_restriction.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python
+from __future__ import unicode_literals
# Allow direct execution
import os
@@ -19,7 +20,7 @@ def _download_restricted(url, filename, age):
'age_limit': age,
'skip_download': True,
'writeinfojson': True,
- "outtmpl": "%(id)s.%(ext)s",
+ 'outtmpl': '%(id)s.%(ext)s',
}
ydl = YoutubeDL(params)
ydl.add_default_info_extractors()
@@ -44,11 +45,6 @@ class TestAgeRestriction(unittest.TestCase):
'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/',
'505835.mp4', 2, old_age=25)
- def test_pornotube(self):
- self._assert_restricted(
- 'http://pornotube.com/c/173/m/1689755/Marilyn-Monroe-Bathing',
- '1689755.flv', 13)
-
if __name__ == '__main__':
unittest.main()
diff --git a/test/test_all_urls.py b/test/test_all_urls.py
index 965e5d8a5..e66264b4b 100644
--- a/test/test_all_urls.py
+++ b/test/test_all_urls.py
@@ -14,7 +14,6 @@ from test.helper import gettestcases
from youtube_dl.extractor import (
FacebookIE,
gen_extractors,
- TwitchIE,
YoutubeIE,
)
@@ -32,19 +31,19 @@ class TestAllURLsMatching(unittest.TestCase):
def test_youtube_playlist_matching(self):
assertPlaylist = lambda url: self.assertMatch(url, ['youtube:playlist'])
assertPlaylist('ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
- assertPlaylist('UUBABnxM4Ar9ten8Mdjj1j0Q') #585
+ assertPlaylist('UUBABnxM4Ar9ten8Mdjj1j0Q') # 585
assertPlaylist('PL63F0C78739B09958')
assertPlaylist('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
assertPlaylist('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
assertPlaylist('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
- assertPlaylist('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') #668
+ assertPlaylist('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') # 668
self.assertFalse('youtube:playlist' in self.matching_ies('PLtS2H6bU1M'))
# Top tracks
assertPlaylist('https://www.youtube.com/playlist?list=MCUS.20142101')
def test_youtube_matching(self):
self.assertTrue(YoutubeIE.suitable('PLtS2H6bU1M'))
- self.assertFalse(YoutubeIE.suitable('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')) #668
+ self.assertFalse(YoutubeIE.suitable('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')) # 668
self.assertMatch('http://youtu.be/BaW_jenozKc', ['youtube'])
self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube'])
self.assertMatch('https://youtube.googleapis.com/v/BaW_jenozKc', ['youtube'])
@@ -72,18 +71,6 @@ class TestAllURLsMatching(unittest.TestCase):
self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])
self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url'])
- def test_twitch_channelid_matching(self):
- self.assertTrue(TwitchIE.suitable('twitch.tv/vanillatv'))
- self.assertTrue(TwitchIE.suitable('www.twitch.tv/vanillatv'))
- self.assertTrue(TwitchIE.suitable('http://www.twitch.tv/vanillatv'))
- self.assertTrue(TwitchIE.suitable('http://www.twitch.tv/vanillatv/'))
-
- def test_twitch_videoid_matching(self):
- self.assertTrue(TwitchIE.suitable('http://www.twitch.tv/vanillatv/b/328087483'))
-
- def test_twitch_chapterid_matching(self):
- self.assertTrue(TwitchIE.suitable('http://www.twitch.tv/tsm_theoddone/c/2349361'))
-
def test_youtube_extract(self):
assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id)
assertExtractId('http://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc')
@@ -115,8 +102,6 @@ class TestAllURLsMatching(unittest.TestCase):
self.assertMatch(':ythistory', ['youtube:history'])
self.assertMatch(':thedailyshow', ['ComedyCentralShows'])
self.assertMatch(':tds', ['ComedyCentralShows'])
- self.assertMatch(':colbertreport', ['ComedyCentralShows'])
- self.assertMatch(':cr', ['ComedyCentralShows'])
def test_vimeo_matching(self):
self.assertMatch('http://vimeo.com/channels/tributes', ['vimeo:channel'])
diff --git a/test/test_compat.py b/test/test_compat.py
new file mode 100644
index 000000000..1eb454e06
--- /dev/null
+++ b/test/test_compat.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+from __future__ import unicode_literals
+
+# Allow direct execution
+import os
+import sys
+import unittest
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+
+from youtube_dl.utils import get_filesystem_encoding
+from youtube_dl.compat import (
+ compat_getenv,
+ compat_expanduser,
+)
+
+
+class TestCompat(unittest.TestCase):
+ def test_compat_getenv(self):
+ test_str = 'тест'
+ os.environ['YOUTUBE-DL-TEST'] = (
+ test_str if sys.version_info >= (3, 0)
+ else test_str.encode(get_filesystem_encoding()))
+ self.assertEqual(compat_getenv('YOUTUBE-DL-TEST'), test_str)
+
+ def test_compat_expanduser(self):
+ old_home = os.environ.get('HOME')
+ test_str = 'C:\Documents and Settings\тест\Application Data'
+ os.environ['HOME'] = (
+ test_str if sys.version_info >= (3, 0)
+ else test_str.encode(get_filesystem_encoding()))
+ self.assertEqual(compat_expanduser('~'), test_str)
+ os.environ['HOME'] = old_home
+
+ def test_all_present(self):
+ import youtube_dl.compat
+ all_names = youtube_dl.compat.__all__
+ present_names = set(filter(
+ lambda c: '_' in c and not c.startswith('_'),
+ dir(youtube_dl.compat))) - set(['unicode_literals'])
+ self.assertEqual(all_names, sorted(present_names))
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/test/test_download.py b/test/test_download.py
index 8178015ea..412f3dbce 100644
--- a/test/test_download.py
+++ b/test/test_download.py
@@ -1,5 +1,7 @@
#!/usr/bin/env python
+from __future__ import unicode_literals
+
# Allow direct execution
import os
import sys
@@ -8,6 +10,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import (
assertGreaterEqual,
+ expect_warnings,
get_params,
gettestcases,
expect_info_dict,
@@ -22,10 +25,12 @@ import json
import socket
import youtube_dl.YoutubeDL
-from youtube_dl.utils import (
+from youtube_dl.compat import (
compat_http_client,
compat_urllib_error,
compat_HTTPError,
+)
+from youtube_dl.utils import (
DownloadError,
ExtractorError,
format_bytes,
@@ -35,18 +40,22 @@ from youtube_dl.extractor import get_info_extractor
RETRIES = 3
+
class YoutubeDL(youtube_dl.YoutubeDL):
def __init__(self, *args, **kwargs):
self.to_stderr = self.to_screen
self.processed_info_dicts = []
super(YoutubeDL, self).__init__(*args, **kwargs)
+
def report_warning(self, message):
# Don't accept warnings during tests
raise ExtractorError(message)
+
def process_info(self, info_dict):
self.processed_info_dicts.append(info_dict)
return super(YoutubeDL, self).process_info(info_dict)
+
def _file_md5(fn):
with open(fn, 'rb') as f:
return hashlib.md5(f.read()).hexdigest()
@@ -56,10 +65,13 @@ defs = gettestcases()
class TestDownload(unittest.TestCase):
maxDiff = None
+
def setUp(self):
self.defs = defs
-### Dynamically generate tests
+# Dynamically generate tests
+
+
def generator(test_case):
def test_template(self):
@@ -85,7 +97,7 @@ def generator(test_case):
return
for other_ie in other_ies:
if not other_ie.working():
- print_skipping(u'test depends on %sIE, marked as not WORKING' % other_ie.ie_key())
+ print_skipping('test depends on %sIE, marked as not WORKING' % other_ie.ie_key())
return
params = get_params(test_case.get('params', {}))
@@ -93,18 +105,21 @@ def generator(test_case):
params.setdefault('extract_flat', True)
params.setdefault('skip_download', True)
- ydl = YoutubeDL(params)
+ ydl = YoutubeDL(params, auto_init=False)
ydl.add_default_info_extractors()
finished_hook_called = set()
+
def _hook(status):
if status['status'] == 'finished':
finished_hook_called.add(status['filename'])
ydl.add_progress_hook(_hook)
+ expect_warnings(ydl, test_case.get('expected_warnings', []))
def get_tc_filename(tc):
return tc.get('file') or ydl.prepare_filename(tc.get('info_dict', {}))
res_dict = None
+
def try_rm_tcs_files(tcs=None):
if tcs is None:
tcs = test_cases
@@ -128,7 +143,7 @@ def generator(test_case):
raise
if try_num == RETRIES:
- report_warning(u'Failed due to network errors, skipping...')
+ report_warning('Failed due to network errors, skipping...')
return
print('Retrying: {0} failed tries\n\n##########\n\n'.format(try_num))
@@ -140,7 +155,7 @@ def generator(test_case):
if is_playlist:
self.assertEqual(res_dict['_type'], 'playlist')
self.assertTrue('entries' in res_dict)
- expect_info_dict(self, test_case.get('info_dict', {}), res_dict)
+ expect_info_dict(self, res_dict, test_case.get('info_dict', {}))
if 'playlist_mincount' in test_case:
assertGreaterEqual(
@@ -183,11 +198,13 @@ def generator(test_case):
md5_for_file = _file_md5(tc_filename)
self.assertEqual(md5_for_file, tc['md5'])
info_json_fn = os.path.splitext(tc_filename)[0] + '.info.json'
- self.assertTrue(os.path.exists(info_json_fn))
+ self.assertTrue(
+ os.path.exists(info_json_fn),
+ 'Missing info file %s' % info_json_fn)
with io.open(info_json_fn, encoding='utf-8') as infof:
info_dict = json.load(infof)
- expect_info_dict(self, tc.get('info_dict', {}), info_dict)
+ expect_info_dict(self, info_dict, tc.get('info_dict', {}))
finally:
try_rm_tcs_files()
if is_playlist and res_dict is not None and res_dict.get('entries'):
@@ -198,15 +215,15 @@ def generator(test_case):
return test_template
-### And add them to TestDownload
+# And add them to TestDownload
for n, test_case in enumerate(defs):
test_method = generator(test_case)
tname = 'test_' + str(test_case['name'])
i = 1
while hasattr(TestDownload, tname):
- tname = 'test_' + str(test_case['name']) + '_' + str(i)
+ tname = 'test_%s_%d' % (test_case['name'], i)
i += 1
- test_method.__name__ = tname
+ test_method.__name__ = str(tname)
setattr(TestDownload, test_method.__name__, test_method)
del test_method
diff --git a/test/test_execution.py b/test/test_execution.py
index 2b115fb31..60df187de 100644
--- a/test/test_execution.py
+++ b/test/test_execution.py
@@ -1,3 +1,6 @@
+#!/usr/bin/env python
+from __future__ import unicode_literals
+
import unittest
import sys
@@ -6,17 +9,19 @@ import subprocess
rootDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
try:
_DEV_NULL = subprocess.DEVNULL
except AttributeError:
_DEV_NULL = open(os.devnull, 'wb')
+
class TestExecution(unittest.TestCase):
def test_import(self):
subprocess.check_call([sys.executable, '-c', 'import youtube_dl'], cwd=rootDir)
def test_module_exec(self):
- if sys.version_info >= (2,7): # Python 2.6 doesn't support package execution
+ if sys.version_info >= (2, 7): # Python 2.6 doesn't support package execution
subprocess.check_call([sys.executable, '-m', 'youtube_dl', '--version'], cwd=rootDir, stdout=_DEV_NULL)
def test_main_exec(self):
diff --git a/test/test_subtitles.py b/test/test_subtitles.py
index 8f4602e5f..6336dd317 100644
--- a/test/test_subtitles.py
+++ b/test/test_subtitles.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python
+from __future__ import unicode_literals
# Allow direct execution
import os
@@ -16,12 +17,14 @@ from youtube_dl.extractor import (
TEDIE,
VimeoIE,
WallaIE,
+ CeskaTelevizeIE,
)
class BaseTestSubtitles(unittest.TestCase):
url = None
IE = None
+
def setUp(self):
self.DL = FakeYDL()
self.ie = self.IE(self.DL)
@@ -74,7 +77,7 @@ class TestYoutubeSubtitles(BaseTestSubtitles):
self.assertEqual(md5(subtitles['en']), '3cb210999d3e021bd6c7f0ea751eab06')
def test_youtube_list_subtitles(self):
- self.DL.expect_warning(u'Video doesn\'t have automatic captions')
+ self.DL.expect_warning('Video doesn\'t have automatic captions')
self.DL.params['listsubtitles'] = True
info_dict = self.getInfoDict()
self.assertEqual(info_dict, None)
@@ -86,8 +89,16 @@ class TestYoutubeSubtitles(BaseTestSubtitles):
subtitles = self.getSubtitles()
self.assertTrue(subtitles['it'] is not None)
+ def test_youtube_translated_subtitles(self):
+ # This video has a subtitles track, which can be translated
+ self.url = 'Ky9eprVWzlI'
+ self.DL.params['writeautomaticsub'] = True
+ self.DL.params['subtitleslangs'] = ['it']
+ subtitles = self.getSubtitles()
+ self.assertTrue(subtitles['it'] is not None)
+
def test_youtube_nosubtitles(self):
- self.DL.expect_warning(u'video doesn\'t have subtitles')
+ self.DL.expect_warning('video doesn\'t have subtitles')
self.url = 'n5BB19UTcdA'
self.DL.params['writesubtitles'] = True
self.DL.params['allsubtitles'] = True
@@ -101,7 +112,7 @@ class TestYoutubeSubtitles(BaseTestSubtitles):
self.DL.params['subtitleslangs'] = langs
subtitles = self.getSubtitles()
for lang in langs:
- self.assertTrue(subtitles.get(lang) is not None, u'Subtitles for \'%s\' not extracted' % lang)
+ self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang)
class TestDailymotionSubtitles(BaseTestSubtitles):
@@ -130,20 +141,20 @@ class TestDailymotionSubtitles(BaseTestSubtitles):
self.assertEqual(len(subtitles.keys()), 5)
def test_list_subtitles(self):
- self.DL.expect_warning(u'Automatic Captions not supported by this server')
+ self.DL.expect_warning('Automatic Captions not supported by this server')
self.DL.params['listsubtitles'] = True
info_dict = self.getInfoDict()
self.assertEqual(info_dict, None)
def test_automatic_captions(self):
- self.DL.expect_warning(u'Automatic Captions not supported by this server')
+ self.DL.expect_warning('Automatic Captions not supported by this server')
self.DL.params['writeautomaticsub'] = True
self.DL.params['subtitleslang'] = ['en']
subtitles = self.getSubtitles()
self.assertTrue(len(subtitles.keys()) == 0)
def test_nosubtitles(self):
- self.DL.expect_warning(u'video doesn\'t have subtitles')
+ self.DL.expect_warning('video doesn\'t have subtitles')
self.url = 'http://www.dailymotion.com/video/x12u166_le-zapping-tele-star-du-08-aout-2013_tv'
self.DL.params['writesubtitles'] = True
self.DL.params['allsubtitles'] = True
@@ -156,7 +167,7 @@ class TestDailymotionSubtitles(BaseTestSubtitles):
self.DL.params['subtitleslangs'] = langs
subtitles = self.getSubtitles()
for lang in langs:
- self.assertTrue(subtitles.get(lang) is not None, u'Subtitles for \'%s\' not extracted' % lang)
+ self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang)
class TestTedSubtitles(BaseTestSubtitles):
@@ -185,13 +196,13 @@ class TestTedSubtitles(BaseTestSubtitles):
self.assertTrue(len(subtitles.keys()) >= 28)
def test_list_subtitles(self):
- self.DL.expect_warning(u'Automatic Captions not supported by this server')
+ self.DL.expect_warning('Automatic Captions not supported by this server')
self.DL.params['listsubtitles'] = True
info_dict = self.getInfoDict()
self.assertEqual(info_dict, None)
def test_automatic_captions(self):
- self.DL.expect_warning(u'Automatic Captions not supported by this server')
+ self.DL.expect_warning('Automatic Captions not supported by this server')
self.DL.params['writeautomaticsub'] = True
self.DL.params['subtitleslang'] = ['en']
subtitles = self.getSubtitles()
@@ -203,7 +214,7 @@ class TestTedSubtitles(BaseTestSubtitles):
self.DL.params['subtitleslangs'] = langs
subtitles = self.getSubtitles()
for lang in langs:
- self.assertTrue(subtitles.get(lang) is not None, u'Subtitles for \'%s\' not extracted' % lang)
+ self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang)
class TestBlipTVSubtitles(BaseTestSubtitles):
@@ -211,13 +222,13 @@ class TestBlipTVSubtitles(BaseTestSubtitles):
IE = BlipTVIE
def test_list_subtitles(self):
- self.DL.expect_warning(u'Automatic Captions not supported by this server')
+ self.DL.expect_warning('Automatic Captions not supported by this server')
self.DL.params['listsubtitles'] = True
info_dict = self.getInfoDict()
self.assertEqual(info_dict, None)
def test_allsubtitles(self):
- self.DL.expect_warning(u'Automatic Captions not supported by this server')
+ self.DL.expect_warning('Automatic Captions not supported by this server')
self.DL.params['writesubtitles'] = True
self.DL.params['allsubtitles'] = True
subtitles = self.getSubtitles()
@@ -236,7 +247,7 @@ class TestVimeoSubtitles(BaseTestSubtitles):
def test_subtitles(self):
self.DL.params['writesubtitles'] = True
subtitles = self.getSubtitles()
- self.assertEqual(md5(subtitles['en']), '8062383cf4dec168fc40a088aa6d5888')
+ self.assertEqual(md5(subtitles['en']), '26399116d23ae3cf2c087cea94bc43b4')
def test_subtitles_lang(self):
self.DL.params['writesubtitles'] = True
@@ -251,20 +262,20 @@ class TestVimeoSubtitles(BaseTestSubtitles):
self.assertEqual(set(subtitles.keys()), set(['de', 'en', 'es', 'fr']))
def test_list_subtitles(self):
- self.DL.expect_warning(u'Automatic Captions not supported by this server')
+ self.DL.expect_warning('Automatic Captions not supported by this server')
self.DL.params['listsubtitles'] = True
info_dict = self.getInfoDict()
self.assertEqual(info_dict, None)
def test_automatic_captions(self):
- self.DL.expect_warning(u'Automatic Captions not supported by this server')
+ self.DL.expect_warning('Automatic Captions not supported by this server')
self.DL.params['writeautomaticsub'] = True
self.DL.params['subtitleslang'] = ['en']
subtitles = self.getSubtitles()
self.assertTrue(len(subtitles.keys()) == 0)
def test_nosubtitles(self):
- self.DL.expect_warning(u'video doesn\'t have subtitles')
+ self.DL.expect_warning('video doesn\'t have subtitles')
self.url = 'http://vimeo.com/56015672'
self.DL.params['writesubtitles'] = True
self.DL.params['allsubtitles'] = True
@@ -277,7 +288,7 @@ class TestVimeoSubtitles(BaseTestSubtitles):
self.DL.params['subtitleslangs'] = langs
subtitles = self.getSubtitles()
for lang in langs:
- self.assertTrue(subtitles.get(lang) is not None, u'Subtitles for \'%s\' not extracted' % lang)
+ self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang)
class TestWallaSubtitles(BaseTestSubtitles):
@@ -285,13 +296,13 @@ class TestWallaSubtitles(BaseTestSubtitles):
IE = WallaIE
def test_list_subtitles(self):
- self.DL.expect_warning(u'Automatic Captions not supported by this server')
+ self.DL.expect_warning('Automatic Captions not supported by this server')
self.DL.params['listsubtitles'] = True
info_dict = self.getInfoDict()
self.assertEqual(info_dict, None)
def test_allsubtitles(self):
- self.DL.expect_warning(u'Automatic Captions not supported by this server')
+ self.DL.expect_warning('Automatic Captions not supported by this server')
self.DL.params['writesubtitles'] = True
self.DL.params['allsubtitles'] = True
subtitles = self.getSubtitles()
@@ -299,7 +310,7 @@ class TestWallaSubtitles(BaseTestSubtitles):
self.assertEqual(md5(subtitles['heb']), 'e758c5d7cb982f6bef14f377ec7a3920')
def test_nosubtitles(self):
- self.DL.expect_warning(u'video doesn\'t have subtitles')
+ self.DL.expect_warning('video doesn\'t have subtitles')
self.url = 'http://vod.walla.co.il/movie/2642630/one-direction-all-for-one'
self.DL.params['writesubtitles'] = True
self.DL.params['allsubtitles'] = True
@@ -307,5 +318,32 @@ class TestWallaSubtitles(BaseTestSubtitles):
self.assertEqual(len(subtitles), 0)
+class TestCeskaTelevizeSubtitles(BaseTestSubtitles):
+ url = 'http://www.ceskatelevize.cz/ivysilani/10600540290-u6-uzasny-svet-techniky'
+ IE = CeskaTelevizeIE
+
+ def test_list_subtitles(self):
+ self.DL.expect_warning('Automatic Captions not supported by this server')
+ self.DL.params['listsubtitles'] = True
+ info_dict = self.getInfoDict()
+ self.assertEqual(info_dict, None)
+
+ def test_allsubtitles(self):
+ self.DL.expect_warning('Automatic Captions not supported by this server')
+ self.DL.params['writesubtitles'] = True
+ self.DL.params['allsubtitles'] = True
+ subtitles = self.getSubtitles()
+ self.assertEqual(set(subtitles.keys()), set(['cs']))
+ self.assertEqual(md5(subtitles['cs']), '9bf52d9549533c32c427e264bf0847d4')
+
+ def test_nosubtitles(self):
+ self.DL.expect_warning('video doesn\'t have subtitles')
+ self.url = 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220'
+ self.DL.params['writesubtitles'] = True
+ self.DL.params['allsubtitles'] = True
+ subtitles = self.getSubtitles()
+ self.assertEqual(len(subtitles), 0)
+
+
if __name__ == '__main__':
unittest.main()
diff --git a/test/test_swfinterp.py b/test/test_swfinterp.py
index b42cd74c7..9f18055e6 100644
--- a/test/test_swfinterp.py
+++ b/test/test_swfinterp.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python
+from __future__ import unicode_literals
# Allow direct execution
import os
@@ -37,7 +38,9 @@ def _make_testfunc(testfile):
or os.path.getmtime(swf_file) < os.path.getmtime(as_file)):
# Recompile
try:
- subprocess.check_call(['mxmlc', '-output', swf_file, as_file])
+ subprocess.check_call([
+ 'mxmlc', '-output', swf_file,
+ '-static-link-runtime-shared-libraries', as_file])
except OSError as ose:
if ose.errno == errno.ENOENT:
print('mxmlc not found! Skipping test.')
diff --git a/test/test_unicode_literals.py b/test/test_unicode_literals.py
index a4ba7bad0..7f816698e 100644
--- a/test/test_unicode_literals.py
+++ b/test/test_unicode_literals.py
@@ -1,22 +1,28 @@
from __future__ import unicode_literals
-import io
+# Allow direct execution
import os
-import re
+import sys
import unittest
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import io
+import re
rootDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
IGNORED_FILES = [
'setup.py', # http://bugs.python.org/issue13943
+ 'conf.py',
+ 'buildserver.py',
]
+from test.helper import assertRegexpMatches
+
+
class TestUnicodeLiterals(unittest.TestCase):
def test_all_files(self):
- print('Skipping this test (not yet fully implemented)')
- return
-
for dirpath, _, filenames in os.walk(rootDir):
for basename in filenames:
if not basename.endswith('.py'):
@@ -30,10 +36,11 @@ class TestUnicodeLiterals(unittest.TestCase):
if "'" not in code and '"' not in code:
continue
- imps = 'from __future__ import unicode_literals'
- self.assertTrue(
- imps in code,
- ' %s missing in %s' % (imps, fn))
+ assertRegexpMatches(
+ self,
+ code,
+ r'(?:(?:#.*?|\s*)\n)*from __future__ import (?:[a-z_]+,\s*)*unicode_literals',
+ 'unicode_literals import missing in %s' % fn)
m = re.search(r'(?<=\s)u[\'"](?!\)|,|$)', code)
if m is not None:
diff --git a/test/test_utils.py b/test/test_utils.py
index bcca0efea..ebec7986f 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -16,35 +16,43 @@ import json
import xml.etree.ElementTree
from youtube_dl.utils import (
+ age_restricted,
+ args_to_str,
+ clean_html,
DateRange,
+ detect_exe_version,
encodeFilename,
+ escape_rfc3986,
+ escape_url,
find_xpath_attr,
fix_xml_ampersands,
- get_meta_content,
- orderedSet,
- OnDemandPagedList,
InAdvancePagedList,
+ intlist_to_bytes,
+ is_html,
+ js_to_json,
+ limit_length,
+ OnDemandPagedList,
+ orderedSet,
parse_duration,
+ parse_filesize,
+ parse_iso8601,
read_batch_urls,
sanitize_filename,
shell_quote,
smuggle_url,
str_to_int,
+ strip_jsonp,
struct_unpack,
timeconvert,
unescapeHTML,
unified_strdate,
unsmuggle_url,
+ uppercase_escape,
url_basename,
urlencode_postdata,
+ version_tuple,
xpath_with_ns,
- parse_iso8601,
- strip_jsonp,
- uppercase_escape,
- limit_length,
- escape_rfc3986,
- escape_url,
- js_to_json,
+ render_table,
)
@@ -73,6 +81,10 @@ class TestUtil(unittest.TestCase):
tests = '\u043a\u0438\u0440\u0438\u043b\u043b\u0438\u0446\u0430'
self.assertEqual(sanitize_filename(tests), tests)
+ self.assertEqual(
+ sanitize_filename('New World record at 0:12:34'),
+ 'New World record at 0_12_34')
+
forbidden = '"\0\\/'
for fc in forbidden:
for fbc in forbidden:
@@ -117,16 +129,16 @@ class TestUtil(unittest.TestCase):
self.assertEqual(orderedSet([1, 1, 2, 3, 4, 4, 5, 6, 7, 3, 5]), [1, 2, 3, 4, 5, 6, 7])
self.assertEqual(orderedSet([]), [])
self.assertEqual(orderedSet([1]), [1])
- #keep the list ordered
+ # keep the list ordered
self.assertEqual(orderedSet([135, 1, 1, 1]), [135, 1])
def test_unescape_html(self):
self.assertEqual(unescapeHTML('%20;'), '%20;')
self.assertEqual(
unescapeHTML('&eacute;'), 'é')
-
+
def test_daterange(self):
- _20century = DateRange("19000101","20000101")
+ _20century = DateRange("19000101", "20000101")
self.assertFalse("17890714" in _20century)
_ac = DateRange("00010101")
self.assertTrue("19690721" in _ac)
@@ -138,8 +150,12 @@ class TestUtil(unittest.TestCase):
self.assertEqual(unified_strdate('8/7/2009'), '20090708')
self.assertEqual(unified_strdate('Dec 14, 2012'), '20121214')
self.assertEqual(unified_strdate('2012/10/11 01:56:38 +0000'), '20121011')
+ self.assertEqual(unified_strdate('1968 12 10'), '19681210')
self.assertEqual(unified_strdate('1968-12-10'), '19681210')
self.assertEqual(unified_strdate('28/01/2014 21:00:00 +0100'), '20140128')
+ self.assertEqual(
+ unified_strdate('11/26/2014 11:30:00 AM PST', day_first=False),
+ '20141126')
def test_find_xpath_attr(self):
testxml = '''<root>
@@ -154,17 +170,6 @@ class TestUtil(unittest.TestCase):
self.assertEqual(find_xpath_attr(doc, './/node', 'x', 'a'), doc[1])
self.assertEqual(find_xpath_attr(doc, './/node', 'y', 'c'), doc[2])
- def test_meta_parser(self):
- testhtml = '''
- <head>
- <meta name="description" content="foo &amp; bar">
- <meta content='Plato' name='author'/>
- </head>
- '''
- get_meta = lambda name: get_meta_content(name, testhtml)
- self.assertEqual(get_meta('description'), 'foo & bar')
- self.assertEqual(get_meta('author'), 'Plato')
-
def test_xpath_with_ns(self):
testxml = '''<root xmlns:media="http://example.com/">
<media:song>
@@ -179,7 +184,7 @@ class TestUtil(unittest.TestCase):
self.assertEqual(find('media:song/url').text, 'http://server.com/download.mp3')
def test_smuggle_url(self):
- data = {u"ö": u"ö", u"abc": [3]}
+ data = {"ö": "ö", "abc": [3]}
url = 'https://foo.bar/baz?x=y#a'
smug_url = smuggle_url(url, data)
unsmug_url, unsmug_data = unsmuggle_url(smug_url)
@@ -210,6 +215,8 @@ class TestUtil(unittest.TestCase):
def test_parse_duration(self):
self.assertEqual(parse_duration(None), None)
+ self.assertEqual(parse_duration(False), None)
+ self.assertEqual(parse_duration('invalid'), None)
self.assertEqual(parse_duration('1'), 1)
self.assertEqual(parse_duration('1337:12'), 80232)
self.assertEqual(parse_duration('9:12:43'), 33163)
@@ -227,6 +234,10 @@ class TestUtil(unittest.TestCase):
self.assertEqual(parse_duration('0m0s'), 0)
self.assertEqual(parse_duration('0s'), 0)
self.assertEqual(parse_duration('01:02:03.05'), 3723.05)
+ self.assertEqual(parse_duration('T30M38S'), 1838)
+ self.assertEqual(parse_duration('5 s'), 5)
+ self.assertEqual(parse_duration('3 min'), 180)
+ self.assertEqual(parse_duration('2.5 hours'), 9000)
def test_fix_xml_ampersands(self):
self.assertEqual(
@@ -286,12 +297,17 @@ class TestUtil(unittest.TestCase):
self.assertEqual(parse_iso8601('2014-03-23T23:04:26+0100'), 1395612266)
self.assertEqual(parse_iso8601('2014-03-23T22:04:26+0000'), 1395612266)
self.assertEqual(parse_iso8601('2014-03-23T22:04:26Z'), 1395612266)
+ self.assertEqual(parse_iso8601('2014-03-23T22:04:26.1234Z'), 1395612266)
def test_strip_jsonp(self):
stripped = strip_jsonp('cb ([ {"id":"532cb",\n\n\n"x":\n3}\n]\n);')
d = json.loads(stripped)
self.assertEqual(d, [{"id": "532cb", "x": 3}])
+ stripped = strip_jsonp('parseMetadata({"STATUS":"OK"})\n\n\n//epc')
+ d = json.loads(stripped)
+ self.assertEqual(d, {'STATUS': 'OK'})
+
def test_uppercase_escape(self):
self.assertEqual(uppercase_escape('aä'), 'aä')
self.assertEqual(uppercase_escape('\\U0001d550'), '𝕐')
@@ -355,5 +371,79 @@ class TestUtil(unittest.TestCase):
on = js_to_json('{"abc": true}')
self.assertEqual(json.loads(on), {'abc': True})
+ def test_clean_html(self):
+ self.assertEqual(clean_html('a:\nb'), 'a: b')
+ self.assertEqual(clean_html('a:\n "b"'), 'a: "b"')
+
+ def test_intlist_to_bytes(self):
+ self.assertEqual(
+ intlist_to_bytes([0, 1, 127, 128, 255]),
+ b'\x00\x01\x7f\x80\xff')
+
+ def test_args_to_str(self):
+ self.assertEqual(
+ args_to_str(['foo', 'ba/r', '-baz', '2 be', '']),
+ 'foo ba/r -baz \'2 be\' \'\''
+ )
+
+ def test_parse_filesize(self):
+ self.assertEqual(parse_filesize(None), None)
+ self.assertEqual(parse_filesize(''), None)
+ self.assertEqual(parse_filesize('91 B'), 91)
+ self.assertEqual(parse_filesize('foobar'), None)
+ self.assertEqual(parse_filesize('2 MiB'), 2097152)
+ self.assertEqual(parse_filesize('5 GB'), 5000000000)
+ self.assertEqual(parse_filesize('1.2Tb'), 1200000000000)
+ self.assertEqual(parse_filesize('1,24 KB'), 1240)
+
+ def test_version_tuple(self):
+ self.assertEqual(version_tuple('1'), (1,))
+ self.assertEqual(version_tuple('10.23.344'), (10, 23, 344))
+ self.assertEqual(version_tuple('10.1-6'), (10, 1, 6)) # avconv style
+
+ def test_detect_exe_version(self):
+ self.assertEqual(detect_exe_version('''ffmpeg version 1.2.1
+built on May 27 2013 08:37:26 with gcc 4.7 (Debian 4.7.3-4)
+configuration: --prefix=/usr --extra-'''), '1.2.1')
+ self.assertEqual(detect_exe_version('''ffmpeg version N-63176-g1fb4685
+built on May 15 2014 22:09:06 with gcc 4.8.2 (GCC)'''), 'N-63176-g1fb4685')
+ self.assertEqual(detect_exe_version('''X server found. dri2 connection failed!
+Trying to open render node...
+Success at /dev/dri/renderD128.
+ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4')
+
+ def test_age_restricted(self):
+ self.assertFalse(age_restricted(None, 10)) # unrestricted content
+ self.assertFalse(age_restricted(1, None)) # unrestricted policy
+ self.assertFalse(age_restricted(8, 10))
+ self.assertTrue(age_restricted(18, 14))
+ self.assertFalse(age_restricted(18, 18))
+
+ def test_is_html(self):
+ self.assertFalse(is_html(b'\x49\x44\x43<html'))
+ self.assertTrue(is_html(b'<!DOCTYPE foo>\xaaa'))
+ self.assertTrue(is_html( # UTF-8 with BOM
+ b'\xef\xbb\xbf<!DOCTYPE foo>\xaaa'))
+ self.assertTrue(is_html( # UTF-16-LE
+ b'\xff\xfe<\x00h\x00t\x00m\x00l\x00>\x00\xe4\x00'
+ ))
+ self.assertTrue(is_html( # UTF-16-BE
+ b'\xfe\xff\x00<\x00h\x00t\x00m\x00l\x00>\x00\xe4'
+ ))
+ self.assertTrue(is_html( # UTF-32-BE
+ b'\x00\x00\xFE\xFF\x00\x00\x00<\x00\x00\x00h\x00\x00\x00t\x00\x00\x00m\x00\x00\x00l\x00\x00\x00>\x00\x00\x00\xe4'))
+ self.assertTrue(is_html( # UTF-32-LE
+ b'\xFF\xFE\x00\x00<\x00\x00\x00h\x00\x00\x00t\x00\x00\x00m\x00\x00\x00l\x00\x00\x00>\x00\x00\x00\xe4\x00\x00\x00'))
+
+ def test_render_table(self):
+ self.assertEqual(
+ render_table(
+ ['a', 'bcd'],
+ [[123, 4], [9999, 51]]),
+ 'a bcd\n'
+ '123 4\n'
+ '9999 51')
+
+
if __name__ == '__main__':
unittest.main()
diff --git a/test/test_write_annotations.py b/test/test_write_annotations.py
index eac53b285..780636c77 100644
--- a/test/test_write_annotations.py
+++ b/test/test_write_annotations.py
@@ -1,5 +1,6 @@
#!/usr/bin/env python
# coding: utf-8
+from __future__ import unicode_literals
# Allow direct execution
import os
@@ -31,19 +32,18 @@ params = get_params({
})
-
TEST_ID = 'gr51aVj-mLg'
ANNOTATIONS_FILE = TEST_ID + '.flv.annotations.xml'
EXPECTED_ANNOTATIONS = ['Speech bubble', 'Note', 'Title', 'Spotlight', 'Label']
+
class TestAnnotations(unittest.TestCase):
def setUp(self):
# Clear old files
self.tearDown()
-
def test_info_json(self):
- expected = list(EXPECTED_ANNOTATIONS) #Two annotations could have the same text.
+ expected = list(EXPECTED_ANNOTATIONS) # Two annotations could have the same text.
ie = youtube_dl.extractor.YoutubeIE()
ydl = YoutubeDL(params)
ydl.add_info_extractor(ie)
@@ -51,7 +51,7 @@ class TestAnnotations(unittest.TestCase):
self.assertTrue(os.path.exists(ANNOTATIONS_FILE))
annoxml = None
with io.open(ANNOTATIONS_FILE, 'r', encoding='utf-8') as annof:
- annoxml = xml.etree.ElementTree.parse(annof)
+ annoxml = xml.etree.ElementTree.parse(annof)
self.assertTrue(annoxml is not None, 'Failed to parse annotations XML')
root = annoxml.getroot()
self.assertEqual(root.tag, 'document')
@@ -59,18 +59,17 @@ class TestAnnotations(unittest.TestCase):
self.assertEqual(annotationsTag.tag, 'annotations')
annotations = annotationsTag.findall('annotation')
- #Not all the annotations have TEXT children and the annotations are returned unsorted.
+ # Not all the annotations have TEXT children and the annotations are returned unsorted.
for a in annotations:
- self.assertEqual(a.tag, 'annotation')
- if a.get('type') == 'text':
- textTag = a.find('TEXT')
- text = textTag.text
- self.assertTrue(text in expected) #assertIn only added in python 2.7
- #remove the first occurance, there could be more than one annotation with the same text
- expected.remove(text)
- #We should have seen (and removed) all the expected annotation texts.
+ self.assertEqual(a.tag, 'annotation')
+ if a.get('type') == 'text':
+ textTag = a.find('TEXT')
+ text = textTag.text
+ self.assertTrue(text in expected) # assertIn only added in python 2.7
+ # remove the first occurance, there could be more than one annotation with the same text
+ expected.remove(text)
+ # We should have seen (and removed) all the expected annotation texts.
self.assertEqual(len(expected), 0, 'Not all expected annotations were found.')
-
def tearDown(self):
try_rm(ANNOTATIONS_FILE)
diff --git a/test/test_write_info_json.py b/test/test_write_info_json.py
deleted file mode 100644
index 90426a559..000000000
--- a/test/test_write_info_json.py
+++ /dev/null
@@ -1,75 +0,0 @@
-#!/usr/bin/env python
-# coding: utf-8
-
-# Allow direct execution
-import os
-import sys
-import unittest
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
-from test.helper import get_params
-
-
-import io
-import json
-
-import youtube_dl.YoutubeDL
-import youtube_dl.extractor
-
-
-class YoutubeDL(youtube_dl.YoutubeDL):
- def __init__(self, *args, **kwargs):
- super(YoutubeDL, self).__init__(*args, **kwargs)
- self.to_stderr = self.to_screen
-
-params = get_params({
- 'writeinfojson': True,
- 'skip_download': True,
- 'writedescription': True,
-})
-
-
-TEST_ID = 'BaW_jenozKc'
-INFO_JSON_FILE = TEST_ID + '.info.json'
-DESCRIPTION_FILE = TEST_ID + '.mp4.description'
-EXPECTED_DESCRIPTION = u'''test chars: "'/\ä↭𝕐
-test URL: https://github.com/rg3/youtube-dl/issues/1892
-
-This is a test video for youtube-dl.
-
-For more information, contact phihag@phihag.de .'''
-
-
-class TestInfoJSON(unittest.TestCase):
- def setUp(self):
- # Clear old files
- self.tearDown()
-
- def test_info_json(self):
- ie = youtube_dl.extractor.YoutubeIE()
- ydl = YoutubeDL(params)
- ydl.add_info_extractor(ie)
- ydl.download([TEST_ID])
- self.assertTrue(os.path.exists(INFO_JSON_FILE))
- with io.open(INFO_JSON_FILE, 'r', encoding='utf-8') as jsonf:
- jd = json.load(jsonf)
- self.assertEqual(jd['upload_date'], u'20121002')
- self.assertEqual(jd['description'], EXPECTED_DESCRIPTION)
- self.assertEqual(jd['id'], TEST_ID)
- self.assertEqual(jd['extractor'], 'youtube')
- self.assertEqual(jd['title'], u'''youtube-dl test video "'/\ä↭𝕐''')
- self.assertEqual(jd['uploader'], 'Philipp Hagemeister')
-
- self.assertTrue(os.path.exists(DESCRIPTION_FILE))
- with io.open(DESCRIPTION_FILE, 'r', encoding='utf-8') as descf:
- descr = descf.read()
- self.assertEqual(descr, EXPECTED_DESCRIPTION)
-
- def tearDown(self):
- if os.path.exists(INFO_JSON_FILE):
- os.remove(INFO_JSON_FILE)
- if os.path.exists(DESCRIPTION_FILE):
- os.remove(DESCRIPTION_FILE)
-
-if __name__ == '__main__':
- unittest.main()
diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py
index 410f9edc2..c889b6f15 100644
--- a/test/test_youtube_lists.py
+++ b/test/test_youtube_lists.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python
+from __future__ import unicode_literals
# Allow direct execution
import os
@@ -12,10 +13,6 @@ from test.helper import FakeYDL
from youtube_dl.extractor import (
YoutubePlaylistIE,
YoutubeIE,
- YoutubeChannelIE,
- YoutubeShowIE,
- YoutubeTopListIE,
- YoutubeSearchURLIE,
)
@@ -31,7 +28,7 @@ class TestYoutubeLists(unittest.TestCase):
result = ie.extract('https://www.youtube.com/watch?v=FXxLjLQi3Fg&list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re')
self.assertEqual(result['_type'], 'url')
self.assertEqual(YoutubeIE().extract_id(result['url']), 'FXxLjLQi3Fg')
-
+
def test_youtube_course(self):
dl = FakeYDL()
ie = YoutubePlaylistIE(dl)
diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py
index df2cb09f2..13d228cd8 100644
--- a/test/test_youtube_signature.py
+++ b/test/test_youtube_signature.py
@@ -14,7 +14,7 @@ import re
import string
from youtube_dl.extractor import YoutubeIE
-from youtube_dl.utils import compat_str, compat_urlretrieve
+from youtube_dl.compat import compat_str, compat_urlretrieve
_TESTS = [
(
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index dec0e20e7..0e73dc8ff 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -7,8 +7,10 @@ import collections
import datetime
import errno
import io
+import itertools
import json
import locale
+import operator
import os
import platform
import re
@@ -22,12 +24,16 @@ import traceback
if os.name == 'nt':
import ctypes
-from .utils import (
+from .compat import (
compat_cookiejar,
+ compat_expanduser,
compat_http_client,
+ compat_kwargs,
compat_str,
compat_urllib_error,
compat_urllib_request,
+)
+from .utils import (
escape_url,
ContentTooShortError,
date_from_str,
@@ -44,24 +50,37 @@ from .utils import (
make_HTTPS_handler,
MaxDownloadsReached,
PagedList,
+ parse_filesize,
PostProcessingError,
platform_name,
preferredencoding,
+ render_table,
SameFileError,
sanitize_filename,
+ std_headers,
subtitles_filename,
takewhile_inclusive,
UnavailableVideoError,
url_basename,
+ version_tuple,
write_json_file,
write_string,
YoutubeDLHandler,
prepend_extension,
+ args_to_str,
+ age_restricted,
)
from .cache import Cache
from .extractor import get_info_extractor, gen_extractors
from .downloader import get_suitable_downloader
-from .postprocessor import FFmpegMergerPP
+from .downloader.rtmp import rtmpdump_version
+from .postprocessor import (
+ FFmpegFixupM4aPP,
+ FFmpegFixupStretchedPP,
+ FFmpegMergerPP,
+ FFmpegPostProcessor,
+ get_postprocessor,
+)
from .version import __version__
@@ -107,8 +126,10 @@ class YoutubeDL(object):
forcefilename: Force printing final filename.
forceduration: Force printing duration.
forcejson: Force printing info_dict as JSON.
+ dump_single_json: Force printing the info_dict of the whole playlist
+ (or video) as a single JSON line.
simulate: Do not download the video files.
- format: Video format code.
+ format: Video format code. See options.py for more information.
format_limit: Highest quality format to try.
outtmpl: Template for output names.
restrictfilenames: Do not allow "&" and spaces in file names
@@ -116,6 +137,7 @@ class YoutubeDL(object):
nooverwrites: Prevent overwriting files.
playliststart: Playlist item to start at.
playlistend: Playlist item to end at.
+ playlistreverse: Download playlist items in reverse order.
matchtitle: Download only matching titles.
rejecttitle: Reject downloads for matching titles.
logger: Log messages to a logging.Logger instance.
@@ -124,6 +146,7 @@ class YoutubeDL(object):
writeinfojson: Write the video description to a .info.json file
writeannotations: Write the video annotations to a .annotations.xml file
writethumbnail: Write the thumbnail image to a file
+ write_all_thumbnails: Write all thumbnail formats to files
writesubtitles: Write the video subtitles to a file
writeautomaticsub: Write the automatic subtitles to a file
allsubtitles: Downloads all the subtitles of the video
@@ -165,6 +188,44 @@ class YoutubeDL(object):
'auto' for elaborate guessing
encoding: Use this encoding instead of the system-specified.
extract_flat: Do not resolve URLs, return the immediate result.
+ Pass in 'in_playlist' to only show this behavior for
+ playlist items.
+ postprocessors: A list of dictionaries, each with an entry
+ * key: The name of the postprocessor. See
+ youtube_dl/postprocessor/__init__.py for a list.
+ as well as any further keyword arguments for the
+ postprocessor.
+ progress_hooks: A list of functions that get called on download
+ progress, with a dictionary with the entries
+ * filename: The final filename
+ * status: One of "downloading" and "finished"
+
+ The dict may also have some of the following entries:
+
+ * downloaded_bytes: Bytes on disk
+ * total_bytes: Size of the whole file, None if unknown
+ * tmpfilename: The filename we're currently writing to
+ * eta: The estimated time in seconds, None if unknown
+ * speed: The download speed in bytes/second, None if
+ unknown
+
+ Progress hooks are guaranteed to be called at least once
+ (with status "finished") if the download is successful.
+ merge_output_format: Extension to use when merging formats.
+ fixup: Automatically correct known faults of the file.
+ One of:
+ - "never": do nothing
+ - "warn": only emit a warning
+ - "detect_or_warn": check whether we can do anything
+ about it, warn otherwise (default)
+ source_address: (Experimental) Client-side IP address to bind to.
+ call_home: Boolean, true iff we are allowed to contact the
+ youtube-dl servers for debugging.
+ sleep_interval: Number of seconds to sleep before each download.
+ external_downloader: Executable of the external downloader to call.
+ listformats: Print an overview of available video formats and exit.
+ list_thumbnails: Print a table of all thumbnails and exit.
+
The following parameters are not used by YoutubeDL itself, they are used by
the FileDownloader:
@@ -184,7 +245,7 @@ class YoutubeDL(object):
_num_downloads = None
_screen_file = None
- def __init__(self, params=None):
+ def __init__(self, params=None, auto_init=True):
"""Create a FileDownloader object with the given options."""
if params is None:
params = {}
@@ -241,6 +302,36 @@ class YoutubeDL(object):
self._setup_opener()
+ if auto_init:
+ self.print_debug_header()
+ self.add_default_info_extractors()
+
+ for pp_def_raw in self.params.get('postprocessors', []):
+ pp_class = get_postprocessor(pp_def_raw['key'])
+ pp_def = dict(pp_def_raw)
+ del pp_def['key']
+ pp = pp_class(self, **compat_kwargs(pp_def))
+ self.add_post_processor(pp)
+
+ for ph in self.params.get('progress_hooks', []):
+ self.add_progress_hook(ph)
+
+ def warn_if_short_id(self, argv):
+ # short YouTube ID starting with dash?
+ idxs = [
+ i for i, a in enumerate(argv)
+ if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
+ if idxs:
+ correct_argv = (
+ ['youtube-dl'] +
+ [a for i, a in enumerate(argv) if i not in idxs] +
+ ['--'] + [argv[i] for i in idxs]
+ )
+ self.report_warning(
+ 'Long argument string detected. '
+ 'Use -- to separate parameters and URLs, like this:\n%s\n' %
+ args_to_str(correct_argv))
+
def add_info_extractor(self, ie):
"""Add an InfoExtractor object to the end of the list."""
self._ies.append(ie)
@@ -285,7 +376,7 @@ class YoutubeDL(object):
self._output_process.stdin.write((message + '\n').encode('utf-8'))
self._output_process.stdin.flush()
res = ''.join(self._output_channel.readline().decode('utf-8')
- for _ in range(line_count))
+ for _ in range(line_count))
return res[:-len('\n')]
def to_screen(self, message, skip_eol=False):
@@ -447,7 +538,7 @@ class YoutubeDL(object):
template_dict = collections.defaultdict(lambda: 'NA', template_dict)
outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
- tmpl = os.path.expanduser(outtmpl)
+ tmpl = compat_expanduser(outtmpl)
filename = tmpl % template_dict
return filename
except ValueError as err:
@@ -482,13 +573,8 @@ class YoutubeDL(object):
max_views = self.params.get('max_views')
if max_views is not None and view_count > max_views:
return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
- age_limit = self.params.get('age_limit')
- if age_limit is not None:
- actual_age_limit = info_dict.get('age_limit')
- if actual_age_limit is None:
- actual_age_limit = 0
- if age_limit < actual_age_limit:
- return 'Skipping "' + title + '" because it is age restricted'
+ if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
+ return 'Skipping "%s" because it is age restricted' % title
if self.in_download_archive(info_dict):
return '%s has already been recorded in archive' % video_title
return None
@@ -522,7 +608,7 @@ class YoutubeDL(object):
try:
ie_result = ie.extract(url)
- if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
+ if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
break
if isinstance(ie_result, list):
# Backwards compatibility: old IE result format
@@ -535,7 +621,7 @@ class YoutubeDL(object):
return self.process_ie_result(ie_result, download, extra_info)
else:
return ie_result
- except ExtractorError as de: # An error we somewhat expected
+ except ExtractorError as de: # An error we somewhat expected
self.report_error(compat_str(de), de.format_traceback())
break
except MaxDownloadsReached:
@@ -568,8 +654,12 @@ class YoutubeDL(object):
result_type = ie_result.get('_type', 'video')
- if self.params.get('extract_flat', False):
- if result_type in ('url', 'url_transparent'):
+ if result_type in ('url', 'url_transparent'):
+ extract_flat = self.params.get('extract_flat', False)
+ if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
+ extract_flat is True):
+ if self.params.get('forcejson', False):
+ self.to_stdout(json.dumps(ie_result))
return ie_result
if result_type == 'video':
@@ -588,27 +678,19 @@ class YoutubeDL(object):
ie_result['url'], ie_key=ie_result.get('ie_key'),
extra_info=extra_info, download=False, process=False)
- def make_result(embedded_info):
- new_result = ie_result.copy()
- for f in ('_type', 'url', 'ext', 'player_url', 'formats',
- 'entries', 'ie_key', 'duration',
- 'subtitles', 'annotations', 'format',
- 'thumbnail', 'thumbnails'):
- if f in new_result:
- del new_result[f]
- if f in embedded_info:
- new_result[f] = embedded_info[f]
- return new_result
- new_result = make_result(info)
+ force_properties = dict(
+ (k, v) for k, v in ie_result.items() if v is not None)
+ for f in ('_type', 'url'):
+ if f in force_properties:
+ del force_properties[f]
+ new_result = info.copy()
+ new_result.update(force_properties)
assert new_result.get('_type') != 'url_transparent'
- if new_result.get('_type') == 'compat_list':
- new_result['entries'] = [
- make_result(e) for e in new_result['entries']]
return self.process_ie_result(
new_result, download=download, extra_info=extra_info)
- elif result_type == 'playlist':
+ elif result_type == 'playlist' or result_type == 'multi_video':
# We process each entry in the playlist
playlist = ie_result.get('title', None) or ie_result.get('id', None)
self.to_screen('[download] Downloading playlist: %s' % playlist)
@@ -621,27 +703,39 @@ class YoutubeDL(object):
if playlistend == -1:
playlistend = None
- if isinstance(ie_result['entries'], list):
- n_all_entries = len(ie_result['entries'])
- entries = ie_result['entries'][playliststart:playlistend]
+ ie_entries = ie_result['entries']
+ if isinstance(ie_entries, list):
+ n_all_entries = len(ie_entries)
+ entries = ie_entries[playliststart:playlistend]
n_entries = len(entries)
self.to_screen(
"[%s] playlist %s: Collected %d video ids (downloading %d of them)" %
(ie_result['extractor'], playlist, n_all_entries, n_entries))
- else:
- assert isinstance(ie_result['entries'], PagedList)
- entries = ie_result['entries'].getslice(
+ elif isinstance(ie_entries, PagedList):
+ entries = ie_entries.getslice(
playliststart, playlistend)
n_entries = len(entries)
self.to_screen(
"[%s] playlist %s: Downloading %d videos" %
(ie_result['extractor'], playlist, n_entries))
+ else: # iterable
+ entries = list(itertools.islice(
+ ie_entries, playliststart, playlistend))
+ n_entries = len(entries)
+ self.to_screen(
+ "[%s] playlist %s: Downloading %d videos" %
+ (ie_result['extractor'], playlist, n_entries))
+
+ if self.params.get('playlistreverse', False):
+ entries = entries[::-1]
for i, entry in enumerate(entries, 1):
- self.to_screen('[download] Downloading video #%s of %s' % (i, n_entries))
+ self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
extra = {
'n_entries': n_entries,
'playlist': playlist,
+ 'playlist_id': ie_result.get('id'),
+ 'playlist_title': ie_result.get('title'),
'playlist_index': i + playliststart,
'extractor': ie_result['extractor'],
'webpage_url': ie_result['webpage_url'],
@@ -661,14 +755,20 @@ class YoutubeDL(object):
ie_result['entries'] = playlist_results
return ie_result
elif result_type == 'compat_list':
+ self.report_warning(
+ 'Extractor %s returned a compat_list result. '
+ 'It needs to be updated.' % ie_result.get('extractor'))
+
def _fixup(r):
- self.add_extra_info(r,
+ self.add_extra_info(
+ r,
{
'extractor': ie_result['extractor'],
'webpage_url': ie_result['webpage_url'],
'webpage_url_basename': url_basename(ie_result['webpage_url']),
'extractor_key': ie_result['extractor_key'],
- })
+ }
+ )
return r
ie_result['entries'] = [
self.process_ie_result(_fixup(r), download, extra_info)
@@ -678,7 +778,59 @@ class YoutubeDL(object):
else:
raise Exception('Invalid result type: %s' % result_type)
+ def _apply_format_filter(self, format_spec, available_formats):
+ " Returns a tuple of the remaining format_spec and filtered formats "
+
+ OPERATORS = {
+ '<': operator.lt,
+ '<=': operator.le,
+ '>': operator.gt,
+ '>=': operator.ge,
+ '=': operator.eq,
+ '!=': operator.ne,
+ }
+ operator_rex = re.compile(r'''(?x)\s*\[
+ (?P<key>width|height|tbr|abr|vbr|filesize)
+ \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
+ (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
+ \]$
+ ''' % '|'.join(map(re.escape, OPERATORS.keys())))
+ m = operator_rex.search(format_spec)
+ if not m:
+ raise ValueError('Invalid format specification %r' % format_spec)
+
+ try:
+ comparison_value = int(m.group('value'))
+ except ValueError:
+ comparison_value = parse_filesize(m.group('value'))
+ if comparison_value is None:
+ comparison_value = parse_filesize(m.group('value') + 'B')
+ if comparison_value is None:
+ raise ValueError(
+ 'Invalid value %r in format specification %r' % (
+ m.group('value'), format_spec))
+ op = OPERATORS[m.group('op')]
+
+ def _filter(f):
+ actual_value = f.get(m.group('key'))
+ if actual_value is None:
+ return m.group('none_inclusive')
+ return op(actual_value, comparison_value)
+ new_formats = [f for f in available_formats if _filter(f)]
+
+ new_format_spec = format_spec[:-len(m.group(0))]
+ if not new_format_spec:
+ new_format_spec = 'best'
+
+ return (new_format_spec, new_formats)
+
def select_format(self, format_spec, available_formats):
+ while format_spec.endswith(']'):
+ format_spec, available_formats = self._apply_format_filter(
+ format_spec, available_formats)
+ if not available_formats:
+ return None
+
if format_spec == 'best' or format_spec is None:
return available_formats[-1]
elif format_spec == 'worst':
@@ -708,7 +860,7 @@ class YoutubeDL(object):
if video_formats:
return video_formats[0]
else:
- extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a']
+ extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
if format_spec in extensions:
filter_f = lambda f: f['ext'] == format_spec
else:
@@ -718,6 +870,42 @@ class YoutubeDL(object):
return matches[-1]
return None
+ def _calc_headers(self, info_dict):
+ res = std_headers.copy()
+
+ add_headers = info_dict.get('http_headers')
+ if add_headers:
+ res.update(add_headers)
+
+ cookies = self._calc_cookies(info_dict)
+ if cookies:
+ res['Cookie'] = cookies
+
+ return res
+
+ def _calc_cookies(self, info_dict):
+ class _PseudoRequest(object):
+ def __init__(self, url):
+ self.url = url
+ self.headers = {}
+ self.unverifiable = False
+
+ def add_unredirected_header(self, k, v):
+ self.headers[k] = v
+
+ def get_full_url(self):
+ return self.url
+
+ def is_unverifiable(self):
+ return self.unverifiable
+
+ def has_header(self, h):
+ return h in self.headers
+
+ pr = _PseudoRequest(info_dict['url'])
+ self.cookiejar.add_cookie_header(pr)
+ return pr.headers.get('Cookie')
+
def process_video_result(self, info_dict, download=True):
assert info_dict.get('_type', 'video') == 'video'
@@ -732,9 +920,14 @@ class YoutubeDL(object):
info_dict['playlist_index'] = None
thumbnails = info_dict.get('thumbnails')
+ if thumbnails is None:
+ thumbnail = info_dict.get('thumbnail')
+ if thumbnail:
+ thumbnails = [{'url': thumbnail}]
if thumbnails:
thumbnails.sort(key=lambda t: (
- t.get('width'), t.get('height'), t.get('url')))
+ t.get('preference'), t.get('width'), t.get('height'),
+ t.get('id'), t.get('url')))
for t in thumbnails:
if 'width' in t and 'height' in t:
t['resolution'] = '%dx%d' % (t['width'], t['height'])
@@ -746,6 +939,10 @@ class YoutubeDL(object):
info_dict['display_id'] = info_dict['id']
if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
+ # Working around negative timestamps in Windows
+ # (see http://bugs.python.org/issue1646728)
+ if info_dict['timestamp'] < 0 and os.name == 'nt':
+ info_dict['timestamp'] = 0
upload_date = datetime.datetime.utcfromtimestamp(
info_dict['timestamp'])
info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
@@ -782,6 +979,11 @@ class YoutubeDL(object):
# Automatically determine file extension if missing
if 'ext' not in format:
format['ext'] = determine_ext(format['url']).lower()
+ # Add HTTP headers, so that external programs can use them from the
+ # json output
+ full_format_info = info_dict.copy()
+ full_format_info.update(format)
+ format['http_headers'] = self._calc_headers(full_format_info)
format_limit = self.params.get('format_limit', None)
if format_limit:
@@ -797,9 +999,12 @@ class YoutubeDL(object):
# element in the 'formats' field in info_dict is info_dict itself,
# wich can't be exported to json
info_dict['formats'] = formats
- if self.params.get('listformats', None):
+ if self.params.get('listformats'):
self.list_formats(info_dict)
return
+ if self.params.get('list_thumbnails'):
+ self.list_thumbnails(info_dict)
+ return
req_format = self.params.get('format')
if req_format is None:
@@ -818,12 +1023,33 @@ class YoutubeDL(object):
# Two formats have been requested like '137+139'
format_1, format_2 = rf.split('+')
formats_info = (self.select_format(format_1, formats),
- self.select_format(format_2, formats))
+ self.select_format(format_2, formats))
if all(formats_info):
+ # The first format must contain the video and the
+ # second the audio
+ if formats_info[0].get('vcodec') == 'none':
+ self.report_error('The first format must '
+ 'contain the video, try using '
+ '"-f %s+%s"' % (format_2, format_1))
+ return
+ output_ext = (
+ formats_info[0]['ext']
+ if self.params.get('merge_output_format') is None
+ else self.params['merge_output_format'])
selected_format = {
'requested_formats': formats_info,
'format': rf,
'ext': formats_info[0]['ext'],
+ 'width': formats_info[0].get('width'),
+ 'height': formats_info[0].get('height'),
+ 'resolution': formats_info[0].get('resolution'),
+ 'fps': formats_info[0].get('fps'),
+ 'vcodec': formats_info[0].get('vcodec'),
+ 'vbr': formats_info[0].get('vbr'),
+ 'stretched_ratio': formats_info[0].get('stretched_ratio'),
+ 'acodec': formats_info[1].get('acodec'),
+ 'abr': formats_info[1].get('abr'),
+ 'ext': output_ext,
}
else:
selected_format = None
@@ -882,8 +1108,12 @@ class YoutubeDL(object):
if self.params.get('forceid', False):
self.to_stdout(info_dict['id'])
if self.params.get('forceurl', False):
- # For RTMP URLs, also include the playpath
- self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
+ if info_dict.get('requested_formats') is not None:
+ for f in info_dict['requested_formats']:
+ self.to_stdout(f['url'] + f.get('play_path', ''))
+ else:
+ # For RTMP URLs, also include the playpath
+ self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
self.to_stdout(info_dict['thumbnail'])
if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
@@ -897,6 +1127,8 @@ class YoutubeDL(object):
if self.params.get('forcejson', False):
info_dict['_filename'] = filename
self.to_stdout(json.dumps(info_dict))
+ if self.params.get('dump_single_json', False):
+ info_dict['_filename'] = filename
# Do nothing else if in simulate mode
if self.params.get('simulate', False):
@@ -917,13 +1149,13 @@ class YoutubeDL(object):
descfn = filename + '.description'
if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
self.to_screen('[info] Video description is already present')
+ elif info_dict.get('description') is None:
+ self.report_warning('There\'s no description to write.')
else:
try:
self.to_screen('[info] Writing video description to: ' + descfn)
with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
descfile.write(info_dict['description'])
- except (KeyError, TypeError):
- self.report_warning('There\'s no description to write.')
except (OSError, IOError):
self.report_error('Cannot write description file ' + descfn)
return
@@ -962,7 +1194,7 @@ class YoutubeDL(object):
else:
self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
- subfile.write(sub)
+ subfile.write(sub)
except (OSError, IOError):
self.report_error('Cannot write subtitles file ' + sub_filename)
return
@@ -974,84 +1206,100 @@ class YoutubeDL(object):
else:
self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
try:
- write_json_file(info_dict, encodeFilename(infofn))
+ write_json_file(info_dict, infofn)
except (OSError, IOError):
self.report_error('Cannot write metadata to JSON file ' + infofn)
return
- if self.params.get('writethumbnail', False):
- if info_dict.get('thumbnail') is not None:
- thumb_format = determine_ext(info_dict['thumbnail'], 'jpg')
- thumb_filename = os.path.splitext(filename)[0] + '.' + thumb_format
- if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
- self.to_screen('[%s] %s: Thumbnail is already present' %
- (info_dict['extractor'], info_dict['id']))
- else:
- self.to_screen('[%s] %s: Downloading thumbnail ...' %
- (info_dict['extractor'], info_dict['id']))
- try:
- uf = self.urlopen(info_dict['thumbnail'])
- with open(thumb_filename, 'wb') as thumbf:
- shutil.copyfileobj(uf, thumbf)
- self.to_screen('[%s] %s: Writing thumbnail to: %s' %
- (info_dict['extractor'], info_dict['id'], thumb_filename))
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self.report_warning('Unable to download thumbnail "%s": %s' %
- (info_dict['thumbnail'], compat_str(err)))
+ self._write_thumbnails(info_dict, filename)
if not self.params.get('skip_download', False):
- if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(filename)):
- success = True
- else:
- try:
- def dl(name, info):
- fd = get_suitable_downloader(info)(self, self.params)
- for ph in self._progress_hooks:
- fd.add_progress_hook(ph)
- if self.params.get('verbose'):
- self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
- return fd.download(name, info)
- if info_dict.get('requested_formats') is not None:
- downloaded = []
- success = True
- merger = FFmpegMergerPP(self, not self.params.get('keepvideo'))
- if not merger._get_executable():
- postprocessors = []
- self.report_warning('You have requested multiple '
- 'formats but ffmpeg or avconv are not installed.'
- ' The formats won\'t be merged')
- else:
- postprocessors = [merger]
- for f in info_dict['requested_formats']:
- new_info = dict(info_dict)
- new_info.update(f)
- fname = self.prepare_filename(new_info)
- fname = prepend_extension(fname, 'f%s' % f['format_id'])
- downloaded.append(fname)
- partial_success = dl(fname, new_info)
- success = success and partial_success
- info_dict['__postprocessors'] = postprocessors
- info_dict['__files_to_merge'] = downloaded
+ try:
+ def dl(name, info):
+ fd = get_suitable_downloader(info, self.params)(self, self.params)
+ for ph in self._progress_hooks:
+ fd.add_progress_hook(ph)
+ if self.params.get('verbose'):
+ self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
+ return fd.download(name, info)
+ if info_dict.get('requested_formats') is not None:
+ downloaded = []
+ success = True
+ merger = FFmpegMergerPP(self, not self.params.get('keepvideo'))
+ if not merger._executable:
+ postprocessors = []
+ self.report_warning('You have requested multiple '
+ 'formats but ffmpeg or avconv are not installed.'
+ ' The formats won\'t be merged')
else:
- # Just a single file
- success = dl(filename, info_dict)
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self.report_error('unable to download video data: %s' % str(err))
- return
- except (OSError, IOError) as err:
- raise UnavailableVideoError(err)
- except (ContentTooShortError, ) as err:
- self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
- return
+ postprocessors = [merger]
+ for f in info_dict['requested_formats']:
+ new_info = dict(info_dict)
+ new_info.update(f)
+ fname = self.prepare_filename(new_info)
+ fname = prepend_extension(fname, 'f%s' % f['format_id'])
+ downloaded.append(fname)
+ partial_success = dl(fname, new_info)
+ success = success and partial_success
+ info_dict['__postprocessors'] = postprocessors
+ info_dict['__files_to_merge'] = downloaded
+ else:
+ # Just a single file
+ success = dl(filename, info_dict)
+ except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+ self.report_error('unable to download video data: %s' % str(err))
+ return
+ except (OSError, IOError) as err:
+ raise UnavailableVideoError(err)
+ except (ContentTooShortError, ) as err:
+ self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
+ return
if success:
+ # Fixup content
+ fixup_policy = self.params.get('fixup')
+ if fixup_policy is None:
+ fixup_policy = 'detect_or_warn'
+
+ stretched_ratio = info_dict.get('stretched_ratio')
+ if stretched_ratio is not None and stretched_ratio != 1:
+ if fixup_policy == 'warn':
+ self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
+ info_dict['id'], stretched_ratio))
+ elif fixup_policy == 'detect_or_warn':
+ stretched_pp = FFmpegFixupStretchedPP(self)
+ if stretched_pp.available:
+ info_dict.setdefault('__postprocessors', [])
+ info_dict['__postprocessors'].append(stretched_pp)
+ else:
+ self.report_warning(
+ '%s: Non-uniform pixel ratio (%s). Install ffmpeg or avconv to fix this automatically.' % (
+ info_dict['id'], stretched_ratio))
+ else:
+ assert fixup_policy in ('ignore', 'never')
+
+ if info_dict.get('requested_formats') is None and info_dict.get('container') == 'm4a_dash':
+ if fixup_policy == 'warn':
+ self.report_warning('%s: writing DASH m4a. Only some players support this container.' % (
+ info_dict['id']))
+ elif fixup_policy == 'detect_or_warn':
+ fixup_pp = FFmpegFixupM4aPP(self)
+ if fixup_pp.available:
+ info_dict.setdefault('__postprocessors', [])
+ info_dict['__postprocessors'].append(fixup_pp)
+ else:
+ self.report_warning(
+ '%s: writing DASH m4a. Only some players support this container. Install ffmpeg or avconv to fix this automatically.' % (
+ info_dict['id']))
+ else:
+ assert fixup_policy in ('ignore', 'never')
+
try:
self.post_process(filename, info_dict)
except (PostProcessingError) as err:
self.report_error('postprocessing: %s' % str(err))
return
-
- self.record_download_archive(info_dict)
+ self.record_download_archive(info_dict)
def download(self, url_list):
"""Download a given list of URLs."""
@@ -1063,13 +1311,16 @@ class YoutubeDL(object):
for url in url_list:
try:
- #It also downloads the videos
- self.extract_info(url)
+ # It also downloads the videos
+ res = self.extract_info(url)
except UnavailableVideoError:
self.report_error('unable to download video')
except MaxDownloadsReached:
self.to_screen('[info] Maximum number of downloaded files reached.')
raise
+ else:
+ if self.params.get('dump_single_json', False):
+ self.to_stdout(json.dumps(res))
return self._download_retcode
@@ -1091,14 +1342,15 @@ class YoutubeDL(object):
"""Run all the postprocessors on the given file."""
info = dict(ie_info)
info['filepath'] = filename
- keep_video = None
pps_chain = []
if ie_info.get('__postprocessors') is not None:
pps_chain.extend(ie_info['__postprocessors'])
pps_chain.extend(self._pps)
for pp in pps_chain:
+ keep_video = None
+ old_filename = info['filepath']
try:
- keep_video_wish, new_info = pp.run(info)
+ keep_video_wish, info = pp.run(info)
if keep_video_wish is not None:
if keep_video_wish:
keep_video = keep_video_wish
@@ -1107,12 +1359,12 @@ class YoutubeDL(object):
keep_video = keep_video_wish
except PostProcessingError as e:
self.report_error(e.msg)
- if keep_video is False and not self.params.get('keepvideo', False):
- try:
- self.to_screen('Deleting original file %s (pass -k to keep)' % filename)
- os.remove(encodeFilename(filename))
- except (IOError, OSError):
- self.report_warning('Unable to remove downloaded video file')
+ if keep_video is False and not self.params.get('keepvideo', False):
+ try:
+ self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
+ os.remove(encodeFilename(old_filename))
+ except (IOError, OSError):
+ self.report_warning('Unable to remove downloaded video file')
def _make_archive_id(self, info_dict):
# Future-proof against any change in case
@@ -1193,6 +1445,8 @@ class YoutubeDL(object):
res += 'video@'
if fdict.get('vbr') is not None:
res += '%4dk' % fdict['vbr']
+ if fdict.get('fps') is not None:
+ res += ', %sfps' % fdict['fps']
if fdict.get('acodec') is not None:
if res:
res += ', '
@@ -1230,7 +1484,9 @@ class YoutubeDL(object):
formats = info_dict.get('formats', [info_dict])
idlen = max(len('format code'),
max(len(f['format_id']) for f in formats))
- formats_s = [line(f, idlen) for f in formats]
+ formats_s = [
+ line(f, idlen) for f in formats
+ if f.get('preference') is None or f['preference'] >= -1000]
if len(formats) > 1:
formats_s[0] += (' ' if self._format_note(formats[0]) else '') + '(worst)'
formats_s[-1] += (' ' if self._format_note(formats[-1]) else '') + '(best)'
@@ -1238,8 +1494,26 @@ class YoutubeDL(object):
header_line = line({
'format_id': 'format code', 'ext': 'extension',
'resolution': 'resolution', 'format_note': 'note'}, idlen=idlen)
- self.to_screen('[info] Available formats for %s:\n%s\n%s' %
- (info_dict['id'], header_line, '\n'.join(formats_s)))
+ self.to_screen(
+ '[info] Available formats for %s:\n%s\n%s' %
+ (info_dict['id'], header_line, '\n'.join(formats_s)))
+
+ def list_thumbnails(self, info_dict):
+ thumbnails = info_dict.get('thumbnails')
+ if not thumbnails:
+ tn_url = info_dict.get('thumbnail')
+ if tn_url:
+ thumbnails = [{'id': '0', 'url': tn_url}]
+ else:
+ self.to_screen(
+ '[info] No thumbnails present for %s' % info_dict['id'])
+ return
+
+ self.to_screen(
+ '[info] Thumbnails for %s:' % info_dict['id'])
+ self.to_screen(render_table(
+ ['ID', 'width', 'height', 'URL'],
+ [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
def urlopen(self, req):
""" Start an HTTP download """
@@ -1274,11 +1548,13 @@ class YoutubeDL(object):
self.report_warning(
'Your Python is broken! Update to a newer and supported version')
+ stdout_encoding = getattr(
+ sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
encoding_str = (
'[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
locale.getpreferredencoding(),
sys.getfilesystemencoding(),
- sys.stdout.encoding,
+ stdout_encoding,
self.get_encoding()))
write_string(encoding_str, encoding=None)
@@ -1297,8 +1573,19 @@ class YoutubeDL(object):
sys.exc_clear()
except:
pass
- self._write_string('[debug] Python version %s - %s' %
- (platform.python_version(), platform_name()) + '\n')
+ self._write_string('[debug] Python version %s - %s\n' % (
+ platform.python_version(), platform_name()))
+
+ exe_versions = FFmpegPostProcessor.get_versions()
+ exe_versions['rtmpdump'] = rtmpdump_version()
+ exe_str = ', '.join(
+ '%s %s' % (exe, v)
+ for exe, v in sorted(exe_versions.items())
+ if v
+ )
+ if not exe_str:
+ exe_str = 'none'
+ self._write_string('[debug] exe versions: %s\n' % exe_str)
proxy_map = {}
for handler in self._opener.handlers:
@@ -1306,6 +1593,17 @@ class YoutubeDL(object):
proxy_map.update(handler.proxies)
self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
+ if self.params.get('call_home', False):
+ ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
+ self._write_string('[debug] Public IP address: %s\n' % ipaddr)
+ latest_version = self.urlopen(
+ 'https://yt-dl.org/latest/version').read().decode('utf-8')
+ if version_tuple(latest_version) > version_tuple(__version__):
+ self.report_warning(
+ 'You are using an outdated version (newest version: %s)! '
+ 'See https://yt-dl.org/update if you need help updating.' %
+ latest_version)
+
def _setup_opener(self):
timeout_val = self.params.get('socket_timeout')
self._socket_timeout = 600 if timeout_val is None else float(timeout_val)
@@ -1336,9 +1634,8 @@ class YoutubeDL(object):
proxy_handler = compat_urllib_request.ProxyHandler(proxies)
debuglevel = 1 if self.params.get('debug_printtraffic') else 0
- https_handler = make_HTTPS_handler(
- self.params.get('nocheckcertificate', False), debuglevel=debuglevel)
- ydlh = YoutubeDLHandler(debuglevel=debuglevel)
+ https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
+ ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
opener = compat_urllib_request.build_opener(
https_handler, proxy_handler, cookie_processor, ydlh)
# Delete the default user-agent header, which would otherwise apply in
@@ -1362,3 +1659,39 @@ class YoutubeDL(object):
if encoding is None:
encoding = preferredencoding()
return encoding
+
+ def _write_thumbnails(self, info_dict, filename):
+ if self.params.get('writethumbnail', False):
+ thumbnails = info_dict.get('thumbnails')
+ if thumbnails:
+ thumbnails = [thumbnails[-1]]
+ elif self.params.get('write_all_thumbnails', False):
+ thumbnails = info_dict.get('thumbnails')
+ else:
+ return
+
+ if not thumbnails:
+ # No thumbnails present, so return immediately
+ return
+
+ for t in thumbnails:
+ thumb_ext = determine_ext(t['url'], 'jpg')
+ suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
+ thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
+ thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
+
+ if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
+ self.to_screen('[%s] %s: Thumbnail %sis already present' %
+ (info_dict['extractor'], info_dict['id'], thumb_display_id))
+ else:
+ self.to_screen('[%s] %s: Downloading thumbnail %s...' %
+ (info_dict['extractor'], info_dict['id'], thumb_display_id))
+ try:
+ uf = self.urlopen(t['url'])
+ with open(thumb_filename, 'wb') as thumbf:
+ shutil.copyfileobj(uf, thumbf)
+ self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
+ (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
+ except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+ self.report_warning('Unable to download thumbnail "%s": %s' %
+ (t['url'], compat_str(err)))
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py
index 7f2b4dfcc..04f668334 100644
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -1,85 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
-__authors__ = (
- 'Ricardo Garcia Gonzalez',
- 'Danny Colligan',
- 'Benjamin Johnson',
- 'Vasyl\' Vavrychuk',
- 'Witold Baryluk',
- 'Paweł Paprota',
- 'Gergely Imreh',
- 'Rogério Brito',
- 'Philipp Hagemeister',
- 'Sören Schulze',
- 'Kevin Ngo',
- 'Ori Avtalion',
- 'shizeeg',
- 'Filippo Valsorda',
- 'Christian Albrecht',
- 'Dave Vasilevsky',
- 'Jaime Marquínez Ferrándiz',
- 'Jeff Crouse',
- 'Osama Khalid',
- 'Michael Walter',
- 'M. Yasoob Ullah Khalid',
- 'Julien Fraichard',
- 'Johny Mo Swag',
- 'Axel Noack',
- 'Albert Kim',
- 'Pierre Rudloff',
- 'Huarong Huo',
- 'Ismael Mejía',
- 'Steffan \'Ruirize\' James',
- 'Andras Elso',
- 'Jelle van der Waa',
- 'Marcin Cieślak',
- 'Anton Larionov',
- 'Takuya Tsuchida',
- 'Sergey M.',
- 'Michael Orlitzky',
- 'Chris Gahan',
- 'Saimadhav Heblikar',
- 'Mike Col',
- 'Oleg Prutz',
- 'pulpe',
- 'Andreas Schmitz',
- 'Michael Kaiser',
- 'Niklas Laxström',
- 'David Triendl',
- 'Anthony Weems',
- 'David Wagner',
- 'Juan C. Olivares',
- 'Mattias Harrysson',
- 'phaer',
- 'Sainyam Kapoor',
- 'Nicolas Évrard',
- 'Jason Normore',
- 'Hoje Lee',
- 'Adam Thalhammer',
- 'Georg Jähnig',
- 'Ralf Haring',
- 'Koki Takahashi',
- 'Ariset Llerena',
- 'Adam Malcontenti-Wilson',
- 'Tobias Bell',
- 'Naglis Jonaitis',
- 'Charles Chen',
- 'Hassaan Ali',
- 'Dobrosław Żybort',
- 'David Fabijan',
- 'Sebastian Haas',
- 'Alexander Kirk',
- 'Erik Johnson',
- 'Keith Beckman',
- 'Ole Ernst',
- 'Aaron McDaniel (mcd1992)',
- 'Magnus Kolstad',
- 'Hari Padmanaban',
- 'Carlos Ramos',
- '5moufl',
- 'lenaten',
-)
+from __future__ import unicode_literals
__license__ = 'Public Domain'
@@ -93,9 +15,13 @@ import sys
from .options import (
parseOpts,
)
-from .utils import (
+from .compat import (
+ compat_expanduser,
compat_getpass,
compat_print,
+ workaround_optparse_bug9161,
+)
+from .utils import (
DateRange,
DEFAULT_OUTTMPL,
decodeOption,
@@ -112,18 +38,8 @@ from .update import update_self
from .downloader import (
FileDownloader,
)
-from .extractor import gen_extractors
+from .extractor import gen_extractors, list_extractors
from .YoutubeDL import YoutubeDL
-from .postprocessor import (
- AtomicParsleyPP,
- FFmpegAudioFixPP,
- FFmpegMetadataPP,
- FFmpegVideoConvertor,
- FFmpegExtractAudioPP,
- FFmpegEmbedSubtitlePP,
- XAttrMetadataPP,
- ExecAfterDownloadPP,
-)
def _real_main(argv=None):
@@ -132,7 +48,9 @@ def _real_main(argv=None):
# https://github.com/rg3/youtube-dl/issues/820
codecs.register(lambda name: codecs.lookup('utf-8') if name == 'cp65001' else None)
- setproctitle(u'youtube-dl')
+ workaround_optparse_bug9161()
+
+ setproctitle('youtube-dl')
parser, opts, args = parseOpts(argv)
@@ -148,10 +66,10 @@ def _real_main(argv=None):
if opts.headers is not None:
for h in opts.headers:
if h.find(':', 1) < 0:
- parser.error(u'wrong header formatting, it should be key:value, not "%s"'%h)
+ parser.error('wrong header formatting, it should be key:value, not "%s"' % h)
key, value = h.split(':', 2)
if opts.verbose:
- write_string(u'[debug] Adding header from command line option %s:%s\n'%(key, value))
+ write_string('[debug] Adding header from command line option %s:%s\n' % (key, value))
std_headers[key] = value
# Dump user agent
@@ -169,94 +87,90 @@ def _real_main(argv=None):
batchfd = io.open(opts.batchfile, 'r', encoding='utf-8', errors='ignore')
batch_urls = read_batch_urls(batchfd)
if opts.verbose:
- write_string(u'[debug] Batch file urls: ' + repr(batch_urls) + u'\n')
+ write_string('[debug] Batch file urls: ' + repr(batch_urls) + '\n')
except IOError:
- sys.exit(u'ERROR: batch file could not be read')
+ sys.exit('ERROR: batch file could not be read')
all_urls = batch_urls + args
all_urls = [url.strip() for url in all_urls]
_enc = preferredencoding()
all_urls = [url.decode(_enc, 'ignore') if isinstance(url, bytes) else url for url in all_urls]
- extractors = gen_extractors()
-
if opts.list_extractors:
- for ie in sorted(extractors, key=lambda ie: ie.IE_NAME.lower()):
+ for ie in list_extractors(opts.age_limit):
compat_print(ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie._WORKING else ''))
matchedUrls = [url for url in all_urls if ie.suitable(url)]
for mu in matchedUrls:
- compat_print(u' ' + mu)
+ compat_print(' ' + mu)
sys.exit(0)
if opts.list_extractor_descriptions:
- for ie in sorted(extractors, key=lambda ie: ie.IE_NAME.lower()):
+ for ie in list_extractors(opts.age_limit):
if not ie._WORKING:
continue
desc = getattr(ie, 'IE_DESC', ie.IE_NAME)
if desc is False:
continue
if hasattr(ie, 'SEARCH_KEY'):
- _SEARCHES = (u'cute kittens', u'slithering pythons', u'falling cat', u'angry poodle', u'purple fish', u'running tortoise', u'sleeping bunny')
- _COUNTS = (u'', u'5', u'10', u'all')
- desc += u' (Example: "%s%s:%s" )' % (ie.SEARCH_KEY, random.choice(_COUNTS), random.choice(_SEARCHES))
+ _SEARCHES = ('cute kittens', 'slithering pythons', 'falling cat', 'angry poodle', 'purple fish', 'running tortoise', 'sleeping bunny', 'burping cow')
+ _COUNTS = ('', '5', '10', 'all')
+ desc += ' (Example: "%s%s:%s" )' % (ie.SEARCH_KEY, random.choice(_COUNTS), random.choice(_SEARCHES))
compat_print(desc)
sys.exit(0)
-
# Conflicting, missing and erroneous options
if opts.usenetrc and (opts.username is not None or opts.password is not None):
- parser.error(u'using .netrc conflicts with giving username/password')
+ parser.error('using .netrc conflicts with giving username/password')
if opts.password is not None and opts.username is None:
- parser.error(u'account username missing\n')
+ parser.error('account username missing\n')
if opts.outtmpl is not None and (opts.usetitle or opts.autonumber or opts.useid):
- parser.error(u'using output template conflicts with using title, video ID or auto number')
+ parser.error('using output template conflicts with using title, video ID or auto number')
if opts.usetitle and opts.useid:
- parser.error(u'using title conflicts with using video ID')
+ parser.error('using title conflicts with using video ID')
if opts.username is not None and opts.password is None:
- opts.password = compat_getpass(u'Type account password and press [Return]: ')
+ opts.password = compat_getpass('Type account password and press [Return]: ')
if opts.ratelimit is not None:
numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
if numeric_limit is None:
- parser.error(u'invalid rate limit specified')
+ parser.error('invalid rate limit specified')
opts.ratelimit = numeric_limit
if opts.min_filesize is not None:
numeric_limit = FileDownloader.parse_bytes(opts.min_filesize)
if numeric_limit is None:
- parser.error(u'invalid min_filesize specified')
+ parser.error('invalid min_filesize specified')
opts.min_filesize = numeric_limit
if opts.max_filesize is not None:
numeric_limit = FileDownloader.parse_bytes(opts.max_filesize)
if numeric_limit is None:
- parser.error(u'invalid max_filesize specified')
+ parser.error('invalid max_filesize specified')
opts.max_filesize = numeric_limit
if opts.retries is not None:
try:
opts.retries = int(opts.retries)
except (TypeError, ValueError):
- parser.error(u'invalid retry count specified')
+ parser.error('invalid retry count specified')
if opts.buffersize is not None:
numeric_buffersize = FileDownloader.parse_bytes(opts.buffersize)
if numeric_buffersize is None:
- parser.error(u'invalid buffer size specified')
+ parser.error('invalid buffer size specified')
opts.buffersize = numeric_buffersize
if opts.playliststart <= 0:
- raise ValueError(u'Playlist start must be positive')
+ raise ValueError('Playlist start must be positive')
if opts.playlistend not in (-1, None) and opts.playlistend < opts.playliststart:
- raise ValueError(u'Playlist end must be greater than playlist start')
+ raise ValueError('Playlist end must be greater than playlist start')
if opts.extractaudio:
if opts.audioformat not in ['best', 'aac', 'mp3', 'm4a', 'opus', 'vorbis', 'wav']:
- parser.error(u'invalid audio format specified')
+ parser.error('invalid audio format specified')
if opts.audioquality:
opts.audioquality = opts.audioquality.strip('k').strip('K')
if not opts.audioquality.isdigit():
- parser.error(u'invalid audio quality specified')
+ parser.error('invalid audio quality specified')
if opts.recodevideo is not None:
if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv']:
- parser.error(u'invalid video recode format specified')
+ parser.error('invalid video recode format specified')
+
if opts.date is not None:
date = DateRange.day(opts.date)
else:
date = DateRange(opts.dateafter, opts.datebefore)
- if opts.default_search not in ('auto', 'auto_warning', 'error', 'fixup_error', None) and ':' not in opts.default_search:
- parser.error(u'--default-search invalid; did you forget a colon (:) at the end?')
# Do not download videos when there are audio-only formats
if opts.extractaudio and not opts.keepvideo and opts.format is None:
@@ -264,28 +178,66 @@ def _real_main(argv=None):
# --all-sub automatically sets --write-sub if --write-auto-sub is not given
# this was the old behaviour if only --all-sub was given.
- if opts.allsubtitles and (opts.writeautomaticsub == False):
+ if opts.allsubtitles and not opts.writeautomaticsub:
opts.writesubtitles = True
if sys.version_info < (3,):
# In Python 2, sys.argv is a bytestring (also note http://bugs.python.org/issue2128 for Windows systems)
if opts.outtmpl is not None:
opts.outtmpl = opts.outtmpl.decode(preferredencoding())
- outtmpl =((opts.outtmpl is not None and opts.outtmpl)
- or (opts.format == '-1' and opts.usetitle and u'%(title)s-%(id)s-%(format)s.%(ext)s')
- or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
- or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
- or (opts.usetitle and u'%(title)s-%(id)s.%(ext)s')
- or (opts.useid and u'%(id)s.%(ext)s')
- or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
- or DEFAULT_OUTTMPL)
+ outtmpl = ((opts.outtmpl is not None and opts.outtmpl)
+ or (opts.format == '-1' and opts.usetitle and '%(title)s-%(id)s-%(format)s.%(ext)s')
+ or (opts.format == '-1' and '%(id)s-%(format)s.%(ext)s')
+ or (opts.usetitle and opts.autonumber and '%(autonumber)s-%(title)s-%(id)s.%(ext)s')
+ or (opts.usetitle and '%(title)s-%(id)s.%(ext)s')
+ or (opts.useid and '%(id)s.%(ext)s')
+ or (opts.autonumber and '%(autonumber)s-%(id)s.%(ext)s')
+ or DEFAULT_OUTTMPL)
if not os.path.splitext(outtmpl)[1] and opts.extractaudio:
- parser.error(u'Cannot download a video and extract audio into the same'
- u' file! Use "{0}.%(ext)s" instead of "{0}" as the output'
- u' template'.format(outtmpl))
-
- any_printing = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson
- download_archive_fn = os.path.expanduser(opts.download_archive) if opts.download_archive is not None else opts.download_archive
+ parser.error('Cannot download a video and extract audio into the same'
+ ' file! Use "{0}.%(ext)s" instead of "{0}" as the output'
+ ' template'.format(outtmpl))
+
+ any_getting = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json
+ any_printing = opts.print_json
+ download_archive_fn = compat_expanduser(opts.download_archive) if opts.download_archive is not None else opts.download_archive
+
+ # PostProcessors
+ postprocessors = []
+ # Add the metadata pp first, the other pps will copy it
+ if opts.addmetadata:
+ postprocessors.append({'key': 'FFmpegMetadata'})
+ if opts.extractaudio:
+ postprocessors.append({
+ 'key': 'FFmpegExtractAudio',
+ 'preferredcodec': opts.audioformat,
+ 'preferredquality': opts.audioquality,
+ 'nopostoverwrites': opts.nopostoverwrites,
+ })
+ if opts.recodevideo:
+ postprocessors.append({
+ 'key': 'FFmpegVideoConvertor',
+ 'preferedformat': opts.recodevideo,
+ })
+ if opts.embedsubtitles:
+ postprocessors.append({
+ 'key': 'FFmpegEmbedSubtitle',
+ 'subtitlesformat': opts.subtitlesformat,
+ })
+ if opts.xattrs:
+ postprocessors.append({'key': 'XAttrMetadata'})
+ if opts.embedthumbnail:
+ if not opts.addmetadata:
+ postprocessors.append({'key': 'FFmpegAudioFix'})
+ postprocessors.append({'key': 'AtomicParsley'})
+ # Please keep ExecAfterDownload towards the bottom as it allows the user to modify the final file in any way.
+ # So if the user is able to remove the file before your postprocessor runs it might cause a few problems.
+ if opts.exec_cmd:
+ postprocessors.append({
+ 'key': 'ExecAfterDownload',
+ 'verboseOutput': opts.verbose,
+ 'exec_cmd': opts.exec_cmd,
+ })
ydl_opts = {
'usenetrc': opts.usenetrc,
@@ -293,7 +245,7 @@ def _real_main(argv=None):
'password': opts.password,
'twofactor': opts.twofactor,
'videopassword': opts.videopassword,
- 'quiet': (opts.quiet or any_printing),
+ 'quiet': (opts.quiet or any_getting or any_printing),
'no_warnings': opts.no_warnings,
'forceurl': opts.geturl,
'forcetitle': opts.gettitle,
@@ -303,9 +255,10 @@ def _real_main(argv=None):
'forceduration': opts.getduration,
'forcefilename': opts.getfilename,
'forceformat': opts.getformat,
- 'forcejson': opts.dumpjson,
- 'simulate': opts.simulate,
- 'skip_download': (opts.skip_download or opts.simulate or any_printing),
+ 'forcejson': opts.dumpjson or opts.print_json,
+ 'dump_single_json': opts.dump_single_json,
+ 'simulate': opts.simulate or any_getting,
+ 'skip_download': opts.skip_download,
'format': opts.format,
'format_limit': opts.format_limit,
'listformats': opts.listformats,
@@ -323,6 +276,7 @@ def _real_main(argv=None):
'progress_with_newline': opts.progress_with_newline,
'playliststart': opts.playliststart,
'playlistend': opts.playlistend,
+ 'playlistreverse': opts.playlist_reverse,
'noplaylist': opts.noplaylist,
'logtostderr': opts.outtmpl == '-',
'consoletitle': opts.consoletitle,
@@ -332,6 +286,7 @@ def _real_main(argv=None):
'writeannotations': opts.writeannotations,
'writeinfojson': opts.writeinfojson,
'writethumbnail': opts.writethumbnail,
+ 'write_all_thumbnails': opts.write_all_thumbnails,
'writesubtitles': opts.writesubtitles,
'writeautomaticsub': opts.writeautomaticsub,
'allsubtitles': opts.allsubtitles,
@@ -369,36 +324,18 @@ def _real_main(argv=None):
'youtube_include_dash_manifest': opts.youtube_include_dash_manifest,
'encoding': opts.encoding,
'exec_cmd': opts.exec_cmd,
+ 'extract_flat': opts.extract_flat,
+ 'merge_output_format': opts.merge_output_format,
+ 'postprocessors': postprocessors,
+ 'fixup': opts.fixup,
+ 'source_address': opts.source_address,
+ 'call_home': opts.call_home,
+ 'sleep_interval': opts.sleep_interval,
+ 'external_downloader': opts.external_downloader,
+ 'list_thumbnails': opts.list_thumbnails,
}
with YoutubeDL(ydl_opts) as ydl:
- ydl.print_debug_header()
- ydl.add_default_info_extractors()
-
- # PostProcessors
- # Add the metadata pp first, the other pps will copy it
- if opts.addmetadata:
- ydl.add_post_processor(FFmpegMetadataPP())
- if opts.extractaudio:
- ydl.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, nopostoverwrites=opts.nopostoverwrites))
- if opts.recodevideo:
- ydl.add_post_processor(FFmpegVideoConvertor(preferedformat=opts.recodevideo))
- if opts.embedsubtitles:
- ydl.add_post_processor(FFmpegEmbedSubtitlePP(subtitlesformat=opts.subtitlesformat))
- if opts.xattrs:
- ydl.add_post_processor(XAttrMetadataPP())
- if opts.embedthumbnail:
- if not opts.addmetadata:
- ydl.add_post_processor(FFmpegAudioFixPP())
- ydl.add_post_processor(AtomicParsleyPP())
-
-
- # Please keep ExecAfterDownload towards the bottom as it allows the user to modify the final file in any way.
- # So if the user is able to remove the file before your postprocessor runs it might cause a few problems.
- if opts.exec_cmd:
- ydl.add_post_processor(ExecAfterDownloadPP(
- verboseOutput=opts.verbose, exec_cmd=opts.exec_cmd))
-
# Update version
if opts.update_self:
update_self(ydl.to_screen, opts.verbose)
@@ -409,18 +346,19 @@ def _real_main(argv=None):
# Maybe do nothing
if (len(all_urls) < 1) and (opts.load_info_filename is None):
- if not (opts.update_self or opts.rm_cachedir):
- parser.error(u'you must provide at least one URL')
- else:
+ if opts.update_self or opts.rm_cachedir:
sys.exit()
+ ydl.warn_if_short_id(sys.argv[1:] if argv is None else argv)
+ parser.error('you must provide at least one URL')
+
try:
if opts.load_info_filename is not None:
retcode = ydl.download_with_info_file(opts.load_info_filename)
else:
retcode = ydl.download(all_urls)
except MaxDownloadsReached:
- ydl.to_screen(u'--max-download limit reached, aborting.')
+ ydl.to_screen('--max-download limit reached, aborting.')
retcode = 101
sys.exit(retcode)
@@ -432,6 +370,8 @@ def main(argv=None):
except DownloadError:
sys.exit(1)
except SameFileError:
- sys.exit(u'ERROR: fixed output name but more than one file to download')
+ sys.exit('ERROR: fixed output name but more than one file to download')
except KeyboardInterrupt:
- sys.exit(u'\nERROR: Interrupted by user')
+ sys.exit('\nERROR: Interrupted by user')
+
+__all__ = ['main', 'YoutubeDL', 'gen_extractors', 'list_extractors']
diff --git a/youtube_dl/__main__.py b/youtube_dl/__main__.py
index 3fe29c91f..65a0f891c 100755
--- a/youtube_dl/__main__.py
+++ b/youtube_dl/__main__.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python
+from __future__ import unicode_literals
# Execute with
# $ python youtube_dl/__main__.py (2.6+)
diff --git a/youtube_dl/aes.py b/youtube_dl/aes.py
index e9c5e2152..5efd0f836 100644
--- a/youtube_dl/aes.py
+++ b/youtube_dl/aes.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
__all__ = ['aes_encrypt', 'key_expansion', 'aes_ctr_decrypt', 'aes_cbc_decrypt', 'aes_decrypt_text']
import base64
@@ -7,10 +9,11 @@ from .utils import bytes_to_intlist, intlist_to_bytes
BLOCK_SIZE_BYTES = 16
+
def aes_ctr_decrypt(data, key, counter):
"""
Decrypt with aes in counter mode
-
+
@param {int[]} data cipher
@param {int[]} key 16/24/32-Byte cipher key
@param {instance} counter Instance whose next_value function (@returns {int[]} 16-Byte block)
@@ -19,23 +22,24 @@ def aes_ctr_decrypt(data, key, counter):
"""
expanded_key = key_expansion(key)
block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES))
-
- decrypted_data=[]
+
+ decrypted_data = []
for i in range(block_count):
counter_block = counter.next_value()
- block = data[i*BLOCK_SIZE_BYTES : (i+1)*BLOCK_SIZE_BYTES]
- block += [0]*(BLOCK_SIZE_BYTES - len(block))
-
+ block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES]
+ block += [0] * (BLOCK_SIZE_BYTES - len(block))
+
cipher_counter_block = aes_encrypt(counter_block, expanded_key)
decrypted_data += xor(block, cipher_counter_block)
decrypted_data = decrypted_data[:len(data)]
-
+
return decrypted_data
+
def aes_cbc_decrypt(data, key, iv):
"""
Decrypt with aes in CBC mode
-
+
@param {int[]} data cipher
@param {int[]} key 16/24/32-Byte cipher key
@param {int[]} iv 16-Byte IV
@@ -43,94 +47,98 @@ def aes_cbc_decrypt(data, key, iv):
"""
expanded_key = key_expansion(key)
block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES))
-
- decrypted_data=[]
+
+ decrypted_data = []
previous_cipher_block = iv
for i in range(block_count):
- block = data[i*BLOCK_SIZE_BYTES : (i+1)*BLOCK_SIZE_BYTES]
- block += [0]*(BLOCK_SIZE_BYTES - len(block))
-
+ block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES]
+ block += [0] * (BLOCK_SIZE_BYTES - len(block))
+
decrypted_block = aes_decrypt(block, expanded_key)
decrypted_data += xor(decrypted_block, previous_cipher_block)
previous_cipher_block = block
decrypted_data = decrypted_data[:len(data)]
-
+
return decrypted_data
+
def key_expansion(data):
"""
Generate key schedule
-
+
@param {int[]} data 16/24/32-Byte cipher key
- @returns {int[]} 176/208/240-Byte expanded key
+ @returns {int[]} 176/208/240-Byte expanded key
"""
- data = data[:] # copy
+ data = data[:] # copy
rcon_iteration = 1
key_size_bytes = len(data)
expanded_key_size_bytes = (key_size_bytes // 4 + 7) * BLOCK_SIZE_BYTES
-
+
while len(data) < expanded_key_size_bytes:
temp = data[-4:]
temp = key_schedule_core(temp, rcon_iteration)
rcon_iteration += 1
- data += xor(temp, data[-key_size_bytes : 4-key_size_bytes])
-
+ data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes])
+
for _ in range(3):
temp = data[-4:]
- data += xor(temp, data[-key_size_bytes : 4-key_size_bytes])
-
+ data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes])
+
if key_size_bytes == 32:
temp = data[-4:]
temp = sub_bytes(temp)
- data += xor(temp, data[-key_size_bytes : 4-key_size_bytes])
-
- for _ in range(3 if key_size_bytes == 32 else 2 if key_size_bytes == 24 else 0):
+ data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes])
+
+ for _ in range(3 if key_size_bytes == 32 else 2 if key_size_bytes == 24 else 0):
temp = data[-4:]
- data += xor(temp, data[-key_size_bytes : 4-key_size_bytes])
+ data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes])
data = data[:expanded_key_size_bytes]
-
+
return data
+
def aes_encrypt(data, expanded_key):
"""
Encrypt one block with aes
-
+
@param {int[]} data 16-Byte state
- @param {int[]} expanded_key 176/208/240-Byte expanded key
+ @param {int[]} expanded_key 176/208/240-Byte expanded key
@returns {int[]} 16-Byte cipher
"""
rounds = len(expanded_key) // BLOCK_SIZE_BYTES - 1
data = xor(data, expanded_key[:BLOCK_SIZE_BYTES])
- for i in range(1, rounds+1):
+ for i in range(1, rounds + 1):
data = sub_bytes(data)
data = shift_rows(data)
if i != rounds:
data = mix_columns(data)
- data = xor(data, expanded_key[i*BLOCK_SIZE_BYTES : (i+1)*BLOCK_SIZE_BYTES])
+ data = xor(data, expanded_key[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES])
return data
+
def aes_decrypt(data, expanded_key):
"""
Decrypt one block with aes
-
+
@param {int[]} data 16-Byte cipher
@param {int[]} expanded_key 176/208/240-Byte expanded key
@returns {int[]} 16-Byte state
"""
rounds = len(expanded_key) // BLOCK_SIZE_BYTES - 1
-
+
for i in range(rounds, 0, -1):
- data = xor(data, expanded_key[i*BLOCK_SIZE_BYTES : (i+1)*BLOCK_SIZE_BYTES])
+ data = xor(data, expanded_key[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES])
if i != rounds:
data = mix_columns_inv(data)
data = shift_rows_inv(data)
data = sub_bytes_inv(data)
data = xor(data, expanded_key[:BLOCK_SIZE_BYTES])
-
+
return data
+
def aes_decrypt_text(data, password, key_size_bytes):
"""
Decrypt text
@@ -138,33 +146,34 @@ def aes_decrypt_text(data, password, key_size_bytes):
- The cipher key is retrieved by encrypting the first 16 Byte of 'password'
with the first 'key_size_bytes' Bytes from 'password' (if necessary filled with 0's)
- Mode of operation is 'counter'
-
+
@param {str} data Base64 encoded string
@param {str,unicode} password Password (will be encoded with utf-8)
@param {int} key_size_bytes Possible values: 16 for 128-Bit, 24 for 192-Bit or 32 for 256-Bit
@returns {str} Decrypted data
"""
NONCE_LENGTH_BYTES = 8
-
+
data = bytes_to_intlist(base64.b64decode(data))
password = bytes_to_intlist(password.encode('utf-8'))
-
- key = password[:key_size_bytes] + [0]*(key_size_bytes - len(password))
+
+ key = password[:key_size_bytes] + [0] * (key_size_bytes - len(password))
key = aes_encrypt(key[:BLOCK_SIZE_BYTES], key_expansion(key)) * (key_size_bytes // BLOCK_SIZE_BYTES)
-
+
nonce = data[:NONCE_LENGTH_BYTES]
cipher = data[NONCE_LENGTH_BYTES:]
-
+
class Counter:
- __value = nonce + [0]*(BLOCK_SIZE_BYTES - NONCE_LENGTH_BYTES)
+ __value = nonce + [0] * (BLOCK_SIZE_BYTES - NONCE_LENGTH_BYTES)
+
def next_value(self):
temp = self.__value
self.__value = inc(self.__value)
return temp
-
+
decrypted_data = aes_ctr_decrypt(cipher, key, Counter())
plaintext = intlist_to_bytes(decrypted_data)
-
+
return plaintext
RCON = (0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36)
@@ -200,14 +209,14 @@ SBOX_INV = (0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x
0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d)
-MIX_COLUMN_MATRIX = ((0x2,0x3,0x1,0x1),
- (0x1,0x2,0x3,0x1),
- (0x1,0x1,0x2,0x3),
- (0x3,0x1,0x1,0x2))
-MIX_COLUMN_MATRIX_INV = ((0xE,0xB,0xD,0x9),
- (0x9,0xE,0xB,0xD),
- (0xD,0x9,0xE,0xB),
- (0xB,0xD,0x9,0xE))
+MIX_COLUMN_MATRIX = ((0x2, 0x3, 0x1, 0x1),
+ (0x1, 0x2, 0x3, 0x1),
+ (0x1, 0x1, 0x2, 0x3),
+ (0x3, 0x1, 0x1, 0x2))
+MIX_COLUMN_MATRIX_INV = ((0xE, 0xB, 0xD, 0x9),
+ (0x9, 0xE, 0xB, 0xD),
+ (0xD, 0x9, 0xE, 0xB),
+ (0xB, 0xD, 0x9, 0xE))
RIJNDAEL_EXP_TABLE = (0x01, 0x03, 0x05, 0x0F, 0x11, 0x33, 0x55, 0xFF, 0x1A, 0x2E, 0x72, 0x96, 0xA1, 0xF8, 0x13, 0x35,
0x5F, 0xE1, 0x38, 0x48, 0xD8, 0x73, 0x95, 0xA4, 0xF7, 0x02, 0x06, 0x0A, 0x1E, 0x22, 0x66, 0xAA,
0xE5, 0x34, 0x5C, 0xE4, 0x37, 0x59, 0xEB, 0x26, 0x6A, 0xBE, 0xD9, 0x70, 0x90, 0xAB, 0xE6, 0x31,
@@ -241,30 +250,37 @@ RIJNDAEL_LOG_TABLE = (0x00, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, 0x4b, 0xc7
0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, 0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5,
0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, 0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07)
+
def sub_bytes(data):
return [SBOX[x] for x in data]
+
def sub_bytes_inv(data):
return [SBOX_INV[x] for x in data]
+
def rotate(data):
return data[1:] + [data[0]]
+
def key_schedule_core(data, rcon_iteration):
data = rotate(data)
data = sub_bytes(data)
data[0] = data[0] ^ RCON[rcon_iteration]
-
+
return data
+
def xor(data1, data2):
- return [x^y for x, y in zip(data1, data2)]
+ return [x ^ y for x, y in zip(data1, data2)]
+
def rijndael_mul(a, b):
- if(a==0 or b==0):
+ if(a == 0 or b == 0):
return 0
return RIJNDAEL_EXP_TABLE[(RIJNDAEL_LOG_TABLE[a] + RIJNDAEL_LOG_TABLE[b]) % 0xFF]
+
def mix_column(data, matrix):
data_mixed = []
for row in range(4):
@@ -275,33 +291,38 @@ def mix_column(data, matrix):
data_mixed.append(mixed)
return data_mixed
+
def mix_columns(data, matrix=MIX_COLUMN_MATRIX):
data_mixed = []
for i in range(4):
- column = data[i*4 : (i+1)*4]
+ column = data[i * 4: (i + 1) * 4]
data_mixed += mix_column(column, matrix)
return data_mixed
+
def mix_columns_inv(data):
return mix_columns(data, MIX_COLUMN_MATRIX_INV)
+
def shift_rows(data):
data_shifted = []
for column in range(4):
for row in range(4):
- data_shifted.append( data[((column + row) & 0b11) * 4 + row] )
+ data_shifted.append(data[((column + row) & 0b11) * 4 + row])
return data_shifted
+
def shift_rows_inv(data):
data_shifted = []
for column in range(4):
for row in range(4):
- data_shifted.append( data[((column - row) & 0b11) * 4 + row] )
+ data_shifted.append(data[((column - row) & 0b11) * 4 + row])
return data_shifted
+
def inc(data):
- data = data[:] # copy
- for i in range(len(data)-1,-1,-1):
+ data = data[:] # copy
+ for i in range(len(data) - 1, -1, -1):
if data[i] == 255:
data[i] = 0
else:
diff --git a/youtube_dl/cache.py b/youtube_dl/cache.py
index 79ff09f78..5fe839eb1 100644
--- a/youtube_dl/cache.py
+++ b/youtube_dl/cache.py
@@ -8,9 +8,8 @@ import re
import shutil
import traceback
-from .utils import (
- write_json_file,
-)
+from .compat import compat_expanduser, compat_getenv
+from .utils import write_json_file
class Cache(object):
@@ -20,9 +19,9 @@ class Cache(object):
def _get_root_dir(self):
res = self._ydl.params.get('cachedir')
if res is None:
- cache_root = os.environ.get('XDG_CACHE_HOME', '~/.cache')
+ cache_root = compat_getenv('XDG_CACHE_HOME', '~/.cache')
res = os.path.join(cache_root, 'youtube-dl')
- return os.path.expanduser(res)
+ return compat_expanduser(res)
def _get_cache_fn(self, section, key, dtype):
assert re.match(r'^[a-zA-Z0-9_.-]+$', section), \
diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py
new file mode 100644
index 000000000..4453b34fc
--- /dev/null
+++ b/youtube_dl/compat.py
@@ -0,0 +1,386 @@
+from __future__ import unicode_literals
+
+import getpass
+import optparse
+import os
+import re
+import socket
+import subprocess
+import sys
+
+
+try:
+ import urllib.request as compat_urllib_request
+except ImportError: # Python 2
+ import urllib2 as compat_urllib_request
+
+try:
+ import urllib.error as compat_urllib_error
+except ImportError: # Python 2
+ import urllib2 as compat_urllib_error
+
+try:
+ import urllib.parse as compat_urllib_parse
+except ImportError: # Python 2
+ import urllib as compat_urllib_parse
+
+try:
+ from urllib.parse import urlparse as compat_urllib_parse_urlparse
+except ImportError: # Python 2
+ from urlparse import urlparse as compat_urllib_parse_urlparse
+
+try:
+ import urllib.parse as compat_urlparse
+except ImportError: # Python 2
+ import urlparse as compat_urlparse
+
+try:
+ import http.cookiejar as compat_cookiejar
+except ImportError: # Python 2
+ import cookielib as compat_cookiejar
+
+try:
+ import html.entities as compat_html_entities
+except ImportError: # Python 2
+ import htmlentitydefs as compat_html_entities
+
+try:
+ import html.parser as compat_html_parser
+except ImportError: # Python 2
+ import HTMLParser as compat_html_parser
+
+try:
+ import http.client as compat_http_client
+except ImportError: # Python 2
+ import httplib as compat_http_client
+
+try:
+ from urllib.error import HTTPError as compat_HTTPError
+except ImportError: # Python 2
+ from urllib2 import HTTPError as compat_HTTPError
+
+try:
+ from urllib.request import urlretrieve as compat_urlretrieve
+except ImportError: # Python 2
+ from urllib import urlretrieve as compat_urlretrieve
+
+
try:
    from subprocess import DEVNULL
except ImportError:
    # Interpreters without subprocess.DEVNULL: hand out a writable handle on
    # the platform's null device instead. The caller owns the returned file.
    def compat_subprocess_get_DEVNULL():
        """Return a newly opened, writable handle on os.path.devnull."""
        return open(os.path.devnull, 'w')
else:
    def compat_subprocess_get_DEVNULL():
        """Return the subprocess.DEVNULL sentinel."""
        return DEVNULL
+
+try:
+ from urllib.parse import unquote as compat_urllib_parse_unquote
+except ImportError:
+ def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
+ if string == '':
+ return string
+ res = string.split('%')
+ if len(res) == 1:
+ return string
+ if encoding is None:
+ encoding = 'utf-8'
+ if errors is None:
+ errors = 'replace'
+ # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
+ pct_sequence = b''
+ string = res[0]
+ for item in res[1:]:
+ try:
+ if not item:
+ raise ValueError
+ pct_sequence += item[:2].decode('hex')
+ rest = item[2:]
+ if not rest:
+ # This segment was just a single percent-encoded character.
+ # May be part of a sequence of code units, so delay decoding.
+ # (Stored in pct_sequence).
+ continue
+ except ValueError:
+ rest = '%' + item
+ # Encountered non-percent-encoded characters. Flush the current
+ # pct_sequence.
+ string += pct_sequence.decode(encoding, errors) + rest
+ pct_sequence = b''
+ if pct_sequence:
+ # Flush the final pct_sequence
+ string += pct_sequence.decode(encoding, errors)
+ return string
+
+
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError:  # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Split a query string into a list of (name, value) text pairs."""

        def _decode(component):
            # '+' means a space in query strings; undo that before
            # percent-decoding, then coerce the result to unicode.
            unquoted = compat_urllib_parse_unquote(
                component.replace('+', ' '), encoding=encoding, errors=errors)
            return unicode(unquoted)

        collected = []
        for ampersand_part in qs.split('&'):
            for field in ampersand_part.split(';'):
                if not field and not strict_parsing:
                    continue
                parts = field.split('=', 1)
                if len(parts) != 2:
                    if strict_parsing:
                        raise ValueError("bad query field: %r" % (field,))
                    # Handle case of a control-name with no equal sign
                    if not keep_blank_values:
                        continue
                    parts.append('')
                if keep_blank_values or len(parts[1]):
                    collected.append((_decode(parts[0]), _decode(parts[1])))
        return collected

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to its list of values."""
        grouped = {}
        for name, value in _parse_qsl(qs, keep_blank_values, strict_parsing,
                                      encoding=encoding, errors=errors):
            grouped.setdefault(name, []).append(value)
        return grouped
+
# Text type and "code point -> character" function for the running
# interpreter. The Python 2 builtins are only looked up on Python 2.
compat_str = str if sys.version_info >= (3, 0) else unicode

compat_chr = chr if sys.version_info >= (3, 0) else unichr
+
+try:
+ from xml.etree.ElementTree import ParseError as compat_xml_parse_error
+except ImportError: # Python 2.6
+ from xml.parsers.expat import ExpatError as compat_xml_parse_error
+
+try:
+ from shlex import quote as shlex_quote
+except ImportError: # Python < 3.3
+ def shlex_quote(s):
+ if re.match(r'^[-_\w./]+$', s):
+ return s
+ else:
+ return "'" + s.replace("'", "'\"'\"'") + "'"
+
+
def compat_ord(c):
    """Return the integer value of ``c``.

    Plain ints (what indexing a bytes object yields on Python 3) are passed
    through untouched; anything else is handed to ord().
    """
    return c if type(c) is int else ord(c)
+
+
if sys.version_info >= (3, 0):
    compat_getenv = os.getenv
    compat_expanduser = os.path.expanduser
else:
    # Environment variables should be decoded with filesystem encoding.
    # Otherwise it will fail if any non-ASCII characters present (see #3854 #3217 #2918)

    def compat_getenv(key, default=None):
        """Return the environment variable ``key`` decoded with the
        filesystem encoding, or ``default`` when it is unset.

        Empty/None values are returned as-is (not decoded).
        """
        from .utils import get_filesystem_encoding
        env = os.getenv(key, default)
        if env:
            env = env.decode(get_filesystem_encoding())
        return env

    # HACK: The default implementations of os.path.expanduser from cpython do not decode
    # environment variables with filesystem encoding. We will work around this by
    # providing adjusted implementations.
    # The following are os.path.expanduser implementations from cpython 2.7.8 stdlib
    # for different platforms with correct environment variables decoding.

    if os.name == 'posix':
        def compat_expanduser(path):
            """Expand ~ and ~user constructions.  If user or $HOME is unknown,
            do nothing."""
            if not path.startswith('~'):
                return path
            i = path.find('/', 1)
            if i < 0:
                i = len(path)
            if i == 1:
                # Bare '~': current user's home.
                if 'HOME' not in os.environ:
                    import pwd
                    userhome = pwd.getpwuid(os.getuid()).pw_dir
                else:
                    userhome = compat_getenv('HOME')
            else:
                # '~user': look the user up in the password database.
                import pwd
                try:
                    pwent = pwd.getpwnam(path[1:i])
                except KeyError:
                    return path
                userhome = pwent.pw_dir
            userhome = userhome.rstrip('/')
            return (userhome + path[i:]) or '/'
    elif os.name == 'nt' or os.name == 'ce':
        def compat_expanduser(path):
            """Expand ~ and ~user constructs.

            If user or $HOME is unknown, do nothing."""
            if path[:1] != '~':
                return path
            i, n = 1, len(path)
            while i < n and path[i] not in '/\\':
                i = i + 1

            if 'HOME' in os.environ:
                userhome = compat_getenv('HOME')
            elif 'USERPROFILE' in os.environ:
                userhome = compat_getenv('USERPROFILE')
            elif 'HOMEPATH' not in os.environ:
                return path
            else:
                # compat_getenv() returns None for an unset variable instead
                # of raising KeyError, so fall back to '' explicitly;
                # os.path.join(None, ...) would fail otherwise.
                drive = compat_getenv('HOMEDRIVE') or ''
                userhome = os.path.join(drive, compat_getenv('HOMEPATH'))

            if i != 1:  # ~user
                userhome = os.path.join(os.path.dirname(userhome), path[1:i])

            return userhome + path[i:]
    else:
        compat_expanduser = os.path.expanduser
+
+
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the text ``s``; ``s`` must be a compat_str."""
        assert isinstance(s, compat_str)
        print(s)
else:
    def compat_print(s):
        """Print ``s`` encoded with the preferred encoding.

        Characters the encoding cannot represent are emitted as XML
        character references.
        """
        from .utils import preferredencoding
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
+
+
try:
    subprocess_check_output = subprocess.check_output
except AttributeError:
    def subprocess_check_output(*args, **kwargs):
        """Run a command and return what it wrote to stdout.

        Stand-in for subprocess.check_output on interpreters that lack it.
        Accepts the same arguments as subprocess.Popen, except that
        ``input`` is not supported.

        Raises subprocess.CalledProcessError (with an ``output`` attribute
        holding the captured stdout) when the command exits non-zero.
        """
        assert 'input' not in kwargs
        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
        output, _ = p.communicate()
        ret = p.poll()
        if ret:
            # Interpreters old enough to need this fallback have neither
            # Popen.args nor the ``output`` keyword of CalledProcessError,
            # so recover the command from the call arguments and attach the
            # output as a plain attribute.
            cmd = kwargs.get('args')
            if cmd is None:
                cmd = args[0]
            error = subprocess.CalledProcessError(ret, cmd)
            error.output = output
            raise error
        return output
+
# Default to the stdlib prompt; only Python 2 on Windows needs the wrapper.
compat_getpass = getpass.getpass

if sys.version_info < (3, 0) and sys.platform == 'win32':
    def compat_getpass(prompt, *args, **kwargs):
        """Prompt for a password, encoding a text prompt to bytes first."""
        if not isinstance(prompt, compat_str):
            return getpass.getpass(prompt, *args, **kwargs)
        from .utils import preferredencoding
        encoded_prompt = prompt.encode(preferredencoding())
        return getpass.getpass(encoded_prompt, *args, **kwargs)
+
# Old 2.6 and 2.7 releases require kwargs to be bytes
def _testfunc(x):
    pass


try:
    # Probe: does ** expansion accept a text (unicode_literals) key?
    _testfunc(**{'x': 0})
except TypeError:
    def compat_kwargs(kwargs):
        """Return a copy of ``kwargs`` with every key converted to bytes."""
        return dict((bytes(key), value) for key, value in kwargs.items())
else:
    def compat_kwargs(kwargs):
        """Return ``kwargs`` unchanged; text keys already work here."""
        return kwargs
+
+
if sys.version_info >= (2, 7):
    compat_socket_create_connection = socket.create_connection
else:
    def compat_socket_create_connection(address, timeout, source_address=None):
        """Connect to ``address`` (a ``(host, port)`` pair) and return the socket.

        Each result of getaddrinfo() is tried in turn. The first socket
        that connects is returned; if none does, the last socket.error is
        re-raised. ``source_address``, when given, is bound before
        connecting.
        """
        host, port = address
        last_error = None
        for candidate in socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM):
            family, kind, proto, _canonname, sockaddr = candidate
            sock = None
            try:
                sock = socket.socket(family, kind, proto)
                sock.settimeout(timeout)
                if source_address:
                    sock.bind(source_address)
                sock.connect(sockaddr)
            except socket.error as exc:
                last_error = exc
                # Do not leak the half-set-up socket of a failed attempt.
                if sock is not None:
                    sock.close()
            else:
                return sock
        if last_error is None:
            raise socket.error("getaddrinfo returns an empty list")
        raise last_error
+
+
# Fix https://github.com/rg3/youtube-dl/issues/4223
# See http://bugs.python.org/issue9161 for what is broken
def workaround_optparse_bug9161():
    """Patch optparse.OptionGroup.add_option if it rejects text option strings.

    A throw-away option group is probed with a text option name. If that
    raises TypeError, OptionGroup.add_option is replaced by a wrapper that
    ASCII-encodes every text argument before delegating to the original.
    Otherwise optparse is left untouched.
    """
    probe_group = optparse.OptionGroup(optparse.OptionParser(), 'foo')
    try:
        probe_group.add_option('-t')
    except TypeError:
        pass
    else:
        # This optparse handles text option strings; nothing to do.
        return

    original_add_option = optparse.OptionGroup.add_option

    def _to_ascii(value):
        if isinstance(value, compat_str):
            return value.encode('ascii', 'replace')
        return value

    def _compat_add_option(self, *args, **kwargs):
        bargs = [_to_ascii(arg) for arg in args]
        bkwargs = dict(
            (key, _to_ascii(value)) for key, value in kwargs.items())
        return original_add_option(self, *bargs, **bkwargs)

    optparse.OptionGroup.add_option = _compat_add_option
+
+
+__all__ = [
+ 'compat_HTTPError',
+ 'compat_chr',
+ 'compat_cookiejar',
+ 'compat_expanduser',
+ 'compat_getenv',
+ 'compat_getpass',
+ 'compat_html_entities',
+ 'compat_html_parser',
+ 'compat_http_client',
+ 'compat_kwargs',
+ 'compat_ord',
+ 'compat_parse_qs',
+ 'compat_print',
+ 'compat_socket_create_connection',
+ 'compat_str',
+ 'compat_subprocess_get_DEVNULL',
+ 'compat_urllib_error',
+ 'compat_urllib_parse',
+ 'compat_urllib_parse_unquote',
+ 'compat_urllib_parse_urlparse',
+ 'compat_urllib_request',
+ 'compat_urlparse',
+ 'compat_urlretrieve',
+ 'compat_xml_parse_error',
+ 'shlex_quote',
+ 'subprocess_check_output',
+ 'workaround_optparse_bug9161',
+]
diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py
index 3f941596e..eff1122c5 100644
--- a/youtube_dl/downloader/__init__.py
+++ b/youtube_dl/downloader/__init__.py
@@ -1,32 +1,43 @@
from __future__ import unicode_literals
from .common import FileDownloader
+from .external import get_external_downloader
+from .f4m import F4mFD
from .hls import HlsFD
from .hls import NativeHlsFD
from .http import HttpFD
from .mplayer import MplayerFD
from .rtmp import RtmpFD
-from .f4m import F4mFD
from ..utils import (
- determine_ext,
+ determine_protocol,
)
+PROTOCOL_MAP = {
+ 'rtmp': RtmpFD,
+ 'm3u8_native': NativeHlsFD,
+ 'm3u8': HlsFD,
+ 'mms': MplayerFD,
+ 'rtsp': MplayerFD,
+ 'f4m': F4mFD,
+}
-def get_suitable_downloader(info_dict):
+
+def get_suitable_downloader(info_dict, params={}):
"""Get the downloader class that can handle the info dict."""
- url = info_dict['url']
- protocol = info_dict.get('protocol')
-
- if url.startswith('rtmp'):
- return RtmpFD
- if protocol == 'm3u8_native':
- return NativeHlsFD
- if (protocol == 'm3u8') or (protocol is None and determine_ext(url) == 'm3u8'):
- return HlsFD
- if url.startswith('mms') or url.startswith('rtsp'):
- return MplayerFD
- if determine_ext(url) == 'f4m':
- return F4mFD
- else:
- return HttpFD
+ protocol = determine_protocol(info_dict)
+ info_dict['protocol'] = protocol
+
+ external_downloader = params.get('external_downloader')
+ if external_downloader is not None:
+ ed = get_external_downloader(external_downloader)
+ if ed.supports(info_dict):
+ return ed
+
+ return PROTOCOL_MAP.get(protocol, HttpFD)
+
+
+__all__ = [
+ 'get_suitable_downloader',
+ 'FileDownloader',
+]
diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py
index f85f0c94e..c35c42c1d 100644
--- a/youtube_dl/downloader/common.py
+++ b/youtube_dl/downloader/common.py
@@ -1,10 +1,12 @@
+from __future__ import unicode_literals
+
import os
import re
import sys
import time
+from ..compat import compat_str
from ..utils import (
- compat_str,
encodeFilename,
format_bytes,
timeconvert,
@@ -78,8 +80,10 @@ class FileDownloader(object):
def calc_eta(start, now, total, current):
if total is None:
return None
+ if now is None:
+ now = time.time()
dif = now - start
- if current == 0 or dif < 0.001: # One millisecond
+ if current == 0 or dif < 0.001: # One millisecond
return None
rate = float(current) / dif
return int((float(total) - float(current)) / rate)
@@ -93,7 +97,7 @@ class FileDownloader(object):
@staticmethod
def calc_speed(start, now, bytes):
dif = now - start
- if bytes == 0 or dif < 0.001: # One millisecond
+ if bytes == 0 or dif < 0.001: # One millisecond
return None
return float(bytes) / dif
@@ -106,7 +110,7 @@ class FileDownloader(object):
@staticmethod
def best_block_size(elapsed_time, bytes):
new_min = max(bytes / 2.0, 1.0)
- new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
+ new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
if elapsed_time < 0.001:
return int(new_max)
rate = bytes / elapsed_time
@@ -144,29 +148,30 @@ class FileDownloader(object):
def report_error(self, *args, **kargs):
self.ydl.report_error(*args, **kargs)
- def slow_down(self, start_time, byte_counter):
+ def slow_down(self, start_time, now, byte_counter):
"""Sleep if the download speed is over the rate limit."""
rate_limit = self.params.get('ratelimit', None)
if rate_limit is None or byte_counter == 0:
return
- now = time.time()
+ if now is None:
+ now = time.time()
elapsed = now - start_time
if elapsed <= 0.0:
return
speed = float(byte_counter) / elapsed
if speed > rate_limit:
- time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
+ time.sleep(max((byte_counter // rate_limit) - elapsed, 0))
def temp_name(self, filename):
"""Returns a temporary filename for the given filename."""
- if self.params.get('nopart', False) or filename == u'-' or \
+ if self.params.get('nopart', False) or filename == '-' or \
(os.path.exists(encodeFilename(filename)) and not os.path.isfile(encodeFilename(filename))):
return filename
- return filename + u'.part'
+ return filename + '.part'
def undo_temp_name(self, filename):
- if filename.endswith(u'.part'):
- return filename[:-len(u'.part')]
+ if filename.endswith('.part'):
+ return filename[:-len('.part')]
return filename
def try_rename(self, old_filename, new_filename):
@@ -175,7 +180,7 @@ class FileDownloader(object):
return
os.rename(encodeFilename(old_filename), encodeFilename(new_filename))
except (IOError, OSError) as err:
- self.report_error(u'unable to rename file: %s' % compat_str(err))
+ self.report_error('unable to rename file: %s' % compat_str(err))
def try_utime(self, filename, last_modified_hdr):
"""Try to set the last-modified time of the given file."""
@@ -200,10 +205,10 @@ class FileDownloader(object):
def report_destination(self, filename):
"""Report destination filename."""
- self.to_screen(u'[download] Destination: ' + filename)
+ self.to_screen('[download] Destination: ' + filename)
def _report_progress_status(self, msg, is_last_line=False):
- fullmsg = u'[download] ' + msg
+ fullmsg = '[download] ' + msg
if self.params.get('progress_with_newline', False):
self.to_screen(fullmsg)
else:
@@ -211,13 +216,13 @@ class FileDownloader(object):
prev_len = getattr(self, '_report_progress_prev_line_length',
0)
if prev_len > len(fullmsg):
- fullmsg += u' ' * (prev_len - len(fullmsg))
+ fullmsg += ' ' * (prev_len - len(fullmsg))
self._report_progress_prev_line_length = len(fullmsg)
- clear_line = u'\r'
+ clear_line = '\r'
else:
- clear_line = (u'\r\x1b[K' if sys.stderr.isatty() else u'\r')
+ clear_line = ('\r\x1b[K' if sys.stderr.isatty() else '\r')
self.to_screen(clear_line + fullmsg, skip_eol=not is_last_line)
- self.to_console_title(u'youtube-dl ' + msg)
+ self.to_console_title('youtube-dl ' + msg)
def report_progress(self, percent, data_len_str, speed, eta):
"""Report download progress."""
@@ -233,7 +238,7 @@ class FileDownloader(object):
percent_str = 'Unknown %'
speed_str = self.format_speed(speed)
- msg = (u'%s of %s at %s ETA %s' %
+ msg = ('%s of %s at %s ETA %s' %
(percent_str, data_len_str, speed_str, eta_str))
self._report_progress_status(msg)
@@ -243,44 +248,56 @@ class FileDownloader(object):
downloaded_str = format_bytes(downloaded_data_len)
speed_str = self.format_speed(speed)
elapsed_str = FileDownloader.format_seconds(elapsed)
- msg = u'%s at %s (%s)' % (downloaded_str, speed_str, elapsed_str)
+ msg = '%s at %s (%s)' % (downloaded_str, speed_str, elapsed_str)
self._report_progress_status(msg)
def report_finish(self, data_len_str, tot_time):
"""Report download finished."""
if self.params.get('noprogress', False):
- self.to_screen(u'[download] Download completed')
+ self.to_screen('[download] Download completed')
else:
self._report_progress_status(
- (u'100%% of %s in %s' %
+ ('100%% of %s in %s' %
(data_len_str, self.format_seconds(tot_time))),
is_last_line=True)
def report_resuming_byte(self, resume_len):
"""Report attempt to resume at given byte."""
- self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
+ self.to_screen('[download] Resuming download at byte %s' % resume_len)
def report_retry(self, count, retries):
"""Report retry in case of HTTP error 5xx"""
- self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
+ self.to_screen('[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
def report_file_already_downloaded(self, file_name):
"""Report file has already been fully downloaded."""
try:
- self.to_screen(u'[download] %s has already been downloaded' % file_name)
+ self.to_screen('[download] %s has already been downloaded' % file_name)
except UnicodeEncodeError:
- self.to_screen(u'[download] The file has already been downloaded')
+ self.to_screen('[download] The file has already been downloaded')
def report_unable_to_resume(self):
"""Report it was impossible to resume download."""
- self.to_screen(u'[download] Unable to resume')
+ self.to_screen('[download] Unable to resume')
def download(self, filename, info_dict):
"""Download to a filename using the info from info_dict
Return True on success and False otherwise
"""
+
+ nooverwrites_and_exists = (
+ self.params.get('nooverwrites', False)
+ and os.path.exists(encodeFilename(filename))
+ )
+
+ continuedl_and_exists = (
+ self.params.get('continuedl', False)
+ and os.path.isfile(encodeFilename(filename))
+ and not self.params.get('nopart', False)
+ )
+
# Check file already present
- if self.params.get('continuedl', False) and os.path.isfile(encodeFilename(filename)) and not self.params.get('nopart', False):
+ if filename != '-' and nooverwrites_and_exists or continuedl_and_exists:
self.report_file_already_downloaded(filename)
self._hook_progress({
'filename': filename,
@@ -289,30 +306,43 @@ class FileDownloader(object):
})
return True
+ sleep_interval = self.params.get('sleep_interval')
+ if sleep_interval:
+ self.to_screen('[download] Sleeping %s seconds...' % sleep_interval)
+ time.sleep(sleep_interval)
+
return self.real_download(filename, info_dict)
def real_download(self, filename, info_dict):
"""Real download process. Redefine in subclasses."""
- raise NotImplementedError(u'This method must be implemented by subclasses')
+ raise NotImplementedError('This method must be implemented by subclasses')
def _hook_progress(self, status):
for ph in self._progress_hooks:
ph(status)
def add_progress_hook(self, ph):
- """ ph gets called on download progress, with a dictionary with the entries
- * filename: The final filename
- * status: One of "downloading" and "finished"
+ # See YoutubeDL.py (search for progress_hooks) for a description of
+ # this interface
+ self._progress_hooks.append(ph)
- It can also have some of the following entries:
+ def _debug_cmd(self, args, subprocess_encoding, exe=None):
+ if not self.params.get('verbose', False):
+ return
- * downloaded_bytes: Bytes on disks
- * total_bytes: Total bytes, None if unknown
- * tmpfilename: The filename we're currently writing to
- * eta: The estimated time in seconds, None if unknown
- * speed: The download speed in bytes/second, None if unknown
+ if exe is None:
+ exe = os.path.basename(args[0])
- Hooks are guaranteed to be called at least once (with status "finished")
- if the download is successful.
- """
- self._progress_hooks.append(ph)
+ if subprocess_encoding:
+ str_args = [
+ a.decode(subprocess_encoding) if isinstance(a, bytes) else a
+ for a in args]
+ else:
+ str_args = args
+ try:
+ import pipes
+ shell_quote = lambda args: ' '.join(map(pipes.quote, str_args))
+ except ImportError:
+ shell_quote = repr
+ self.to_screen('[debug] %s command line: %s' % (
+ exe, shell_quote(str_args)))
diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py
new file mode 100644
index 000000000..af9fdba75
--- /dev/null
+++ b/youtube_dl/downloader/external.py
@@ -0,0 +1,117 @@
+from __future__ import unicode_literals
+
+import os.path
+import subprocess
+import sys
+
+from .common import FileDownloader
+from ..utils import (
+ encodeFilename,
+)
+
+
+class ExternalFD(FileDownloader):
+ def real_download(self, filename, info_dict):
+ self.report_destination(filename)
+ tmpfilename = self.temp_name(filename)
+
+ retval = self._call_downloader(tmpfilename, info_dict)
+ if retval == 0:
+ fsize = os.path.getsize(encodeFilename(tmpfilename))
+ self.to_screen('\r[%s] Downloaded %s bytes' % (self.get_basename(), fsize))
+ self.try_rename(tmpfilename, filename)
+ self._hook_progress({
+ 'downloaded_bytes': fsize,
+ 'total_bytes': fsize,
+ 'filename': filename,
+ 'status': 'finished',
+ })
+ return True
+ else:
+ self.to_stderr('\n')
+ self.report_error('%s exited with code %d' % (
+ self.get_basename(), retval))
+ return False
+
+ @classmethod
+ def get_basename(cls):
+ return cls.__name__[:-2].lower()
+
+ @property
+ def exe(self):
+ return self.params.get('external_downloader')
+
+ @classmethod
+ def supports(cls, info_dict):
+ return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps')
+
+ def _call_downloader(self, tmpfilename, info_dict):
+ """ Either override this or implement _make_cmd """
+ cmd = self._make_cmd(tmpfilename, info_dict)
+
+ if sys.platform == 'win32' and sys.version_info < (3, 0):
+ # Windows subprocess module does not actually support Unicode
+ # on Python 2.x
+ # See http://stackoverflow.com/a/9951851/35070
+ subprocess_encoding = sys.getfilesystemencoding()
+ cmd = [a.encode(subprocess_encoding, 'ignore') for a in cmd]
+ else:
+ subprocess_encoding = None
+ self._debug_cmd(cmd, subprocess_encoding)
+
+ p = subprocess.Popen(
+ cmd, stderr=subprocess.PIPE)
+ _, stderr = p.communicate()
+ if p.returncode != 0:
+ self.to_stderr(stderr)
+ return p.returncode
+
+
+class CurlFD(ExternalFD):
+ def _make_cmd(self, tmpfilename, info_dict):
+ cmd = [self.exe, '-o', tmpfilename]
+ for key, val in info_dict['http_headers'].items():
+ cmd += ['--header', '%s: %s' % (key, val)]
+ cmd += ['--', info_dict['url']]
+ return cmd
+
+
+class WgetFD(ExternalFD):
+ def _make_cmd(self, tmpfilename, info_dict):
+ cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies']
+ for key, val in info_dict['http_headers'].items():
+ cmd += ['--header', '%s: %s' % (key, val)]
+ cmd += ['--', info_dict['url']]
+ return cmd
+
+
+class Aria2cFD(ExternalFD):
+ def _make_cmd(self, tmpfilename, info_dict):
+ cmd = [
+ self.exe, '-c',
+ '--min-split-size', '1M', '--max-connection-per-server', '4']
+ dn = os.path.dirname(tmpfilename)
+ if dn:
+ cmd += ['--dir', dn]
+ cmd += ['--out', os.path.basename(tmpfilename)]
+ for key, val in info_dict['http_headers'].items():
+ cmd += ['--header', '%s: %s' % (key, val)]
+ cmd += ['--', info_dict['url']]
+ return cmd
+
+_BY_NAME = dict(
+ (klass.get_basename(), klass)
+ for name, klass in globals().items()
+ if name.endswith('FD') and name != 'ExternalFD'
+)
+
+
+def list_external_downloaders():
+ return sorted(_BY_NAME.keys())
+
+
+def get_external_downloader(external_downloader):
+ """ Given the name of the executable, return the matching downloader
+ class (the lookup raises KeyError if the downloader is unsupported). """
+ bn = os.path.basename(external_downloader)
+ return _BY_NAME[bn]
diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py
index b3be16ff1..c68b2c303 100644
--- a/youtube_dl/downloader/f4m.py
+++ b/youtube_dl/downloader/f4m.py
@@ -9,10 +9,12 @@ import xml.etree.ElementTree as etree
from .common import FileDownloader
from .http import HttpFD
+from ..compat import (
+ compat_urlparse,
+)
from ..utils import (
struct_pack,
struct_unpack,
- compat_urlparse,
format_bytes,
encodeFilename,
sanitize_open,
@@ -55,7 +57,7 @@ class FlvReader(io.BytesIO):
if size == 1:
real_size = self.read_unsigned_long_long()
header_end = 16
- return real_size, box_type, self.read(real_size-header_end)
+ return real_size, box_type, self.read(real_size - header_end)
def read_asrt(self):
# version
@@ -175,34 +177,43 @@ def build_fragments_list(boot_info):
""" Return a list of (segment, fragment) for each fragment in the video """
res = []
segment_run_table = boot_info['segments'][0]
- # I've only found videos with one segment
- segment_run_entry = segment_run_table['segment_run'][0]
- n_frags = segment_run_entry[1]
fragment_run_entry_table = boot_info['fragments'][0]['fragments']
first_frag_number = fragment_run_entry_table[0]['first']
- for (i, frag_number) in zip(range(1, n_frags+1), itertools.count(first_frag_number)):
- res.append((1, frag_number))
+ fragments_counter = itertools.count(first_frag_number)
+ for segment, fragments_count in segment_run_table['segment_run']:
+ for _ in range(fragments_count):
+ res.append((segment, next(fragments_counter)))
return res
-def write_flv_header(stream, metadata):
- """Writes the FLV header and the metadata to stream"""
+def write_unsigned_int(stream, val):
+ stream.write(struct_pack('!I', val))
+
+
+def write_unsigned_int_24(stream, val):
+ stream.write(struct_pack('!I', val)[1:])
+
+
+def write_flv_header(stream):
+ """Writes the FLV header to stream"""
# FLV header
stream.write(b'FLV\x01')
stream.write(b'\x05')
stream.write(b'\x00\x00\x00\x09')
- # FLV File body
stream.write(b'\x00\x00\x00\x00')
- # FLVTAG
- # Script data
- stream.write(b'\x12')
- # Size of the metadata with 3 bytes
- stream.write(struct_pack('!L', len(metadata))[1:])
- stream.write(b'\x00\x00\x00\x00\x00\x00\x00')
- stream.write(metadata)
- # Magic numbers extracted from the output files produced by AdobeHDS.php
- #(https://github.com/K-S-V/Scripts)
- stream.write(b'\x00\x00\x01\x73')
+
+
+def write_metadata_tag(stream, metadata):
+ """Writes optional metadata tag to stream"""
+ SCRIPT_TAG = b'\x12'
+ FLV_TAG_HEADER_LEN = 11
+
+ if metadata:
+ stream.write(SCRIPT_TAG)
+ write_unsigned_int_24(stream, len(metadata))
+ stream.write(b'\x00\x00\x00\x00\x00\x00\x00')
+ stream.write(metadata)
+ write_unsigned_int(stream, FLV_TAG_HEADER_LEN + len(metadata))
def _add_ns(prop):
@@ -225,13 +236,16 @@ class F4mFD(FileDownloader):
self.to_screen('[download] Downloading f4m manifest')
manifest = self.ydl.urlopen(man_url).read()
self.report_destination(filename)
- http_dl = HttpQuietDownloader(self.ydl,
+ http_dl = HttpQuietDownloader(
+ self.ydl,
{
'continuedl': True,
'quiet': True,
'noprogress': True,
+ 'ratelimit': self.params.get('ratelimit', None),
'test': self.params.get('test', False),
- })
+ }
+ )
doc = etree.fromstring(manifest)
formats = [(int(f.attrib.get('bitrate', -1)), f) for f in doc.findall(_add_ns('media'))]
@@ -244,9 +258,20 @@ class F4mFD(FileDownloader):
lambda f: int(f[0]) == requested_bitrate, formats))[0]
base_url = compat_urlparse.urljoin(man_url, media.attrib['url'])
- bootstrap = base64.b64decode(doc.find(_add_ns('bootstrapInfo')).text)
- metadata = base64.b64decode(media.find(_add_ns('metadata')).text)
+ bootstrap_node = doc.find(_add_ns('bootstrapInfo'))
+ if bootstrap_node.text is None:
+ bootstrap_url = compat_urlparse.urljoin(
+ base_url, bootstrap_node.attrib['url'])
+ bootstrap = self.ydl.urlopen(bootstrap_url).read()
+ else:
+ bootstrap = base64.b64decode(bootstrap_node.text)
+ metadata_node = media.find(_add_ns('metadata'))
+ if metadata_node is not None:
+ metadata = base64.b64decode(metadata_node.text)
+ else:
+ metadata = None
boot_info = read_bootstrap_info(bootstrap)
+
fragments_list = build_fragments_list(boot_info)
if self.params.get('test', False):
# We only download the first fragment
@@ -257,7 +282,8 @@ class F4mFD(FileDownloader):
tmpfilename = self.temp_name(filename)
(dest_stream, tmpfilename) = sanitize_open(tmpfilename, 'wb')
- write_flv_header(dest_stream, metadata)
+ write_flv_header(dest_stream)
+ write_metadata_tag(dest_stream, metadata)
# This dict stores the download progress, it's updated by the progress
# hook
@@ -270,7 +296,7 @@ class F4mFD(FileDownloader):
def frag_progress_hook(status):
frag_total_bytes = status.get('total_bytes', 0)
estimated_size = (state['downloaded_bytes'] +
- (total_frags - state['frag_counter']) * frag_total_bytes)
+ (total_frags - state['frag_counter']) * frag_total_bytes)
if status['status'] == 'finished':
state['downloaded_bytes'] += frag_total_bytes
state['frag_counter'] += 1
@@ -280,13 +306,13 @@ class F4mFD(FileDownloader):
frag_downloaded_bytes = status['downloaded_bytes']
byte_counter = state['downloaded_bytes'] + frag_downloaded_bytes
frag_progress = self.calc_percent(frag_downloaded_bytes,
- frag_total_bytes)
+ frag_total_bytes)
progress = self.calc_percent(state['frag_counter'], total_frags)
progress += frag_progress / float(total_frags)
eta = self.calc_eta(start, time.time(), estimated_size, byte_counter)
self.report_progress(progress, format_bytes(estimated_size),
- status.get('speed'), eta)
+ status.get('speed'), eta)
http_dl.add_progress_hook(frag_progress_hook)
frags_filenames = []
diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py
index 68eafa403..aa58b52ab 100644
--- a/youtube_dl/downloader/hls.py
+++ b/youtube_dl/downloader/hls.py
@@ -4,11 +4,13 @@ import os
import re
import subprocess
+from ..postprocessor.ffmpeg import FFmpegPostProcessor
from .common import FileDownloader
-from ..utils import (
+from ..compat import (
compat_urlparse,
compat_urllib_request,
- check_executable,
+)
+from ..utils import (
encodeFilename,
)
@@ -24,18 +26,18 @@ class HlsFD(FileDownloader):
'-bsf:a', 'aac_adtstoasc',
encodeFilename(tmpfilename, for_subprocess=True)]
- for program in ['avconv', 'ffmpeg']:
- if check_executable(program, ['-version']):
- break
- else:
- self.report_error(u'm3u8 download detected but ffmpeg or avconv could not be found. Please install one.')
+ ffpp = FFmpegPostProcessor(downloader=self)
+ program = ffpp._executable
+ if program is None:
+ self.report_error('m3u8 download detected but ffmpeg or avconv could not be found. Please install one.')
return False
+ ffpp.check_version()
cmd = [program] + args
retval = subprocess.call(cmd)
if retval == 0:
fsize = os.path.getsize(encodeFilename(tmpfilename))
- self.to_screen(u'\r[%s] %s bytes' % (cmd[0], fsize))
+ self.to_screen('\r[%s] %s bytes' % (cmd[0], fsize))
self.try_rename(tmpfilename, filename)
self._hook_progress({
'downloaded_bytes': fsize,
@@ -45,8 +47,8 @@ class HlsFD(FileDownloader):
})
return True
else:
- self.to_stderr(u"\n")
- self.report_error(u'%s exited with code %d' % (program, retval))
+ self.to_stderr('\n')
+ self.report_error('%s exited with code %d' % (program, retval))
return False
@@ -101,4 +103,3 @@ class NativeHlsFD(FileDownloader):
})
self.try_rename(tmpfilename, filename)
return True
-
diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py
index f62555ce0..4db50ee90 100644
--- a/youtube_dl/downloader/http.py
+++ b/youtube_dl/downloader/http.py
@@ -1,12 +1,15 @@
+from __future__ import unicode_literals
+
import os
import time
from .common import FileDownloader
-from ..utils import (
+from ..compat import (
compat_urllib_request,
compat_urllib_error,
+)
+from ..utils import (
ContentTooShortError,
-
encodeFilename,
sanitize_open,
format_bytes,
@@ -21,10 +24,6 @@ class HttpFD(FileDownloader):
# Do not include the Accept-Encoding header
headers = {'Youtubedl-no-compression': 'True'}
- if 'user_agent' in info_dict:
- headers['Youtubedl-user-agent'] = info_dict['user_agent']
- if 'http_referer' in info_dict:
- headers['Referer'] = info_dict['http_referer']
add_headers = info_dict.get('http_headers')
if add_headers:
headers.update(add_headers)
@@ -106,7 +105,7 @@ class HttpFD(FileDownloader):
self.report_retry(count, retries)
if count > retries:
- self.report_error(u'giving up after %s retries' % retries)
+ self.report_error('giving up after %s retries' % retries)
return False
data_len = data.info().get('Content-length', None)
@@ -124,26 +123,31 @@ class HttpFD(FileDownloader):
min_data_len = self.params.get("min_filesize", None)
max_data_len = self.params.get("max_filesize", None)
if min_data_len is not None and data_len < min_data_len:
- self.to_screen(u'\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len))
+ self.to_screen('\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len))
return False
if max_data_len is not None and data_len > max_data_len:
- self.to_screen(u'\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len))
+ self.to_screen('\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len))
return False
data_len_str = format_bytes(data_len)
byte_counter = 0 + resume_len
block_size = self.params.get('buffersize', 1024)
start = time.time()
+
+ # measure time over whole while-loop, so slow_down() and best_block_size() work together properly
+ now = None # needed for slow_down() in the first loop run
+ before = start # start measuring
while True:
+
# Download and write
- before = time.time()
data_block = data.read(block_size if not is_test else min(block_size, data_len - byte_counter))
- after = time.time()
+ byte_counter += len(data_block)
+
+ # exit loop when download is finished
if len(data_block) == 0:
break
- byte_counter += len(data_block)
- # Open file just in time
+ # Open destination file just in time
if stream is None:
try:
(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
@@ -151,19 +155,30 @@ class HttpFD(FileDownloader):
filename = self.undo_temp_name(tmpfilename)
self.report_destination(filename)
except (OSError, IOError) as err:
- self.report_error(u'unable to open for writing: %s' % str(err))
+ self.report_error('unable to open for writing: %s' % str(err))
return False
try:
stream.write(data_block)
except (IOError, OSError) as err:
- self.to_stderr(u"\n")
- self.report_error(u'unable to write data: %s' % str(err))
+ self.to_stderr('\n')
+ self.report_error('unable to write data: %s' % str(err))
return False
+
+ # Apply rate limit
+ self.slow_down(start, now, byte_counter - resume_len)
+
+ # end measuring of one loop run
+ now = time.time()
+ after = now
+
+ # Adjust block size
if not self.params.get('noresizebuffer', False):
block_size = self.best_block_size(after - before, len(data_block))
+ before = after
+
# Progress message
- speed = self.calc_speed(start, time.time(), byte_counter - resume_len)
+ speed = self.calc_speed(start, now, byte_counter - resume_len)
if data_len is None:
eta = percent = None
else:
@@ -184,14 +199,11 @@ class HttpFD(FileDownloader):
if is_test and byte_counter == data_len:
break
- # Apply rate limit
- self.slow_down(start, byte_counter - resume_len)
-
if stream is None:
- self.to_stderr(u"\n")
- self.report_error(u'Did not get any data blocks')
+ self.to_stderr('\n')
+ self.report_error('Did not get any data blocks')
return False
- if tmpfilename != u'-':
+ if tmpfilename != '-':
stream.close()
self.report_finish(data_len_str, (time.time() - start))
if data_len is not None and byte_counter != data_len:
diff --git a/youtube_dl/downloader/mplayer.py b/youtube_dl/downloader/mplayer.py
index 4de7f15f4..72cef30ea 100644
--- a/youtube_dl/downloader/mplayer.py
+++ b/youtube_dl/downloader/mplayer.py
@@ -1,8 +1,11 @@
+from __future__ import unicode_literals
+
import os
import subprocess
from .common import FileDownloader
from ..utils import (
+ check_executable,
encodeFilename,
)
@@ -13,19 +16,19 @@ class MplayerFD(FileDownloader):
self.report_destination(filename)
tmpfilename = self.temp_name(filename)
- args = ['mplayer', '-really-quiet', '-vo', 'null', '-vc', 'dummy', '-dumpstream', '-dumpfile', tmpfilename, url]
+ args = [
+ 'mplayer', '-really-quiet', '-vo', 'null', '-vc', 'dummy',
+ '-dumpstream', '-dumpfile', tmpfilename, url]
# Check for mplayer first
- try:
- subprocess.call(['mplayer', '-h'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
- except (OSError, IOError):
- self.report_error(u'MMS or RTSP download detected but "%s" could not be run' % args[0])
+ if not check_executable('mplayer', ['-h']):
+ self.report_error('MMS or RTSP download detected but "%s" could not be run' % args[0])
return False
# Download using mplayer.
retval = subprocess.call(args)
if retval == 0:
fsize = os.path.getsize(encodeFilename(tmpfilename))
- self.to_screen(u'\r[%s] %s bytes' % (args[0], fsize))
+ self.to_screen('\r[%s] %s bytes' % (args[0], fsize))
self.try_rename(tmpfilename, filename)
self._hook_progress({
'downloaded_bytes': fsize,
@@ -35,6 +38,6 @@ class MplayerFD(FileDownloader):
})
return True
else:
- self.to_stderr(u"\n")
- self.report_error(u'mplayer exited with code %d' % retval)
+ self.to_stderr('\n')
+ self.report_error('mplayer exited with code %d' % retval)
return False
diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py
index 5eb108302..6dbbc053c 100644
--- a/youtube_dl/downloader/rtmp.py
+++ b/youtube_dl/downloader/rtmp.py
@@ -7,14 +7,20 @@ import sys
import time
from .common import FileDownloader
+from ..compat import compat_str
from ..utils import (
check_executable,
- compat_str,
encodeFilename,
format_bytes,
+ get_exe_version,
)
+def rtmpdump_version():
+ return get_exe_version(
+ 'rtmpdump', ['--help'], r'(?i)RTMPDump\s*v?([0-9a-zA-Z._-]+)')
+
+
class RtmpFD(FileDownloader):
def real_download(self, filename, info_dict):
def run_rtmpdump(args):
@@ -40,13 +46,13 @@ class RtmpFD(FileDownloader):
continue
mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec \(([0-9]{1,2}\.[0-9])%\)', line)
if mobj:
- downloaded_data_len = int(float(mobj.group(1))*1024)
+ downloaded_data_len = int(float(mobj.group(1)) * 1024)
percent = float(mobj.group(2))
if not resume_percent:
resume_percent = percent
resume_downloaded_data_len = downloaded_data_len
- eta = self.calc_eta(start, time.time(), 100-resume_percent, percent-resume_percent)
- speed = self.calc_speed(start, time.time(), downloaded_data_len-resume_downloaded_data_len)
+ eta = self.calc_eta(start, time.time(), 100 - resume_percent, percent - resume_percent)
+ speed = self.calc_speed(start, time.time(), downloaded_data_len - resume_downloaded_data_len)
data_len = None
if percent > 0:
data_len = int(downloaded_data_len * 100 / percent)
@@ -66,7 +72,7 @@ class RtmpFD(FileDownloader):
# no percent for live streams
mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec', line)
if mobj:
- downloaded_data_len = int(float(mobj.group(1))*1024)
+ downloaded_data_len = int(float(mobj.group(1)) * 1024)
time_now = time.time()
speed = self.calc_speed(start, time_now, downloaded_data_len)
self.report_progress_live_stream(downloaded_data_len, speed, time_now - start)
@@ -82,7 +88,7 @@ class RtmpFD(FileDownloader):
if not cursor_in_new_line:
self.to_screen('')
cursor_in_new_line = True
- self.to_screen('[rtmpdump] '+line)
+ self.to_screen('[rtmpdump] ' + line)
proc.wait()
if not cursor_in_new_line:
self.to_screen('')
@@ -146,19 +152,7 @@ class RtmpFD(FileDownloader):
else:
subprocess_encoding = None
- if self.params.get('verbose', False):
- if subprocess_encoding:
- str_args = [
- a.decode(subprocess_encoding) if isinstance(a, bytes) else a
- for a in args]
- else:
- str_args = args
- try:
- import pipes
- shell_quote = lambda args: ' '.join(map(pipes.quote, str_args))
- except ImportError:
- shell_quote = repr
- self.to_screen('[debug] rtmpdump command line: ' + shell_quote(str_args))
+ self._debug_cmd(args, subprocess_encoding, exe='rtmpdump')
RD_SUCCESS = 0
RD_FAILED = 1
@@ -174,12 +168,12 @@ class RtmpFD(FileDownloader):
while (retval == RD_INCOMPLETE or retval == RD_FAILED) and not test and not live:
prevsize = os.path.getsize(encodeFilename(tmpfilename))
self.to_screen('[rtmpdump] %s bytes' % prevsize)
- time.sleep(5.0) # This seems to be needed
+ time.sleep(5.0) # This seems to be needed
retval = run_rtmpdump(basic_args + ['-e'] + [[], ['-k', '1']][retval == RD_FAILED])
cursize = os.path.getsize(encodeFilename(tmpfilename))
if prevsize == cursize and retval == RD_FAILED:
break
- # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
+ # Some rtmp streams seem to abort after ~ 99.8%. Don't complain for those
if prevsize == cursize and retval == RD_INCOMPLETE and cursize > 1024:
self.to_screen('[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
retval = RD_SUCCESS
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 070f9ff19..3f7ca6f7d 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -1,8 +1,14 @@
+from __future__ import unicode_literals
+
from .abc import ABCIE
+from .abc7news import Abc7NewsIE
from .academicearth import AcademicEarthCourseIE
from .addanime import AddAnimeIE
+from .adobetv import AdobeTVIE
from .adultswim import AdultSwimIE
from .aftonbladet import AftonbladetIE
+from .aljazeera import AlJazeeraIE
+from .alphaporno import AlphaPornoIE
from .anitube import AnitubeIE
from .anysex import AnySexIE
from .aol import AolIE
@@ -20,19 +26,26 @@ from .arte import (
ArteTVDDCIE,
ArteTVEmbedIE,
)
-from .auengine import AUEngineIE
+from .atresplayer import AtresPlayerIE
+from .atttechchannel import ATTTechChannelIE
+from .audiomack import AudiomackIE, AudiomackAlbumIE
+from .azubu import AzubuIE
from .bambuser import BambuserIE, BambuserChannelIE
from .bandcamp import BandcampIE, BandcampAlbumIE
from .bbccouk import BBCCoUkIE
from .beeg import BeegIE
from .behindkink import BehindKinkIE
+from .bet import BetIE
+from .bild import BildIE
from .bilibili import BiliBiliIE
from .blinkx import BlinkxIE
from .bliptv import BlipTVIE, BlipTVUserIE
from .bloomberg import BloombergIE
+from .bpb import BpbIE
from .br import BRIE
from .breakcom import BreakIE
from .brightcove import BrightcoveIE
+from .buzzfeed import BuzzFeedIE
from .byutv import BYUtvIE
from .c56 import C56IE
from .canal13cl import Canal13clIE
@@ -43,7 +56,7 @@ from .cbsnews import CBSNewsIE
from .ceskatelevize import CeskaTelevizeIE
from .channel9 import Channel9IE
from .chilloutzone import ChilloutzoneIE
-from .cinemassacre import CinemassacreIE
+from .cinchcast import CinchcastIE
from .clipfish import ClipfishIE
from .cliphunter import CliphunterIE
from .clipsyndicate import ClipsyndicateIE
@@ -54,15 +67,21 @@ from .cnet import CNETIE
from .cnn import (
CNNIE,
CNNBlogsIE,
+ CNNArticleIE,
)
from .collegehumor import CollegeHumorIE
+from .collegerama import CollegeRamaIE
from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE
+from .comcarcoff import ComCarCoffIE
+from .commonmistakes import CommonMistakesIE
from .condenast import CondeNastIE
from .cracked import CrackedIE
from .criterion import CriterionIE
-from .crunchyroll import CrunchyrollIE
+from .crunchyroll import (
+ CrunchyrollIE,
+ CrunchyrollShowPlaylistIE
+)
from .cspan import CSpanIE
-from .d8 import D8IE
from .dailymotion import (
DailymotionIE,
DailymotionPlaylistIE,
@@ -74,14 +93,17 @@ from .deezer import DeezerPlaylistIE
from .dfb import DFBIE
from .dotsub import DotsubIE
from .dreisat import DreiSatIE
+from .drbonanza import DRBonanzaIE
from .drtuber import DrTuberIE
from .drtv import DRTVIE
+from .dvtv import DVTVIE
from .dump import DumpIE
from .defense import DefenseGouvFrIE
from .discovery import DiscoveryIE
from .divxstage import DivxStageIE
from .dropbox import DropboxIE
from .ebaumsworld import EbaumsWorldIE
+from .echomsk import EchoMskIE
from .ehow import EHowIE
from .eighttracks import EightTracksIE
from .einthusan import EinthusanIE
@@ -94,6 +116,7 @@ from .elpais import ElPaisIE
from .empflix import EMPFlixIE
from .engadget import EngadgetIE
from .eporner import EpornerIE
+from .eroprofile import EroProfileIE
from .escapist import EscapistIE
from .everyonesmixtape import EveryonesMixtapeIE
from .exfm import ExfmIE
@@ -111,7 +134,10 @@ from .fktv import (
FKTVPosteckeIE,
)
from .flickr import FlickrIE
+from .folketinget import FolketingetIE
from .fourtube import FourTubeIE
+from .foxgay import FoxgayIE
+from .foxnews import FoxNewsIE
from .franceculture import FranceCultureIE
from .franceinter import FranceInterIE
from .francetv import (
@@ -123,6 +149,7 @@ from .francetv import (
)
from .freesound import FreesoundIE
from .freespeech import FreespeechIE
+from .freevideo import FreeVideoIE
from .funnyordie import FunnyOrDieIE
from .gamekings import GamekingsIE
from .gameone import (
@@ -134,18 +161,26 @@ from .gamestar import GameStarIE
from .gametrailers import GametrailersIE
from .gdcvault import GDCVaultIE
from .generic import GenericIE
+from .giantbomb import GiantBombIE
+from .giga import GigaIE
+from .glide import GlideIE
from .globo import GloboIE
from .godtube import GodTubeIE
+from .goldenmoustache import GoldenMoustacheIE
from .golem import GolemIE
from .googleplus import GooglePlusIE
from .googlesearch import GoogleSearchIE
from .gorillavid import GorillaVidIE
from .goshgay import GoshgayIE
from .grooveshark import GroovesharkIE
+from .groupon import GrouponIE
from .hark import HarkIE
+from .hearthisat import HearThisAtIE
from .heise import HeiseIE
+from .hellporno import HellPornoIE
from .helsinki import HelsinkiIE
from .hentaistigma import HentaiStigmaIE
+from .hitbox import HitboxIE, HitboxLiveIE
from .hornbunny import HornBunnyIE
from .hostingbulk import HostingBulkIE
from .hotnewhiphop import HotNewHipHopIE
@@ -175,6 +210,7 @@ from .jove import JoveIE
from .jukebox import JukeboxIE
from .jpopsukitv import JpopsukiIE
from .kankan import KankanIE
+from .karaoketv import KaraoketvIE
from .keezmovies import KeezMoviesIE
from .khanacademy import KhanAcademyIE
from .kickstarter import KickStarterIE
@@ -183,6 +219,7 @@ from .kontrtube import KontrTubeIE
from .krasview import KrasViewIE
from .ku6 import Ku6IE
from .la7 import LA7IE
+from .laola1tv import Laola1TvIE
from .lifenews import LifeNewsIE
from .liveleak import LiveLeakIE
from .livestream import (
@@ -190,6 +227,7 @@ from .livestream import (
LivestreamOriginalIE,
LivestreamShortenerIE,
)
+from .lnkgo import LnkGoIE
from .lrt import LRTIE
from .lynda import (
LyndaIE,
@@ -203,6 +241,7 @@ from .mdr import MDRIE
from .metacafe import MetacafeIE
from .metacritic import MetacriticIE
from .mgoon import MgoonIE
+from .minhateca import MinhatecaIE
from .ministrygrid import MinistryGridIE
from .mit import TechTVMITIE, MITIE, OCWMITIE
from .mitele import MiTeleIE
@@ -229,9 +268,10 @@ from .muenchentv import MuenchenTVIE
from .musicplayon import MusicPlayOnIE
from .musicvault import MusicVaultIE
from .muzu import MuzuTVIE
-from .myspace import MySpaceIE
+from .myspace import MySpaceIE, MySpaceAlbumIE
from .myspass import MySpassIE
from .myvideo import MyVideoIE
+from .myvidster import MyVidsterIE
from .naver import NaverIE
from .nba import NBAIE
from .nbc import (
@@ -240,12 +280,14 @@ from .nbc import (
)
from .ndr import NDRIE
from .ndtv import NDTVIE
+from .netzkino import NetzkinoIE
+from .nerdcubed import NerdCubedFeedIE
from .newgrounds import NewgroundsIE
from .newstube import NewstubeIE
from .nfb import NFBIE
from .nfl import NFLIE
from .nhl import NHLIE, NHLVideocenterIE
-from .niconico import NiconicoIE
+from .niconico import NiconicoIE, NiconicoPlaylistIE
from .ninegag import NineGagIE
from .noco import NocoIE
from .normalboots import NormalbootsIE
@@ -255,6 +297,7 @@ from .nowness import NownessIE
from .nowvideo import NowVideoIE
from .npo import (
NPOIE,
+ NPOLiveIE,
TegenlichtVproIE,
)
from .nrk import (
@@ -266,6 +309,7 @@ from .nytimes import NYTimesIE
from .nuvid import NuvidIE
from .oktoberfesttv import OktoberfestTVIE
from .ooyala import OoyalaIE
+from .openfilm import OpenFilmIE
from .orf import (
ORFTVthekIE,
ORFOE1IE,
@@ -274,6 +318,7 @@ from .orf import (
from .parliamentliveuk import ParliamentLiveUKIE
from .patreon import PatreonIE
from .pbs import PBSIE
+from .phoenix import PhoenixIE
from .photobucket import PhotobucketIE
from .planetaplay import PlanetaPlayIE
from .played import PlayedIE
@@ -287,24 +332,31 @@ from .pornoxo import PornoXOIE
from .promptfile import PromptFileIE
from .prosiebensat1 import ProSiebenSat1IE
from .pyvideo import PyvideoIE
+from .quickvid import QuickVidIE
+from .radiode import RadioDeIE
+from .radiobremen import RadioBremenIE
from .radiofrance import RadioFranceIE
from .rai import RaiIE
from .rbmaradio import RBMARadioIE
from .redtube import RedTubeIE
+from .restudy import RestudyIE
from .reverbnation import ReverbNationIE
from .ringtv import RingTVIE
from .ro220 import Ro220IE
from .rottentomatoes import RottenTomatoesIE
from .roxwel import RoxwelIE
from .rtbf import RTBFIE
+from .rte import RteIE
from .rtlnl import RtlXlIE
from .rtlnow import RTLnowIE
+from .rtp import RTPIE
from .rts import RTSIE
from .rtve import RTVEALaCartaIE, RTVELiveIE
from .ruhd import RUHDIE
from .rutube import (
RutubeIE,
RutubeChannelIE,
+ RutubeEmbedIE,
RutubeMovieIE,
RutubePersonIE,
)
@@ -314,7 +366,10 @@ from .savefrom import SaveFromIE
from .sbs import SBSIE
from .scivee import SciVeeIE
from .screencast import ScreencastIE
+from .screencastomatic import ScreencastOMaticIE
+from .screenwavemedia import CinemassacreIE, ScreenwaveMediaIE, TeamFourIE
from .servingsys import ServingSysIE
+from .sexu import SexuIE
from .sexykarma import SexyKarmaIE
from .shared import SharedIE
from .sharesix import ShareSixIE
@@ -349,16 +404,19 @@ from .spike import SpikeIE
from .sport5 import Sport5IE
from .sportbox import SportBoxIE
from .sportdeutschland import SportDeutschlandIE
+from .srmediathek import SRMediathekIE
from .stanfordoc import StanfordOpenClassroomIE
from .steam import SteamIE
from .streamcloud import StreamcloudIE
from .streamcz import StreamCZIE
+from .streetvoice import StreetVoiceIE
from .sunporno import SunPornoIE
from .swrmediathek import SWRMediathekIE
from .syfy import SyfyIE
from .sztvhu import SztvHuIE
from .tagesschau import TagesschauIE
from .tapely import TapelyIE
+from .tass import TassIE
from .teachertube import (
TeacherTubeIE,
TeacherTubeUserIE,
@@ -367,9 +425,13 @@ from .teachingchannel import TeachingChannelIE
from .teamcoco import TeamcocoIE
from .techtalks import TechTalksIE
from .ted import TEDIE
+from .telebruxelles import TeleBruxellesIE
+from .telecinco import TelecincoIE
from .telemb import TeleMBIE
+from .teletask import TeleTaskIE
from .tenplay import TenPlayIE
from .testurl import TestURLIE
+from .testtube import TestTubeIE
from .tf1 import TF1IE
from .theonion import TheOnionIE
from .theplatform import ThePlatformIE
@@ -377,6 +439,7 @@ from .thesixtyone import TheSixtyOneIE
from .thisav import ThisAVIE
from .tinypic import TinyPicIE
from .tlc import TlcIE, TlcDeIE
+from .tmz import TMZIE
from .tnaflix import TNAFlixIE
from .thvideo import (
THVideoIE,
@@ -390,12 +453,22 @@ from .trutube import TruTubeIE
from .tube8 import Tube8IE
from .tudou import TudouIE
from .tumblr import TumblrIE
+from .tunein import TuneInIE
from .turbo import TurboIE
from .tutv import TutvIE
from .tvigle import TvigleIE
-from .tvp import TvpIE
+from .tvp import TvpIE, TvpSeriesIE
from .tvplay import TVPlayIE
-from .twitch import TwitchIE
+from .twentyfourvideo import TwentyFourVideoIE
+from .twitch import (
+ TwitchVideoIE,
+ TwitchChapterIE,
+ TwitchVodIE,
+ TwitchProfileIE,
+ TwitchPastBroadcastsIE,
+ TwitchBookmarksIE,
+ TwitchStreamIE,
+)
from .ubu import UbuIE
from .udemy import (
UdemyIE,
@@ -411,6 +484,7 @@ from .vesti import VestiIE
from .vevo import VevoIE
from .vgtv import VGTVIE
from .vh1 import VH1IE
+from .vice import ViceIE
from .viddler import ViddlerIE
from .videobam import VideoBamIE
from .videodetective import VideoDetectiveIE
@@ -421,6 +495,8 @@ from .videopremium import VideoPremiumIE
from .videott import VideoTtIE
from .videoweed import VideoWeedIE
from .vidme import VidmeIE
+from .vidzi import VidziIE
+from .vier import VierIE, VierVideosIE
from .vimeo import (
VimeoIE,
VimeoAlbumIE,
@@ -437,9 +513,13 @@ from .vine import (
VineUserIE,
)
from .viki import VikiIE
-from .vk import VKIE
+from .vk import (
+ VKIE,
+ VKUserVideosIE,
+)
from .vodlocker import VodlockerIE
from .vporn import VpornIE
+from .vrt import VRTIE
from .vube import VubeIE
from .vuclip import VuClipIE
from .vulture import VultureIE
@@ -452,6 +532,7 @@ from .wdr import (
WDRMobileIE,
WDRMausIE,
)
+from .webofstories import WebOfStoriesIE
from .weibo import WeiboIE
from .wimp import WimpIE
from .wistia import WistiaIE
@@ -460,13 +541,16 @@ from .wrzuta import WrzutaIE
from .xbef import XBefIE
from .xboxclips import XboxClipsIE
from .xhamster import XHamsterIE
+from .xminus import XMinusIE
from .xnxx import XNXXIE
from .xvideos import XVideosIE
from .xtube import XTubeUserIE, XTubeIE
+from .xxxymovies import XXXYMoviesIE
from .yahoo import (
YahooIE,
YahooSearchIE,
)
+from .yesjapan import YesJapanIE
from .ynet import YnetIE
from .youjizz import YouJizzIE
from .youku import YoukuIE
@@ -484,14 +568,16 @@ from .youtube import (
YoutubeSearchURLIE,
YoutubeShowIE,
YoutubeSubscriptionsIE,
- YoutubeTopListIE,
+ YoutubeTruncatedIDIE,
YoutubeTruncatedURLIE,
YoutubeUserIE,
YoutubeWatchLaterIE,
)
-
-from .zdf import ZDFIE
-
+from .zdf import ZDFIE, ZDFChannelIE
+from .zingmp3 import (
+ ZingMp3SongIE,
+ ZingMp3AlbumIE,
+)
_ALL_CLASSES = [
klass
@@ -508,6 +594,17 @@ def gen_extractors():
return [klass() for klass in _ALL_CLASSES]
+def list_extractors(age_limit):
+ """
+ Return a list of extractors that are suitable for the given age,
+ sorted by extractor ID.
+ """
+
+ return sorted(
+ filter(lambda ie: ie.is_suitable(age_limit), gen_extractors()),
+ key=lambda ie: ie.IE_NAME.lower())
+
+
def get_info_extractor(ie_name):
"""Returns the info extractor class with the given ie_name"""
- return globals()[ie_name+'IE']
+ return globals()[ie_name + 'IE']
diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py
index 69f89320c..dc0fb85d6 100644
--- a/youtube_dl/extractor/abc.py
+++ b/youtube_dl/extractor/abc.py
@@ -11,13 +11,13 @@ class ABCIE(InfoExtractor):
_VALID_URL = r'http://www\.abc\.net\.au/news/[^/]+/[^/]+/(?P<id>\d+)'
_TEST = {
- 'url': 'http://www.abc.net.au/news/2014-07-25/bringing-asylum-seekers-to-australia-would-give/5624716',
- 'md5': 'dad6f8ad011a70d9ddf887ce6d5d0742',
+ 'url': 'http://www.abc.net.au/news/2014-11-05/australia-to-staff-ebola-treatment-centre-in-sierra-leone/5868334',
+ 'md5': 'cb3dd03b18455a661071ee1e28344d9f',
'info_dict': {
- 'id': '5624716',
+ 'id': '5868334',
'ext': 'mp4',
- 'title': 'Bringing asylum seekers to Australia would give them right to asylum claims: professor',
- 'description': 'md5:ba36fa5e27e5c9251fd929d339aea4af',
+ 'title': 'Australia to help staff Ebola treatment centre in Sierra Leone',
+ 'description': 'md5:809ad29c67a05f54eb41f2a105693a67',
},
}
diff --git a/youtube_dl/extractor/abc7news.py b/youtube_dl/extractor/abc7news.py
new file mode 100644
index 000000000..c04949c21
--- /dev/null
+++ b/youtube_dl/extractor/abc7news.py
@@ -0,0 +1,68 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import parse_iso8601
+
+
+class Abc7NewsIE(InfoExtractor):
+ _VALID_URL = r'https?://abc7news\.com(?:/[^/]+/(?P<display_id>[^/]+))?/(?P<id>\d+)'
+ _TESTS = [
+ {
+ 'url': 'http://abc7news.com/entertainment/east-bay-museum-celebrates-vintage-synthesizers/472581/',
+ 'info_dict': {
+ 'id': '472581',
+ 'display_id': 'east-bay-museum-celebrates-vintage-synthesizers',
+ 'ext': 'mp4',
+ 'title': 'East Bay museum celebrates history of synthesized music',
+ 'description': 'md5:a4f10fb2f2a02565c1749d4adbab4b10',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'timestamp': 1421123075,
+ 'upload_date': '20150113',
+ 'uploader': 'Jonathan Bloom',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://abc7news.com/472581',
+ 'only_matching': True,
+ },
+ ]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ display_id = mobj.group('display_id') or video_id
+
+ webpage = self._download_webpage(url, display_id)
+
+ m3u8 = self._html_search_meta(
+ 'contentURL', webpage, 'm3u8 url', fatal=True)
+
+ formats = self._extract_m3u8_formats(m3u8, display_id, 'mp4')
+ self._sort_formats(formats)
+
+ title = self._og_search_title(webpage).strip()
+ description = self._og_search_description(webpage).strip()
+ thumbnail = self._og_search_thumbnail(webpage)
+ timestamp = parse_iso8601(self._search_regex(
+ r'<div class="meta">\s*<time class="timeago" datetime="([^"]+)">',
+ webpage, 'upload date', fatal=False))
+ uploader = self._search_regex(
+ r'rel="author">([^<]+)</a>',
+ webpage, 'uploader', default=None)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'timestamp': timestamp,
+ 'uploader': uploader,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/academicearth.py b/youtube_dl/extractor/academicearth.py
index c983ef0f5..47313fba8 100644
--- a/youtube_dl/extractor/academicearth.py
+++ b/youtube_dl/extractor/academicearth.py
@@ -1,4 +1,5 @@
from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
@@ -18,15 +19,14 @@ class AcademicEarthCourseIE(InfoExtractor):
}
def _real_extract(self, url):
- m = re.match(self._VALID_URL, url)
- playlist_id = m.group('id')
+ playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
title = self._html_search_regex(
- r'<h1 class="playlist-name"[^>]*?>(.*?)</h1>', webpage, u'title')
+ r'<h1 class="playlist-name"[^>]*?>(.*?)</h1>', webpage, 'title')
description = self._html_search_regex(
r'<p class="excerpt"[^>]*?>(.*?)</p>',
- webpage, u'description', fatal=False)
+ webpage, 'description', fatal=False)
urls = re.findall(
r'<li class="lecture-preview">\s*?<a target="_blank" href="([^"]+)">',
webpage)
diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py
index fcf296057..203936e54 100644
--- a/youtube_dl/extractor/addanime.py
+++ b/youtube_dl/extractor/addanime.py
@@ -3,19 +3,19 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_HTTPError,
compat_str,
compat_urllib_parse,
compat_urllib_parse_urlparse,
-
+)
+from ..utils import (
ExtractorError,
)
class AddAnimeIE(InfoExtractor):
-
- _VALID_URL = r'^http://(?:\w+\.)?add-anime\.net/watch_video\.php\?(?:.*?)v=(?P<video_id>[\w_]+)(?:.*)'
+ _VALID_URL = r'^http://(?:\w+\.)?add-anime\.net/watch_video\.php\?(?:.*?)v=(?P<id>[\w_]+)(?:.*)'
_TEST = {
'url': 'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9',
'md5': '72954ea10bc979ab5e2eb288b21425a0',
@@ -28,9 +28,9 @@ class AddAnimeIE(InfoExtractor):
}
def _real_extract(self, url):
+ video_id = self._match_id(url)
+
try:
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('video_id')
webpage = self._download_webpage(url, video_id)
except ExtractorError as ee:
if not isinstance(ee.cause, compat_HTTPError) or \
@@ -48,7 +48,7 @@ class AddAnimeIE(InfoExtractor):
r'a\.value = ([0-9]+)[+]([0-9]+)[*]([0-9]+);',
redir_webpage)
if av is None:
- raise ExtractorError(u'Cannot find redirect math task')
+ raise ExtractorError('Cannot find redirect math task')
av_res = int(av.group(1)) + int(av.group(2)) * int(av.group(3))
parsed_url = compat_urllib_parse_urlparse(url)
diff --git a/youtube_dl/extractor/adobetv.py b/youtube_dl/extractor/adobetv.py
new file mode 100644
index 000000000..28e07f8b0
--- /dev/null
+++ b/youtube_dl/extractor/adobetv.py
@@ -0,0 +1,70 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_duration,
+ unified_strdate,
+ str_to_int,
+)
+
+
+class AdobeTVIE(InfoExtractor):
+ _VALID_URL = r'https?://tv\.adobe\.com/watch/[^/]+/(?P<id>[^/]+)'
+
+ _TEST = {
+ 'url': 'http://tv.adobe.com/watch/the-complete-picture-with-julieanne-kost/quick-tip-how-to-draw-a-circle-around-an-object-in-photoshop/',
+ 'md5': '9bc5727bcdd55251f35ad311ca74fa1e',
+ 'info_dict': {
+ 'id': 'quick-tip-how-to-draw-a-circle-around-an-object-in-photoshop',
+ 'ext': 'mp4',
+ 'title': 'Quick Tip - How to Draw a Circle Around an Object in Photoshop',
+ 'description': 'md5:99ec318dc909d7ba2a1f2b038f7d2311',
+ 'thumbnail': 're:https?://.*\.jpg$',
+ 'upload_date': '20110914',
+ 'duration': 60,
+ 'view_count': int,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ player = self._parse_json(
+ self._search_regex(r'html5player:\s*({.+?})\s*\n', webpage, 'player'),
+ video_id)
+
+ title = player.get('title') or self._search_regex(
+ r'data-title="([^"]+)"', webpage, 'title')
+ description = self._og_search_description(webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
+
+ upload_date = unified_strdate(
+ self._html_search_meta('datepublished', webpage, 'upload date'))
+
+ duration = parse_duration(
+ self._html_search_meta('duration', webpage, 'duration')
+ or self._search_regex(r'Runtime:\s*(\d{2}:\d{2}:\d{2})', webpage, 'duration'))
+
+ view_count = str_to_int(self._search_regex(
+ r'<div class="views">\s*Views?:\s*([\d,.]+)\s*</div>',
+ webpage, 'view count'))
+
+ formats = [{
+ 'url': source['src'],
+ 'format_id': source.get('quality') or source['src'].split('-')[-1].split('.')[0] or None,
+ 'tbr': source.get('bitrate'),
+ } for source in player['sources']]
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py
index b4b40f2d4..502a9c25a 100644
--- a/youtube_dl/extractor/adultswim.py
+++ b/youtube_dl/extractor/adultswim.py
@@ -2,122 +2,150 @@
from __future__ import unicode_literals
import re
+import json
from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ xpath_text,
+ float_or_none,
+)
+
class AdultSwimIE(InfoExtractor):
- _VALID_URL = r'https?://video\.adultswim\.com/(?P<path>.+?)(?:\.html)?(?:\?.*)?(?:#.*)?$'
- _TEST = {
- 'url': 'http://video.adultswim.com/rick-and-morty/close-rick-counters-of-the-rick-kind.html?x=y#title',
+ _VALID_URL = r'https?://(?:www\.)?adultswim\.com/videos/(?P<is_playlist>playlists/)?(?P<show_path>[^/]+)/(?P<episode_path>[^/?#]+)/?'
+
+ _TESTS = [{
+ 'url': 'http://adultswim.com/videos/rick-and-morty/pilot',
'playlist': [
{
- 'md5': '4da359ec73b58df4575cd01a610ba5dc',
+ 'md5': '247572debc75c7652f253c8daa51a14d',
'info_dict': {
- 'id': '8a250ba1450996e901453d7f02ca02f5',
+ 'id': 'rQxZvXQ4ROaSOqq-or2Mow-0',
'ext': 'flv',
- 'title': 'Rick and Morty Close Rick-Counters of the Rick Kind part 1',
- 'description': 'Rick has a run in with some old associates, resulting in a fallout with Morty. You got any chips, broh?',
- 'uploader': 'Rick and Morty',
- 'thumbnail': 'http://i.cdn.turner.com/asfix/repository/8a250ba13f865824013fc9db8b6b0400/thumbnail_267549017116827057.jpg'
- }
+ 'title': 'Rick and Morty - Pilot Part 1',
+ 'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. "
+ },
},
{
- 'md5': 'ffbdf55af9331c509d95350bd0cc1819',
+ 'md5': '77b0e037a4b20ec6b98671c4c379f48d',
'info_dict': {
- 'id': '8a250ba1450996e901453d7f4bd102f6',
+ 'id': 'rQxZvXQ4ROaSOqq-or2Mow-3',
'ext': 'flv',
- 'title': 'Rick and Morty Close Rick-Counters of the Rick Kind part 2',
- 'description': 'Rick has a run in with some old associates, resulting in a fallout with Morty. You got any chips, broh?',
- 'uploader': 'Rick and Morty',
- 'thumbnail': 'http://i.cdn.turner.com/asfix/repository/8a250ba13f865824013fc9db8b6b0400/thumbnail_267549017116827057.jpg'
- }
- },
- {
- 'md5': 'b92409635540304280b4b6c36bd14a0a',
- 'info_dict': {
- 'id': '8a250ba1450996e901453d7fa73c02f7',
- 'ext': 'flv',
- 'title': 'Rick and Morty Close Rick-Counters of the Rick Kind part 3',
- 'description': 'Rick has a run in with some old associates, resulting in a fallout with Morty. You got any chips, broh?',
- 'uploader': 'Rick and Morty',
- 'thumbnail': 'http://i.cdn.turner.com/asfix/repository/8a250ba13f865824013fc9db8b6b0400/thumbnail_267549017116827057.jpg'
- }
+ 'title': 'Rick and Morty - Pilot Part 4',
+ 'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. "
+ },
},
+ ],
+ 'info_dict': {
+ 'title': 'Rick and Morty - Pilot',
+ 'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. "
+ }
+ }, {
+ 'url': 'http://www.adultswim.com/videos/playlists/american-parenting/putting-francine-out-of-business/',
+ 'playlist': [
{
- 'md5': 'e8818891d60e47b29cd89d7b0278156d',
+ 'md5': '2eb5c06d0f9a1539da3718d897f13ec5',
'info_dict': {
- 'id': '8a250ba1450996e901453d7fc8ba02f8',
+ 'id': '-t8CamQlQ2aYZ49ItZCFog-0',
'ext': 'flv',
- 'title': 'Rick and Morty Close Rick-Counters of the Rick Kind part 4',
- 'description': 'Rick has a run in with some old associates, resulting in a fallout with Morty. You got any chips, broh?',
- 'uploader': 'Rick and Morty',
- 'thumbnail': 'http://i.cdn.turner.com/asfix/repository/8a250ba13f865824013fc9db8b6b0400/thumbnail_267549017116827057.jpg'
- }
+ 'title': 'American Dad - Putting Francine Out of Business',
+ 'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].'
+ },
}
- ]
- }
-
- _video_extensions = {
- '3500': 'flv',
- '640': 'mp4',
- '150': 'mp4',
- 'ipad': 'm3u8',
- 'iphone': 'm3u8'
- }
- _video_dimensions = {
- '3500': (1280, 720),
- '640': (480, 270),
- '150': (320, 180)
- }
+ ],
+ 'info_dict': {
+ 'title': 'American Dad - Putting Francine Out of Business',
+ 'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].'
+ },
+ }]
+
+ @staticmethod
+ def find_video_info(collection, slug):
+ for video in collection.get('videos'):
+ if video.get('slug') == slug:
+ return video
+
+ @staticmethod
+ def find_collection_by_linkURL(collections, linkURL):
+ for collection in collections:
+ if collection.get('linkURL') == linkURL:
+ return collection
+
+ @staticmethod
+ def find_collection_containing_video(collections, slug):
+ for collection in collections:
+ for video in collection.get('videos'):
+ if video.get('slug') == slug:
+ return collection, video
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_path = mobj.group('path')
-
- webpage = self._download_webpage(url, video_path)
- episode_id = self._html_search_regex(
- r'<link rel="video_src" href="http://i\.adultswim\.com/adultswim/adultswimtv/tools/swf/viralplayer.swf\?id=([0-9a-f]+?)"\s*/?\s*>',
- webpage, 'episode_id')
- title = self._og_search_title(webpage)
-
- index_url = 'http://asfix.adultswim.com/asfix-svc/episodeSearch/getEpisodesByIDs?networkName=AS&ids=%s' % episode_id
- idoc = self._download_xml(index_url, title, 'Downloading episode index', 'Unable to download episode index')
-
- episode_el = idoc.find('.//episode')
- show_title = episode_el.attrib.get('collectionTitle')
- episode_title = episode_el.attrib.get('title')
- thumbnail = episode_el.attrib.get('thumbnailUrl')
- description = episode_el.find('./description').text.strip()
+ show_path = mobj.group('show_path')
+ episode_path = mobj.group('episode_path')
+ is_playlist = True if mobj.group('is_playlist') else False
+
+ webpage = self._download_webpage(url, episode_path)
+
+ # Extract the value of `bootstrappedData` from the Javascript in the page.
+ bootstrappedDataJS = self._search_regex(r'var bootstrappedData = ({.*});', webpage, episode_path)
+
+ try:
+ bootstrappedData = json.loads(bootstrappedDataJS)
+ except ValueError as ve:
+ errmsg = '%s: Failed to parse JSON ' % episode_path
+ raise ExtractorError(errmsg, cause=ve)
+
+ # Downloading videos from a /videos/playlist/ URL needs to be handled differently.
+ # NOTE: We are only downloading one video (the current one) not the playlist
+ if is_playlist:
+ collections = bootstrappedData['playlists']['collections']
+ collection = self.find_collection_by_linkURL(collections, show_path)
+ video_info = self.find_video_info(collection, episode_path)
+
+ show_title = video_info['showTitle']
+ segment_ids = [video_info['videoPlaybackID']]
+ else:
+ collections = bootstrappedData['show']['collections']
+ collection, video_info = self.find_collection_containing_video(collections, episode_path)
+
+ show = bootstrappedData['show']
+ show_title = show['title']
+ segment_ids = [clip['videoPlaybackID'] for clip in video_info['clips']]
+
+ episode_id = video_info['id']
+ episode_title = video_info['title']
+ episode_description = video_info['description']
+ episode_duration = video_info.get('duration')
entries = []
- segment_els = episode_el.findall('./segments/segment')
+ for part_num, segment_id in enumerate(segment_ids):
+ segment_url = 'http://www.adultswim.com/videos/api/v0/assets?id=%s&platform=mobile' % segment_id
- for part_num, segment_el in enumerate(segment_els):
- segment_id = segment_el.attrib.get('id')
- segment_title = '%s %s part %d' % (show_title, episode_title, part_num + 1)
- thumbnail = segment_el.attrib.get('thumbnailUrl')
- duration = segment_el.attrib.get('duration')
+ segment_title = '%s - %s' % (show_title, episode_title)
+ if len(segment_ids) > 1:
+ segment_title += ' Part %d' % (part_num + 1)
- segment_url = 'http://asfix.adultswim.com/asfix-svc/episodeservices/getCvpPlaylist?networkName=AS&id=%s' % segment_id
idoc = self._download_xml(
segment_url, segment_title,
'Downloading segment information', 'Unable to download segment information')
+ segment_duration = float_or_none(
+ xpath_text(idoc, './/trt', 'segment duration').strip())
+
formats = []
file_els = idoc.findall('.//files/file')
for file_el in file_els:
bitrate = file_el.attrib.get('bitrate')
- type = file_el.attrib.get('type')
- width, height = self._video_dimensions.get(bitrate, (None, None))
+ ftype = file_el.attrib.get('type')
+
formats.append({
- 'format_id': '%s-%s' % (bitrate, type),
- 'url': file_el.text,
- 'ext': self._video_extensions.get(bitrate, 'mp4'),
+ 'format_id': '%s_%s' % (bitrate, ftype),
+ 'url': file_el.text.strip(),
# The bitrate may not be a number (for example: 'iphone')
'tbr': int(bitrate) if bitrate.isdigit() else None,
- 'height': height,
- 'width': width
+ 'quality': 1 if ftype == 'hd' else -1
})
self._sort_formats(formats)
@@ -126,18 +154,16 @@ class AdultSwimIE(InfoExtractor):
'id': segment_id,
'title': segment_title,
'formats': formats,
- 'uploader': show_title,
- 'thumbnail': thumbnail,
- 'duration': duration,
- 'description': description
+ 'duration': segment_duration,
+ 'description': episode_description
})
return {
'_type': 'playlist',
'id': episode_id,
- 'display_id': video_path,
+ 'display_id': episode_path,
'entries': entries,
- 'title': '%s %s' % (show_title, episode_title),
- 'description': description,
- 'thumbnail': thumbnail
+ 'title': '%s - %s' % (show_title, episode_title),
+ 'description': episode_description,
+ 'duration': episode_duration
}
diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py
new file mode 100644
index 000000000..612708e25
--- /dev/null
+++ b/youtube_dl/extractor/aljazeera.py
@@ -0,0 +1,35 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class AlJazeeraIE(InfoExtractor):
+ _VALID_URL = r'http://www\.aljazeera\.com/programmes/.*?/(?P<id>[^/]+)\.html'
+
+ _TEST = {
+ 'url': 'http://www.aljazeera.com/programmes/the-slum/2014/08/deliverance-201482883754237240.html',
+ 'info_dict': {
+ 'id': '3792260579001',
+ 'ext': 'mp4',
+ 'title': 'The Slum - Episode 1: Deliverance',
+ 'description': 'As a birth attendant advocating for family planning, Remy is on the frontline of Tondo\'s battle with overcrowding.',
+ 'uploader': 'Al Jazeera English',
+ },
+ 'add_ie': ['Brightcove'],
+ }
+
+ def _real_extract(self, url):
+ program_name = self._match_id(url)
+ webpage = self._download_webpage(url, program_name)
+ brightcove_id = self._search_regex(
+ r'RenderPagesVideo\(\'(.+?)\'', webpage, 'brightcove id')
+
+ return {
+ '_type': 'url',
+ 'url': (
+ 'brightcove:'
+ 'playerKey=AQ~~%2CAAAAmtVJIFk~%2CTVGOQ5ZTwJbeMWnq5d_H4MOM57xfzApc'
+ '&%40videoPlayer={0}'.format(brightcove_id)
+ ),
+ 'ie_key': 'Brightcove',
+ }
diff --git a/youtube_dl/extractor/allocine.py b/youtube_dl/extractor/allocine.py
index 7bd797884..7d65b8193 100644
--- a/youtube_dl/extractor/allocine.py
+++ b/youtube_dl/extractor/allocine.py
@@ -5,15 +5,14 @@ import re
import json
from .common import InfoExtractor
+from ..compat import compat_str
from ..utils import (
- compat_str,
qualities,
- determine_ext,
)
class AllocineIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?allocine\.fr/(?P<typ>article|video|film)/(fichearticle_gen_carticle=|player_gen_cmedia=|fichefilm_gen_cfilm=)(?P<id>[0-9]+)(?:\.html)?'
+ _VALID_URL = r'https?://(?:www\.)?allocine\.fr/(?P<typ>article|video|film)/(fichearticle_gen_carticle=|player_gen_cmedia=|fichefilm_gen_cfilm=|video-)(?P<id>[0-9]+)(?:\.html)?'
_TESTS = [{
'url': 'http://www.allocine.fr/article/fichearticle_gen_carticle=18635087.html',
@@ -22,7 +21,7 @@ class AllocineIE(InfoExtractor):
'id': '19546517',
'ext': 'mp4',
'title': 'Astérix - Le Domaine des Dieux Teaser VF',
- 'description': 'md5:4a754271d9c6f16c72629a8a993ee884',
+ 'description': 'md5:abcd09ce503c6560512c14ebfdb720d2',
'thumbnail': 're:http://.*\.jpg',
},
}, {
@@ -45,6 +44,9 @@ class AllocineIE(InfoExtractor):
'description': 'md5:71742e3a74b0d692c7fce0dd2017a4ac',
'thumbnail': 're:http://.*\.jpg',
},
+ }, {
+ 'url': 'http://www.allocine.fr/video/video-19550147/',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -75,9 +77,7 @@ class AllocineIE(InfoExtractor):
'format_id': format_id,
'quality': quality(format_id),
'url': v,
- 'ext': determine_ext(v),
})
-
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/alphaporno.py b/youtube_dl/extractor/alphaporno.py
new file mode 100644
index 000000000..c34719d1f
--- /dev/null
+++ b/youtube_dl/extractor/alphaporno.py
@@ -0,0 +1,77 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_iso8601,
+ parse_duration,
+ parse_filesize,
+ int_or_none,
+)
+
+
+class AlphaPornoIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?alphaporno\.com/videos/(?P<id>[^/]+)'
+ _TEST = {
+ 'url': 'http://www.alphaporno.com/videos/sensual-striptease-porn-with-samantha-alexandra/',
+ 'md5': 'feb6d3bba8848cd54467a87ad34bd38e',
+ 'info_dict': {
+ 'id': '258807',
+ 'display_id': 'sensual-striptease-porn-with-samantha-alexandra',
+ 'ext': 'mp4',
+ 'title': 'Sensual striptease porn with Samantha Alexandra',
+ 'thumbnail': 're:https?://.*\.jpg$',
+ 'timestamp': 1418694611,
+ 'upload_date': '20141216',
+ 'duration': 387,
+ 'filesize_approx': 54120000,
+ 'tbr': 1145,
+ 'categories': list,
+ 'age_limit': 18,
+ }
+ }
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+
+ video_id = self._search_regex(
+ r"video_id\s*:\s*'([^']+)'", webpage, 'video id', default=None)
+
+ video_url = self._search_regex(
+ r"video_url\s*:\s*'([^']+)'", webpage, 'video url')
+ ext = self._html_search_meta(
+ 'encodingFormat', webpage, 'ext', default='.mp4')[1:]
+
+ title = self._search_regex(
+ [r'<meta content="([^"]+)" itemprop="description">',
+ r'class="title" itemprop="name">([^<]+)<'],
+ webpage, 'title')
+ thumbnail = self._html_search_meta('thumbnail', webpage, 'thumbnail')
+ timestamp = parse_iso8601(self._html_search_meta(
+ 'uploadDate', webpage, 'upload date'))
+ duration = parse_duration(self._html_search_meta(
+ 'duration', webpage, 'duration'))
+ filesize_approx = parse_filesize(self._html_search_meta(
+ 'contentSize', webpage, 'file size'))
+ bitrate = int_or_none(self._html_search_meta(
+ 'bitrate', webpage, 'bitrate'))
+ categories = self._html_search_meta(
+ 'keywords', webpage, 'categories', default='').split(',')
+
+ age_limit = self._rta_search(webpage)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'url': video_url,
+ 'ext': ext,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ 'filesize_approx': filesize_approx,
+ 'tbr': bitrate,
+ 'categories': categories,
+ 'age_limit': age_limit,
+ }
diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py
index 47f8e4157..b51eafc45 100644
--- a/youtube_dl/extractor/aol.py
+++ b/youtube_dl/extractor/aol.py
@@ -3,7 +3,6 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from .fivemin import FiveMinIE
class AolIE(InfoExtractor):
@@ -42,31 +41,30 @@ class AolIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
-
playlist_id = mobj.group('playlist_id')
- if playlist_id and not self._downloader.params.get('noplaylist'):
- self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
+ if not playlist_id or self._downloader.params.get('noplaylist'):
+ return self.url_result('5min:%s' % video_id)
- webpage = self._download_webpage(url, playlist_id)
- title = self._html_search_regex(
- r'<h1 class="video-title[^"]*">(.+?)</h1>', webpage, 'title')
- playlist_html = self._search_regex(
- r"(?s)<ul\s+class='video-related[^']*'>(.*?)</ul>", webpage,
- 'playlist HTML')
- entries = [{
- '_type': 'url',
- 'url': 'aol-video:%s' % m.group('id'),
- 'ie_key': 'Aol',
- } for m in re.finditer(
- r"<a\s+href='.*videoid=(?P<id>[0-9]+)'\s+class='video-thumb'>",
- playlist_html)]
+ self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
- return {
- '_type': 'playlist',
- 'id': playlist_id,
- 'display_id': mobj.group('playlist_display_id'),
- 'title': title,
- 'entries': entries,
- }
+ webpage = self._download_webpage(url, playlist_id)
+ title = self._html_search_regex(
+ r'<h1 class="video-title[^"]*">(.+?)</h1>', webpage, 'title')
+ playlist_html = self._search_regex(
+ r"(?s)<ul\s+class='video-related[^']*'>(.*?)</ul>", webpage,
+ 'playlist HTML')
+ entries = [{
+ '_type': 'url',
+ 'url': 'aol-video:%s' % m.group('id'),
+ 'ie_key': 'Aol',
+ } for m in re.finditer(
+ r"<a\s+href='.*videoid=(?P<id>[0-9]+)'\s+class='video-thumb'>",
+ playlist_html)]
- return FiveMinIE._build_result(video_id)
+ return {
+ '_type': 'playlist',
+ 'id': playlist_id,
+ 'display_id': mobj.group('playlist_display_id'),
+ 'title': title,
+ 'entries': entries,
+ }
diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py
index 748608826..15006336f 100644
--- a/youtube_dl/extractor/aparat.py
+++ b/youtube_dl/extractor/aparat.py
@@ -1,5 +1,4 @@
-#coding: utf-8
-
+# coding: utf-8
from __future__ import unicode_literals
import re
@@ -26,8 +25,7 @@ class AparatIE(InfoExtractor):
}
def _real_extract(self, url):
- m = re.match(self._VALID_URL, url)
- video_id = m.group('id')
+ video_id = self._match_id(url)
# Note: There is an easier-to-parse configuration at
# http://www.aparat.com/video/video/config/videohash/%video_id
@@ -40,15 +38,15 @@ class AparatIE(InfoExtractor):
for i, video_url in enumerate(video_urls):
req = HEADRequest(video_url)
res = self._request_webpage(
- req, video_id, note=u'Testing video URL %d' % i, errnote=False)
+ req, video_id, note='Testing video URL %d' % i, errnote=False)
if res:
break
else:
- raise ExtractorError(u'No working video URLs found')
+ raise ExtractorError('No working video URLs found')
- title = self._search_regex(r'\s+title:\s*"([^"]+)"', webpage, u'title')
+ title = self._search_regex(r'\s+title:\s*"([^"]+)"', webpage, 'title')
thumbnail = self._search_regex(
- r'\s+image:\s*"([^"]+)"', webpage, u'thumbnail', fatal=False)
+ r'\s+image:\s*"([^"]+)"', webpage, 'thumbnail', fatal=False)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py
index 4359b88d1..70621946d 100644
--- a/youtube_dl/extractor/appletrailers.py
+++ b/youtube_dl/extractor/appletrailers.py
@@ -4,8 +4,8 @@ import re
import json
from .common import InfoExtractor
+from ..compat import compat_urlparse
from ..utils import (
- compat_urlparse,
int_or_none,
)
@@ -70,15 +70,17 @@ class AppleTrailersIE(InfoExtractor):
uploader_id = mobj.group('company')
playlist_url = compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc')
+
def fix_html(s):
s = re.sub(r'(?s)<script[^<]*?>.*?</script>', '', s)
s = re.sub(r'<img ([^<]*?)>', r'<img \1/>', s)
# The ' in the onClick attributes are not escaped, it couldn't be parsed
# like: http://trailers.apple.com/trailers/wb/gravity/
+
def _clean_json(m):
return 'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
s = re.sub(self._JSON_RE, _clean_json, s)
- s = '<html>' + s + u'</html>'
+ s = '<html>%s</html>' % s
return s
doc = self._download_xml(playlist_url, movie, transform_source=fix_html)
@@ -86,7 +88,7 @@ class AppleTrailersIE(InfoExtractor):
for li in doc.findall('./div/ul/li'):
on_click = li.find('.//a').attrib['onClick']
trailer_info_json = self._search_regex(self._JSON_RE,
- on_click, 'trailer info')
+ on_click, 'trailer info')
trailer_info = json.loads(trailer_info_json)
title = trailer_info['title']
video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower()
@@ -127,7 +129,9 @@ class AppleTrailersIE(InfoExtractor):
'thumbnail': thumbnail,
'upload_date': upload_date,
'uploader_id': uploader_id,
- 'user_agent': 'QuickTime compatible (youtube-dl)',
+ 'http_headers': {
+ 'User-Agent': 'QuickTime compatible (youtube-dl)',
+ },
})
return {
diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py
index 34ce8429b..9fc35a42b 100644
--- a/youtube_dl/extractor/archiveorg.py
+++ b/youtube_dl/extractor/archiveorg.py
@@ -1,42 +1,48 @@
from __future__ import unicode_literals
-import json
-import re
-
from .common import InfoExtractor
-from ..utils import (
- unified_strdate,
-)
+from ..utils import unified_strdate
class ArchiveOrgIE(InfoExtractor):
IE_NAME = 'archive.org'
IE_DESC = 'archive.org videos'
- _VALID_URL = r'(?:https?://)?(?:www\.)?archive\.org/details/(?P<id>[^?/]+)(?:[?].*)?$'
- _TEST = {
- "url": "http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect",
- 'file': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect.ogv',
+ _VALID_URL = r'https?://(?:www\.)?archive\.org/details/(?P<id>[^?/]+)(?:[?].*)?$'
+ _TESTS = [{
+ 'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
'md5': '8af1d4cf447933ed3c7f4871162602db',
'info_dict': {
- "title": "1968 Demo - FJCC Conference Presentation Reel #1",
- "description": "Reel 1 of 3: Also known as the \"Mother of All Demos\", Doug Engelbart's presentation at the Fall Joint Computer Conference in San Francisco, December 9, 1968 titled \"A Research Center for Augmenting Human Intellect.\" For this presentation, Doug and his team astonished the audience by not only relating their research, but demonstrating it live. This was the debut of the mouse, interactive computing, hypermedia, computer supported software engineering, video teleconferencing, etc. See also <a href=\"http://dougengelbart.org/firsts/dougs-1968-demo.html\" rel=\"nofollow\">Doug's 1968 Demo page</a> for more background, highlights, links, and the detailed paper published in this conference proceedings. Filmed on 3 reels: Reel 1 | <a href=\"http://www.archive.org/details/XD300-24_68HighlightsAResearchCntAugHumanIntellect\" rel=\"nofollow\">Reel 2</a> | <a href=\"http://www.archive.org/details/XD300-25_68HighlightsAResearchCntAugHumanIntellect\" rel=\"nofollow\">Reel 3</a>",
- "upload_date": "19681210",
- "uploader": "SRI International"
+ 'id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect',
+ 'ext': 'ogv',
+ 'title': '1968 Demo - FJCC Conference Presentation Reel #1',
+ 'description': 'md5:1780b464abaca9991d8968c877bb53ed',
+ 'upload_date': '19681210',
+ 'uploader': 'SRI International'
+ }
+ }, {
+ 'url': 'https://archive.org/details/Cops1922',
+ 'md5': '18f2a19e6d89af8425671da1cf3d4e04',
+ 'info_dict': {
+ 'id': 'Cops1922',
+ 'ext': 'ogv',
+ 'title': 'Buster Keaton\'s "Cops" (1922)',
+ 'description': 'md5:70f72ee70882f713d4578725461ffcc3',
}
- }
+ }]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
json_url = url + ('?' if '?' in url else '&') + 'output=json'
- json_data = self._download_webpage(json_url, video_id)
- data = json.loads(json_data)
+ data = self._download_json(json_url, video_id)
+
+ def get_optional(data_dict, field):
+ return data_dict['metadata'].get(field, [None])[0]
- title = data['metadata']['title'][0]
- description = data['metadata']['description'][0]
- uploader = data['metadata']['creator'][0]
- upload_date = unified_strdate(data['metadata']['date'][0])
+ title = get_optional(data, 'title')
+ description = get_optional(data, 'description')
+ uploader = get_optional(data, 'creator')
+ upload_date = unified_strdate(get_optional(data, 'date'))
formats = [
{
diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py
index 8de9c11ea..967bd865c 100644
--- a/youtube_dl/extractor/ard.py
+++ b/youtube_dl/extractor/ard.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from .generic import GenericIE
from ..utils import (
determine_ext,
ExtractorError,
@@ -12,6 +13,7 @@ from ..utils import (
parse_duration,
unified_strdate,
xpath_text,
+ parse_xml,
)
@@ -54,6 +56,11 @@ class ARDMediathekIE(InfoExtractor):
if '>Der gewünschte Beitrag ist nicht mehr verfügbar.<' in webpage:
raise ExtractorError('Video %s is no longer available' % video_id, expected=True)
+ if re.search(r'[\?&]rss($|[=&])', url):
+ doc = parse_xml(webpage)
+ if doc.tag == 'rss':
+ return GenericIE()._extract_rss(url, video_id, doc)
+
title = self._html_search_regex(
[r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
r'<meta name="dcterms.title" content="(.*?)"/>',
@@ -185,4 +192,3 @@ class ARDIE(InfoExtractor):
'upload_date': upload_date,
'thumbnail': thumbnail,
}
-
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index a42ebff8e..929dd3cc5 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -5,17 +5,15 @@ import re
from .common import InfoExtractor
from ..utils import (
- ExtractorError,
find_xpath_attr,
unified_strdate,
- determine_ext,
get_element_by_id,
- compat_str,
get_element_by_attribute,
int_or_none,
+ qualities,
)
-# There are different sources of video in arte.tv, the extraction process
+# There are different sources of video in arte.tv, the extraction process
# is different for each one. The videos usually expire in 7 days, so we can't
# add tests.
@@ -39,7 +37,7 @@ class ArteTvIE(InfoExtractor):
config_xml_url, video_id, note='Downloading configuration')
formats = [{
- 'forma_id': q.attrib['quality'],
+ 'format_id': q.attrib['quality'],
# The playpath starts at 'mp4:', if we don't manually
# split the url, rtmpdump will incorrectly parse them
'url': q.text.split('mp4:', 1)[0],
@@ -91,86 +89,66 @@ class ArteTVPlus7IE(InfoExtractor):
if not upload_date_str:
upload_date_str = player_info.get('VDA', '').split(' ')[0]
+ title = player_info['VTI'].strip()
+ subtitle = player_info.get('VSU', '').strip()
+ if subtitle:
+ title += ' - %s' % subtitle
+
info_dict = {
'id': player_info['VID'],
- 'title': player_info['VTI'],
+ 'title': title,
'description': player_info.get('VDE'),
'upload_date': unified_strdate(upload_date_str),
'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
}
+ qfunc = qualities(['HQ', 'MQ', 'EQ', 'SQ'])
- all_formats = []
+ formats = []
for format_id, format_dict in player_info['VSR'].items():
- fmt = dict(format_dict)
- fmt['format_id'] = format_id
- all_formats.append(fmt)
- # Some formats use the m3u8 protocol
- all_formats = list(filter(lambda f: f.get('videoFormat') != 'M3U8', all_formats))
- def _match_lang(f):
- if f.get('versionCode') is None:
- return True
- # Return true if that format is in the language of the url
- if lang == 'fr':
- l = 'F'
- elif lang == 'de':
- l = 'A'
- else:
- l = lang
- regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l]
- return any(re.match(r, f['versionCode']) for r in regexes)
- # Some formats may not be in the same language as the url
- # TODO: Might want not to drop videos that does not match requested language
- # but to process those formats with lower precedence
- formats = filter(_match_lang, all_formats)
- formats = list(formats) # in python3 filter returns an iterator
- if not formats:
- # Some videos are only available in the 'Originalversion'
- # they aren't tagged as being in French or German
- # Sometimes there are neither videos of requested lang code
- # nor original version videos available
- # For such cases we just take all_formats as is
- formats = all_formats
- if not formats:
- raise ExtractorError('The formats list is empty')
-
- if re.match(r'[A-Z]Q', formats[0]['quality']) is not None:
- def sort_key(f):
- return ['HQ', 'MQ', 'EQ', 'SQ'].index(f['quality'])
- else:
- def sort_key(f):
- versionCode = f.get('versionCode')
- if versionCode is None:
- versionCode = ''
- return (
- # Sort first by quality
- int(f.get('height', -1)),
- int(f.get('bitrate', -1)),
- # The original version with subtitles has lower relevance
- re.match(r'VO-ST(F|A)', versionCode) is None,
- # The version with sourds/mal subtitles has also lower relevance
- re.match(r'VO?(F|A)-STM\1', versionCode) is None,
- # Prefer http downloads over m3u8
- 0 if f['url'].endswith('m3u8') else 1,
- )
- formats = sorted(formats, key=sort_key)
- def _format(format_info):
- info = {
- 'format_id': format_info['format_id'],
- 'format_note': '%s, %s' % (format_info.get('versionCode'), format_info.get('versionLibelle')),
- 'width': int_or_none(format_info.get('width')),
- 'height': int_or_none(format_info.get('height')),
- 'tbr': int_or_none(format_info.get('bitrate')),
+ f = dict(format_dict)
+ versionCode = f.get('versionCode')
+
+ langcode = {
+ 'fr': 'F',
+ 'de': 'A',
+ }.get(lang, lang)
+ lang_rexs = [r'VO?%s' % langcode, r'VO?.-ST%s' % langcode]
+ lang_pref = (
+ None if versionCode is None else (
+ 10 if any(re.match(r, versionCode) for r in lang_rexs)
+ else -10))
+ source_pref = 0
+ if versionCode is not None:
+ # The original version with subtitles has lower relevance
+ if re.match(r'VO-ST(F|A)', versionCode):
+ source_pref -= 10
+ # The version with sourds/mal subtitles has also lower relevance
+ elif re.match(r'VO?(F|A)-STM\1', versionCode):
+ source_pref -= 9
+ format = {
+ 'format_id': format_id,
+ 'preference': -10 if f.get('videoFormat') == 'M3U8' else None,
+ 'language_preference': lang_pref,
+ 'format_note': '%s, %s' % (f.get('versionCode'), f.get('versionLibelle')),
+ 'width': int_or_none(f.get('width')),
+ 'height': int_or_none(f.get('height')),
+ 'tbr': int_or_none(f.get('bitrate')),
+ 'quality': qfunc(f.get('quality')),
+ 'source_preference': source_pref,
}
- if format_info['mediaType'] == 'rtmp':
- info['url'] = format_info['streamer']
- info['play_path'] = 'mp4:' + format_info['url']
- info['ext'] = 'flv'
+
+ if f.get('mediaType') == 'rtmp':
+ format['url'] = f['streamer']
+ format['play_path'] = 'mp4:' + f['url']
+ format['ext'] = 'flv'
else:
- info['url'] = format_info['url']
- info['ext'] = determine_ext(info['url'])
- return info
- info_dict['formats'] = [_format(f) for f in formats]
+ format['url'] = f['url']
+
+ formats.append(format)
+
+ self._sort_formats(formats)
+ info_dict['formats'] = formats
return info_dict
diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py
new file mode 100644
index 000000000..f016368fa
--- /dev/null
+++ b/youtube_dl/extractor/atresplayer.py
@@ -0,0 +1,163 @@
+from __future__ import unicode_literals
+
+import time
+import hmac
+
+from .subtitles import SubtitlesInfoExtractor
+from ..compat import (
+ compat_str,
+ compat_urllib_parse,
+ compat_urllib_request,
+)
+from ..utils import (
+ int_or_none,
+ float_or_none,
+ xpath_text,
+ ExtractorError,
+)
+
+
class AtresPlayerIE(SubtitlesInfoExtractor):
    """Information extractor for atresplayer.com television episode pages.

    The episode page yields an episode id; signed requests to the video-URL
    API then return the stream URLs, and a separate episode XML document
    supplies title, description, thumbnail, duration and subtitles.
    """

    _VALID_URL = r'https?://(?:www\.)?atresplayer\.com/television/[^/]+/[^/]+/[^/]+/(?P<id>.+?)_\d+\.html'
    _TESTS = [
        {
            'url': 'http://www.atresplayer.com/television/programas/el-club-de-la-comedia/temporada-4/capitulo-10-especial-solidario-nochebuena_2014122100174.html',
            'md5': 'efd56753cda1bb64df52a3074f62e38a',
            'info_dict': {
                'id': 'capitulo-10-especial-solidario-nochebuena',
                'ext': 'mp4',
                'title': 'Especial Solidario de Nochebuena',
                'description': 'md5:e2d52ff12214fa937107d21064075bf1',
                'duration': 5527.6,
                'thumbnail': 're:^https?://.*\.jpg$',
            },
        },
        {
            'url': 'http://www.atresplayer.com/television/series/el-secreto-de-puente-viejo/el-chico-de-los-tres-lunares/capitulo-977-29-12-14_2014122400174.html',
            'only_matching': True,
        },
    ]

    # User-Agent header sent with every video-URL API request.
    _USER_AGENT = 'Dalvik/1.6.0 (Linux; U; Android 4.3; GT-I9300 Build/JSS15J'
    # Key of the HMAC that signs the video-URL API requests.
    _MAGIC = 'QWtMLXs414Yo+c#_+Q#K@NN)'
    # Offset added to the timestamp before it is signed and sent.
    _TIMESTAMP_SHIFT = 30000

    _TIME_API_URL = 'http://servicios.atresplayer.com/api/admin/time.json'
    _URL_VIDEO_TEMPLATE = 'https://servicios.atresplayer.com/api/urlVideo/{1}/{0}/{1}|{2}|{3}.json'
    _PLAYER_URL_TEMPLATE = 'https://servicios.atresplayer.com/episode/getplayer.json?episodePk=%s'
    _EPISODE_URL_TEMPLATE = 'http://www.atresplayer.com/episodexml/%s'

    _LOGIN_URL = 'https://servicios.atresplayer.com/j_spring_security_check'

    def _real_initialize(self):
        """Log in (when credentials are configured) before any extraction."""
        self._login()

    def _login(self):
        """Submit the login form with the configured credentials.

        Does nothing when no username is configured.

        Raises:
            ExtractorError: if the response page contains an error list.
        """
        (username, password) = self._get_login_info()
        if username is None:
            return

        login_form = {
            'j_username': username,
            'j_password': password,
        }

        request = compat_urllib_request.Request(
            self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        response = self._download_webpage(
            request, None, 'Logging in as %s' % username)

        error = self._html_search_regex(
            r'(?s)<ul class="list_error">(.+?)</ul>', response, 'error', default=None)
        if error:
            raise ExtractorError(
                'Unable to login: %s' % error, expected=True)

    def _real_extract(self, url):
        """Extract formats and metadata for the episode at ``url``.

        Returns:
            The info dict for the episode, or None when only the subtitle
            listing was requested.

        Raises:
            ExtractorError: if the video-URL API reports a non-OK result.
        """
        # Imported locally so the block does not depend on a file-level import.
        import hashlib

        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        episode_id = self._search_regex(
            r'episode="([^"]+)"', webpage, 'episode id')

        server_time = self._download_webpage(
            self._TIME_API_URL,
            video_id, 'Downloading timestamp', fatal=False)
        # NOTE(review): a failed non-fatal download presumably yields a falsy
        # value rather than raising - confirm. In that case fall back to the
        # local clock, as an integer so the signed string has no fraction.
        timestamp = int_or_none(server_time, 1000) if server_time else None
        if timestamp is None:
            timestamp = int(time.time())
        timestamp_shifted = compat_str(timestamp + self._TIMESTAMP_SHIFT)
        # MD5 used to be hmac's implicit default digest; Python 3.8+ requires
        # it to be passed explicitly.
        token = hmac.new(
            self._MAGIC.encode('ascii'),
            (episode_id + timestamp_shifted).encode('utf-8'),
            hashlib.md5
        ).hexdigest()

        formats = []
        for fmt in ['windows', 'android_tablet']:
            request = compat_urllib_request.Request(
                self._URL_VIDEO_TEMPLATE.format(fmt, episode_id, timestamp_shifted, token))
            request.add_header('User-Agent', self._USER_AGENT)

            fmt_json = self._download_json(
                request, video_id, 'Downloading %s video JSON' % fmt)

            result = fmt_json.get('resultDes')
            # A missing 'resultDes' is reported as an API error instead of
            # crashing on None.lower().
            if (result or '').lower() != 'ok':
                raise ExtractorError(
                    '%s returned error: %s' % (self.IE_NAME, result), expected=True)

            for format_id, video_url in fmt_json['resultObject'].items():
                if format_id == 'token' or not video_url.startswith('http'):
                    continue
                if video_url.endswith('/Manifest'):
                    if 'geodeswowsmpra3player' in video_url:
                        # These videos are protected by DRM, the f4m
                        # downloader doesn't support them
                        continue
                    # Swap the trailing '/Manifest' for the f4m manifest path.
                    f4m_url = video_url[:-9] + '/manifest.f4m'
                    formats.extend(self._extract_f4m_formats(f4m_url, video_id))
                else:
                    formats.append({
                        'url': video_url,
                        'format_id': 'android-%s' % format_id,
                        'preference': 1,
                    })
        self._sort_formats(formats)

        player = self._download_json(
            self._PLAYER_URL_TEMPLATE % episode_id,
            episode_id)

        path_data = player.get('pathData')

        episode = self._download_xml(
            self._EPISODE_URL_TEMPLATE % path_data,
            video_id, 'Downloading episode XML')

        duration = float_or_none(xpath_text(
            episode, './media/asset/info/technical/contentDuration', 'duration'))

        art = episode.find('./media/asset/info/art')
        title = xpath_text(art, './name', 'title')
        description = xpath_text(art, './description', 'description')
        thumbnail = xpath_text(episode, './media/asset/files/background', 'thumbnail')

        subtitles = {}
        subtitle = xpath_text(episode, './media/asset/files/subtitle', 'subtitle')
        if subtitle:
            subtitles['es'] = subtitle

        # When only a subtitle listing was requested, print it and stop.
        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, subtitles)
            return

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
            'subtitles': self.extract_subtitles(video_id, subtitles),
        }
diff --git a/youtube_dl/extractor/atttechchannel.py b/youtube_dl/extractor/atttechchannel.py
new file mode 100644
index 000000000..b01d35bb2
--- /dev/null
+++ b/youtube_dl/extractor/atttechchannel.py
@@ -0,0 +1,55 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import unified_strdate
+
+
class ATTTechChannelIE(InfoExtractor):
    """Information extractor for techchannel.att.com video pages."""

    _VALID_URL = r'https?://techchannel\.att\.com/play-video\.cfm/([^/]+/)*(?P<id>.+)'
    _TEST = {
        'url': 'http://techchannel.att.com/play-video.cfm/2014/1/27/ATT-Archives-The-UNIX-System-Making-Computers-Easier-to-Use',
        'info_dict': {
            'id': '11316',
            'display_id': 'ATT-Archives-The-UNIX-System-Making-Computers-Easier-to-Use',
            'ext': 'flv',
            'title': 'AT&T Archives : The UNIX System: Making Computers Easier to Use',
            'description': 'A 1982 film about UNIX is the foundation for software in use around Bell Labs and AT&T.',
            'thumbnail': 're:^https?://.*\.jpg$',
            'upload_date': '20140127',
        },
        'params': {
            # rtmp download
            'skip_download': True,
        },
    }

    def _real_extract(self, url):
        """Scrape the RTMP stream URL and metadata from the page at ``url``."""
        # The URL slug doubles as the display id and as the download label.
        slug = self._match_id(url)
        page = self._download_webpage(url, slug)

        # The RTMP URL is required; the numeric media id is looked up
        # non-fatally.
        rtmp_url = self._search_regex(
            r"url\s*:\s*'(rtmp://[^']+)'", page, 'video URL')
        media_id = self._search_regex(
            r'mediaid\s*=\s*(\d+)', page, 'video id', fatal=False)

        info = {
            'id': media_id,
            'display_id': slug,
            'url': rtmp_url,
            'ext': 'flv',
        }
        info['title'] = self._og_search_title(page)
        info['description'] = self._og_search_description(page)
        info['thumbnail'] = self._og_search_thumbnail(page)

        # The release date is matched as digits separated by slashes.
        release_date = self._search_regex(
            r'[Rr]elease\s+date:\s*(\d{1,2}/\d{1,2}/\d{4})',
            page, 'upload date', fatal=False)
        info['upload_date'] = unified_strdate(release_date, False)
        return info
diff --git a/youtube_dl/extractor/audiomack.py b/youtube_dl/extractor/audiomack.py
new file mode 100644
index 000000000..8bfe50214
--- /dev/null
+++ b/youtube_dl/extractor/audiomack.py
@@ -0,0 +1,139 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+import time
+
+from .common import InfoExtractor
+from .soundcloud import SoundcloudIE
+from ..utils import (
+ ExtractorError,
+ url_basename,
+)
+
+
class AudiomackIE(InfoExtractor):
    """Information extractor for single songs on audiomack.com.

    Songs that are wrappers around a Soundcloud track are delegated to the
    Soundcloud extractor.
    """

    _VALID_URL = r'https?://(?:www\.)?audiomack\.com/song/(?P<id>[\w/-]+)'
    IE_NAME = 'audiomack'
    _TESTS = [
        # hosted on audiomack
        {
            'url': 'http://www.audiomack.com/song/roosh-williams/extraordinary',
            'info_dict': {
                'id': '310086',
                'ext': 'mp3',
                'uploader': 'Roosh Williams',
                'title': 'Extraordinary'
            }
        },
        # audiomack wrapper around soundcloud song
        {
            'add_ie': ['Soundcloud'],
            'url': 'http://www.audiomack.com/song/xclusiveszone/take-kare',
            'info_dict': {
                'id': '172419696',
                'ext': 'mp3',
                'description': 'md5:1fc3272ed7a635cce5be1568c2822997',
                'title': 'Young Thug ft Lil Wayne - Take Kare',
                'uploader': 'Young Thug World',
                'upload_date': '20141016',
            }
        },
    ]

    def _real_extract(self, url):
        """Resolve a song URL through the audiomack API.

        Returns:
            A ``url``-type result pointing at Soundcloud when the song is a
            Soundcloud wrapper, otherwise the info dict for the song.

        Raises:
            ExtractorError: if the API response has no usable URL or reports
                an error.
        """
        # URLs end with [uploader name]/[song title]
        # this title is whatever the user types in, and is rarely
        # the proper song title. Real metadata is in the api response
        song_url_tag = self._match_id(url)

        # Request the extended version of the api for extra fields like artist and title
        api_response = self._download_json(
            'http://www.audiomack.com/api/music/url/song/%s?extended=1&_=%d' % (
                song_url_tag, time.time()),
            song_url_tag)

        # API is inconsistent with errors: the URL may be absent or empty,
        # or an explicit 'error' key may be present
        if not api_response.get('url') or 'error' in api_response:
            # Interpolate the URL into the message; passing it as a second
            # argument left a literal '%s' in the error text.
            raise ExtractorError('Invalid url %s' % url)

        # Audiomack wraps a lot of soundcloud tracks in their branded wrapper
        # if so, pass the work off to the soundcloud extractor
        if SoundcloudIE.suitable(api_response['url']):
            return {'_type': 'url', 'url': api_response['url'], 'ie_key': 'Soundcloud'}

        return {
            'id': api_response.get('id', song_url_tag),
            'uploader': api_response.get('artist'),
            'title': api_response.get('title'),
            'url': api_response['url'],
        }
+
+
class AudiomackAlbumIE(InfoExtractor):
    """Information extractor for album playlists on audiomack.com."""

    _VALID_URL = r'https?://(?:www\.)?audiomack\.com/album/(?P<id>[\w/-]+)'
    IE_NAME = 'audiomack:album'
    _TESTS = [
        # Standard album playlist
        {
            'url': 'http://www.audiomack.com/album/flytunezcom/tha-tour-part-2-mixtape',
            'playlist_count': 15,
            'info_dict': {
                'id': '812251',
                'title': 'Tha Tour: Part 2 (Official Mixtape)'
            }
        },
        # Album playlist ripped from fakeshoredrive with no metadata
        {
            'url': 'http://www.audiomack.com/album/fakeshoredrive/ppp-pistol-p-project',
            'playlist': [{
                'info_dict': {
                    'title': '9.-heaven-or-hell-chimaca-ft-zuse-prod-by-dj-fu',
                    'id': '9.-heaven-or-hell-chimaca-ft-zuse-prod-by-dj-fu',
                    'ext': 'mp3',
                }
            }],
            'params': {
                'playliststart': 8,
                'playlistend': 8,
            }
        }
    ]

    def _real_extract(self, url):
        """Build a playlist by querying the API track by track.

        The album id and title are copied from the first track response that
        carries them. Iteration stops at the first response whose URL is
        empty.

        Raises:
            ExtractorError: if a response lacks a ``url`` key or reports an
                error.
        """
        # The URL tail is user-typed text; the real metadata comes from the
        # API responses.
        album_url_tag = self._match_id(url)

        entries = []
        result = {'_type': 'playlist', 'entries': entries}
        # (playlist key, API key) pairs for the album-level metadata.
        album_fields = (('id', 'album_id'), ('title', 'album_title'))

        # There is no single endpoint for album metadata and no track count,
        # so keep asking for the next track until the API signals the end.
        for track_no in itertools.count():
            api_response = self._download_json(
                'http://www.audiomack.com/api/music/url/album/%s/%d?extended=1&_=%d'
                % (album_url_tag, track_no, time.time()), album_url_tag,
                note='Querying song information (%d)' % (track_no + 1))

            # Total failure: only expected when the album URL itself is wrong.
            if 'url' not in api_response or 'error' in api_response:
                raise ExtractorError('Invalid url for track %d of album url %s' % (track_no, url))

            track_url = api_response['url']
            # Valid response with an empty URL usually means end of playlist.
            if not track_url:
                break

            # Copy album-level metadata the first time it shows up.
            for resultkey, apikey in album_fields:
                if resultkey in result or apikey not in api_response:
                    continue
                result[resultkey] = api_response[apikey]

            # Fallback id/title: the stream file name without its extension.
            song_id = url_basename(track_url).rpartition('.')[0]
            entries.append({
                'id': api_response.get('id', song_id),
                'uploader': api_response.get('artist'),
                'title': api_response.get('title', song_id),
                'url': track_url,
            })

        return result
diff --git a/youtube_dl/extractor/auengine.py b/youtube_dl/extractor/auengine.py
deleted file mode 100644
index 20bf12550..000000000
--- a/youtube_dl/extractor/auengine.py
+++ /dev/null
@@ -1,55 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import (
- compat_urllib_parse,
- determine_ext,
- ExtractorError,
-)
-
-
-class AUEngineIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?auengine\.com/embed\.php\?.*?file=(?P<id>[^&]+).*?'
-
- _TEST = {
- 'url': 'http://auengine.com/embed.php?file=lfvlytY6&w=650&h=370',
- 'md5': '48972bdbcf1a3a2f5533e62425b41d4f',
- 'info_dict': {
- 'id': 'lfvlytY6',
- 'ext': 'mp4',
- 'title': '[Commie]The Legend of the Legendary Heroes - 03 - Replication Eye (Alpha Stigma)[F9410F5A]'
- }
- }
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
- webpage = self._download_webpage(url, video_id)
- title = self._html_search_regex(r'<title>(?P<title>.+?)</title>', webpage, 'title')
- title = title.strip()
- links = re.findall(r'\s(?:file|url):\s*["\']([^\'"]+)["\']', webpage)
- links = map(compat_urllib_parse.unquote, links)
-
- thumbnail = None
- video_url = None
- for link in links:
- if link.endswith('.png'):
- thumbnail = link
- elif '/videos/' in link:
- video_url = link
- if not video_url:
- raise ExtractorError('Could not find video URL')
- ext = '.' + determine_ext(video_url)
- if ext == title[-len(ext):]:
- title = title[:-len(ext)]
-
- return {
- 'id': video_id,
- 'url': video_url,
- 'title': title,
- 'thumbnail': thumbnail,
- 'http_referer': 'http://www.auengine.com/flowplayer/flowplayer.commercial-3.2.14.swf',
- }
diff --git a/youtube_dl/extractor/azubu.py b/youtube_dl/extractor/azubu.py
new file mode 100644
index 000000000..0961d339f
--- /dev/null
+++ b/youtube_dl/extractor/azubu.py
@@ -0,0 +1,93 @@
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+from ..utils import float_or_none
+
+
class AzubuIE(InfoExtractor):
    """Information extractor for azubu.tv ``#!/play/<id>`` video pages."""

    _VALID_URL = r'https?://(?:www\.)?azubu\.tv/[^/]+#!/play/(?P<id>\d+)'
    _TESTS = [
        {
            'url': 'http://www.azubu.tv/GSL#!/play/15575/2014-hot6-cup-last-big-match-ro8-day-1',
            'md5': 'a88b42fcf844f29ad6035054bd9ecaf4',
            'info_dict': {
                'id': '15575',
                'ext': 'mp4',
                'title': '2014 HOT6 CUP LAST BIG MATCH Ro8 Day 1',
                'description': 'md5:d06bdea27b8cc4388a90ad35b5c66c01',
                'thumbnail': 're:^https?://.*\.jpe?g',
                'timestamp': 1417523507.334,
                'upload_date': '20141202',
                'duration': 9988.7,
                'uploader': 'GSL',
                'uploader_id': 414310,
                'view_count': int,
            },
        },
        {
            'url': 'http://www.azubu.tv/FnaticTV#!/play/9344/-fnatic-at-worlds-2014:-toyz---%22i-love-rekkles,-he-has-amazing-mechanics%22-',
            'md5': 'b72a871fe1d9f70bd7673769cdb3b925',
            'info_dict': {
                'id': '9344',
                'ext': 'mp4',
                'title': 'Fnatic at Worlds 2014: Toyz - "I love Rekkles, he has amazing mechanics"',
                'description': 'md5:4a649737b5f6c8b5c5be543e88dc62af',
                'thumbnail': 're:^https?://.*\.jpe?g',
                'timestamp': 1410530893.320,
                'upload_date': '20140912',
                'duration': 172.385,
                'uploader': 'FnaticTV',
                'uploader_id': 272749,
                'view_count': int,
            },
        },
    ]

    def _real_extract(self, url):
        """Fetch the video record from the azubu API and map it to an info dict."""
        video_id = self._match_id(url)

        api_url = 'http://www.azubu.tv/api/video/%s' % video_id
        record = self._download_json(api_url, video_id)['data']

        title = record['title'].strip()
        description = record['description']
        thumbnail = record['thumbnail']
        view_count = record['view_count']
        owner = record['user']
        uploader = owner['username']
        uploader_id = owner['id']

        # 'stream_params' is a JSON document embedded as a string.
        params = json.loads(record['stream_params'])

        # Both values are scaled down by 1000 before being returned.
        timestamp = float_or_none(params['creationDate'], 1000)
        duration = float_or_none(params['length'], 1000)

        # The full-length rendition, when present, is listed after the
        # regular renditions.
        renditions = params.get('renditions') or []
        full_length = params.get('FLVFullLength') or params.get('videoFullLength')
        if full_length:
            renditions.append(full_length)

        formats = []
        for rendition in renditions:
            # Renditions without a URL cannot be downloaded; leave them out.
            if not rendition['url']:
                continue
            formats.append({
                'url': rendition['url'],
                'width': rendition['frameWidth'],
                'height': rendition['frameHeight'],
                'vbr': float_or_none(rendition['encodingRate'], 1000),
                'filesize': rendition['size'],
                'vcodec': rendition['videoCodec'],
                'container': rendition['videoContainer'],
            })
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'timestamp': timestamp,
            'duration': duration,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'view_count': view_count,
            'formats': formats,
        }
diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py
index de5d4faf3..98e1443ab 100644
--- a/youtube_dl/extractor/bambuser.py
+++ b/youtube_dl/extractor/bambuser.py
@@ -5,7 +5,7 @@ import json
import itertools
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_request,
)
@@ -18,7 +18,7 @@ class BambuserIE(InfoExtractor):
_TEST = {
'url': 'http://bambuser.com/v/4050584',
# MD5 seems to be flaky, see https://travis-ci.org/rg3/youtube-dl/jobs/14051016#L388
- #u'md5': 'fba8f7693e48fd4e8641b3fd5539a641',
+ # 'md5': 'fba8f7693e48fd4e8641b3fd5539a641',
'info_dict': {
'id': '4050584',
'ext': 'flv',
@@ -38,7 +38,7 @@ class BambuserIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
info_url = ('http://player-c.api.bambuser.com/getVideo.json?'
- '&api_key=%s&vid=%s' % (self._API_KEY, video_id))
+ '&api_key=%s&vid=%s' % (self._API_KEY, video_id))
info_json = self._download_webpage(info_url, video_id)
info = json.loads(info_json)['result']
@@ -73,10 +73,11 @@ class BambuserChannelIE(InfoExtractor):
urls = []
last_id = ''
for i in itertools.count(1):
- req_url = ('http://bambuser.com/xhr-api/index.php?username={user}'
+ req_url = (
+ 'http://bambuser.com/xhr-api/index.php?username={user}'
'&sort=created&access_mode=0%2C1%2C2&limit={count}'
'&method=broadcast&format=json&vid_older_than={last}'
- ).format(user=user, count=self._STEP, last=last_id)
+ ).format(user=user, count=self._STEP, last=last_id)
req = compat_urllib_request.Request(req_url)
# Without setting this header, we wouldn't get any result
req.add_header('Referer', 'http://bambuser.com/channel/%s' % user)
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py
index c13446665..aea0263d6 100644
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -4,9 +4,11 @@ import json
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_str,
compat_urlparse,
+)
+from ..utils import (
ExtractorError,
)
@@ -83,12 +85,12 @@ class BandcampIE(InfoExtractor):
initial_url = mp3_info['url']
re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
m_url = re.match(re_url, initial_url)
- #We build the url we will use to get the final track url
+ # We build the url we will use to get the final track url
# This url is build in Bandcamp in the script download_bunde_*.js
request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
# If we could correctly generate the .rand field the url would be
- #in the "download_url" key
+ # in the "download_url" key
final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
return {
@@ -104,26 +106,31 @@ class BandcampIE(InfoExtractor):
class BandcampAlbumIE(InfoExtractor):
IE_NAME = 'Bandcamp:album'
- _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<title>[^?#]+))'
+ _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<title>[^?#]+)|/?(?:$|[?#]))'
_TESTS = [{
'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
'playlist': [
{
- 'file': '1353101989.mp3',
'md5': '39bc1eded3476e927c724321ddf116cf',
'info_dict': {
+ 'id': '1353101989',
+ 'ext': 'mp3',
'title': 'Intro',
}
},
{
- 'file': '38097443.mp3',
'md5': '1a2c32e2691474643e912cc6cd4bffaa',
'info_dict': {
+ 'id': '38097443',
+ 'ext': 'mp3',
'title': 'Kero One - Keep It Alive (Blazo remix)',
}
},
],
+ 'info_dict': {
+ 'title': 'Jazz Format Mixtape vol.1',
+ },
'params': {
'playlistend': 2
},
@@ -134,6 +141,12 @@ class BandcampAlbumIE(InfoExtractor):
'title': 'Hierophany of the Open Grave',
},
'playlist_mincount': 9,
+ }, {
+ 'url': 'http://dotscale.bandcamp.com',
+ 'info_dict': {
+ 'title': 'Loom',
+ },
+ 'playlist_mincount': 7,
}]
def _real_extract(self, url):
@@ -148,7 +161,8 @@ class BandcampAlbumIE(InfoExtractor):
entries = [
self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key())
for t_path in tracks_paths]
- title = self._search_regex(r'album_title : "(.*?)"', webpage, 'title')
+ title = self._search_regex(
+ r'album_title\s*:\s*"(.*?)"', webpage, 'title', fatal=False)
return {
'_type': 'playlist',
'id': playlist_id,
diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py
index 75e608f99..1cf48fe0d 100644
--- a/youtube_dl/extractor/bbccouk.py
+++ b/youtube_dl/extractor/bbccouk.py
@@ -1,15 +1,16 @@
from __future__ import unicode_literals
-import re
+import xml.etree.ElementTree
from .subtitles import SubtitlesInfoExtractor
from ..utils import ExtractorError
+from ..compat import compat_HTTPError
class BBCCoUkIE(SubtitlesInfoExtractor):
IE_NAME = 'bbc.co.uk'
IE_DESC = 'BBC iPlayer'
- _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:programmes|iplayer/episode)/(?P<id>[\da-z]{8})'
+ _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})'
_TESTS = [
{
@@ -17,8 +18,8 @@ class BBCCoUkIE(SubtitlesInfoExtractor):
'info_dict': {
'id': 'b039d07m',
'ext': 'flv',
- 'title': 'Kaleidoscope: Leonard Cohen',
- 'description': 'md5:db4755d7a665ae72343779f7dacb402c',
+ 'title': 'Kaleidoscope, Leonard Cohen',
+ 'description': 'The Canadian poet and songwriter reflects on his musical career.',
'duration': 1740,
},
'params': {
@@ -55,6 +56,68 @@ class BBCCoUkIE(SubtitlesInfoExtractor):
'skip_download': True,
},
'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
+ },
+ {
+ 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
+ 'info_dict': {
+ 'id': 'b03k3pb7',
+ 'ext': 'flv',
+ 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
+ 'description': '2. Invasion',
+ 'duration': 3600,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
+ }, {
+ 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
+ 'info_dict': {
+ 'id': 'b04v209v',
+ 'ext': 'flv',
+ 'title': 'Pete Tong, The Essential New Tune Special',
+ 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
+ 'duration': 10800,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://www.bbc.co.uk/music/clips/p02frcc3',
+ 'note': 'Audio',
+ 'info_dict': {
+ 'id': 'p02frcch',
+ 'ext': 'flv',
+ 'title': 'Pete Tong, Past, Present and Future Special, Madeon - After Hours mix',
+ 'description': 'French house superstar Madeon takes us out of the club and onto the after party.',
+ 'duration': 3507,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
+ 'note': 'Video',
+ 'info_dict': {
+ 'id': 'p025c103',
+ 'ext': 'flv',
+ 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
+ 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
+ 'duration': 226,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
+ 'only_matching': True,
}
]
@@ -102,6 +165,10 @@ class BBCCoUkIE(SubtitlesInfoExtractor):
return playlist.findall('./{http://bbc.co.uk/2008/emp/playlist}item')
def _extract_medias(self, media_selection):
+ error = media_selection.find('./{http://bbc.co.uk/2008/mp/mediaselection}error')
+ if error is not None:
+ raise ExtractorError(
+ '%s returned error: %s' % (self.IE_NAME, error.get('id')), expected=True)
return media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media')
def _extract_connections(self, media):
@@ -158,54 +225,101 @@ class BBCCoUkIE(SubtitlesInfoExtractor):
subtitles[lang] = srt
return subtitles
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- group_id = mobj.group('id')
+ def _download_media_selector(self, programme_id):
+ try:
+ media_selection = self._download_xml(
+ 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' % programme_id,
+ programme_id, 'Downloading media selection XML')
+ except ExtractorError as ee:
+ if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
+ media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().encode('utf-8'))
+ else:
+ raise
- webpage = self._download_webpage(url, group_id, 'Downloading video page')
- if re.search(r'id="emp-error" class="notinuk">', webpage):
- raise ExtractorError('Currently BBC iPlayer TV programmes are available to play in the UK only',
- expected=True)
+ formats = []
+ subtitles = None
+
+ for media in self._extract_medias(media_selection):
+ kind = media.get('kind')
+ if kind == 'audio':
+ formats.extend(self._extract_audio(media, programme_id))
+ elif kind == 'video':
+ formats.extend(self._extract_video(media, programme_id))
+ elif kind == 'captions':
+ subtitles = self._extract_captions(media, programme_id)
+
+ return formats, subtitles
- playlist = self._download_xml('http://www.bbc.co.uk/iplayer/playlist/%s' % group_id, group_id,
- 'Downloading playlist XML')
+ def _download_playlist(self, playlist_id):
+ try:
+ playlist = self._download_json(
+ 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
+ playlist_id, 'Downloading playlist JSON')
+
+ version = playlist.get('defaultAvailableVersion')
+ if version:
+ smp_config = version['smpConfig']
+ title = smp_config['title']
+ description = smp_config['summary']
+ for item in smp_config['items']:
+ kind = item['kind']
+ if kind != 'programme' and kind != 'radioProgramme':
+ continue
+ programme_id = item.get('vpid')
+ duration = int(item.get('duration'))
+ formats, subtitles = self._download_media_selector(programme_id)
+ return programme_id, title, description, duration, formats, subtitles
+ except ExtractorError as ee:
+ if not isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404:
+ raise
+
+ # fallback to legacy playlist
+ playlist = self._download_xml(
+ 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id,
+ playlist_id, 'Downloading legacy playlist XML')
no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems')
if no_items is not None:
reason = no_items.get('reason')
if reason == 'preAvailability':
- msg = 'Episode %s is not yet available' % group_id
+ msg = 'Episode %s is not yet available' % playlist_id
elif reason == 'postAvailability':
- msg = 'Episode %s is no longer available' % group_id
+ msg = 'Episode %s is no longer available' % playlist_id
+ elif reason == 'noMedia':
+ msg = 'Episode %s is not currently available' % playlist_id
else:
- msg = 'Episode %s is not available: %s' % (group_id, reason)
+ msg = 'Episode %s is not available: %s' % (playlist_id, reason)
raise ExtractorError(msg, expected=True)
- formats = []
- subtitles = None
-
for item in self._extract_items(playlist):
kind = item.get('kind')
if kind != 'programme' and kind != 'radioProgramme':
continue
title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text
description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text
-
programme_id = item.get('identifier')
duration = int(item.get('duration'))
+ formats, subtitles = self._download_media_selector(programme_id)
- media_selection = self._download_xml(
- 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' % programme_id,
- programme_id, 'Downloading media selection XML')
+ return programme_id, title, description, duration, formats, subtitles
+
+ def _real_extract(self, url):
+ group_id = self._match_id(url)
- for media in self._extract_medias(media_selection):
- kind = media.get('kind')
- if kind == 'audio':
- formats.extend(self._extract_audio(media, programme_id))
- elif kind == 'video':
- formats.extend(self._extract_video(media, programme_id))
- elif kind == 'captions':
- subtitles = self._extract_captions(media, programme_id)
+ webpage = self._download_webpage(url, group_id, 'Downloading video page')
+
+ programme_id = self._search_regex(
+ r'"vpid"\s*:\s*"([\da-z]{8})"', webpage, 'vpid', fatal=False, default=None)
+ if programme_id:
+ player = self._download_json(
+ 'http://www.bbc.co.uk/iplayer/episode/%s.json' % group_id,
+ group_id)['jsConf']['player']
+ title = player['title']
+ description = player['subtitle']
+ duration = player['duration']
+ formats, subtitles = self._download_media_selector(programme_id)
+ else:
+ programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
if self._downloader.params.get('listsubtitles', False):
self._list_available_subtitles(programme_id, subtitles)
@@ -220,4 +334,4 @@ class BBCCoUkIE(SubtitlesInfoExtractor):
'duration': duration,
'formats': formats,
'subtitles': subtitles,
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py
index 314e37f8b..4e79fea8f 100644
--- a/youtube_dl/extractor/beeg.py
+++ b/youtube_dl/extractor/beeg.py
@@ -40,7 +40,7 @@ class BeegIE(InfoExtractor):
title = self._html_search_regex(
r'<title>([^<]+)\s*-\s*beeg\.?</title>', webpage, 'title')
-
+
description = self._html_search_regex(
r'<meta name="description" content="([^"]*)"',
webpage, 'description', fatal=False)
diff --git a/youtube_dl/extractor/behindkink.py b/youtube_dl/extractor/behindkink.py
index 31fdc0dcc..1bdc25812 100644
--- a/youtube_dl/extractor/behindkink.py
+++ b/youtube_dl/extractor/behindkink.py
@@ -10,15 +10,15 @@ from ..utils import url_basename
class BehindKinkIE(InfoExtractor):
_VALID_URL = r'http://(?:www\.)?behindkink\.com/(?P<year>[0-9]{4})/(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<id>[^/#?_]+)'
_TEST = {
- 'url': 'http://www.behindkink.com/2014/08/14/ab1576-performers-voice-finally-heard-the-bill-is-killed/',
- 'md5': '41ad01222b8442089a55528fec43ec01',
+ 'url': 'http://www.behindkink.com/2014/12/05/what-are-you-passionate-about-marley-blaze/',
+ 'md5': '507b57d8fdcd75a41a9a7bdb7989c762',
'info_dict': {
- 'id': '36370',
+ 'id': '37127',
'ext': 'mp4',
- 'title': 'AB1576 - PERFORMERS VOICE FINALLY HEARD - THE BILL IS KILLED!',
- 'description': 'The adult industry voice was finally heard as Assembly Bill 1576 remained\xa0 in suspense today at the Senate Appropriations Hearing. AB1576 was, among other industry damaging issues, a condom mandate...',
- 'upload_date': '20140814',
- 'thumbnail': 'http://www.behindkink.com/wp-content/uploads/2014/08/36370_AB1576_Win.jpg',
+ 'title': 'What are you passionate about – Marley Blaze',
+ 'description': 'md5:aee8e9611b4ff70186f752975d9b94b4',
+ 'upload_date': '20141205',
+ 'thumbnail': 'http://www.behindkink.com/wp-content/uploads/2014/12/blaze-1.jpg',
'age_limit': 18,
}
}
@@ -26,26 +26,19 @@ class BehindKinkIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
display_id = mobj.group('id')
- year = mobj.group('year')
- month = mobj.group('month')
- day = mobj.group('day')
- upload_date = year + month + day
webpage = self._download_webpage(url, display_id)
video_url = self._search_regex(
- r"'file':\s*'([^']+)'",
- webpage, 'URL base')
-
- video_id = url_basename(video_url)
- video_id = video_id.split('_')[0]
+ r'<source src="([^"]+)"', webpage, 'video URL')
+ video_id = url_basename(video_url).split('_')[0]
+ upload_date = mobj.group('year') + mobj.group('month') + mobj.group('day')
return {
'id': video_id,
+ 'display_id': display_id,
'url': video_url,
- 'ext': 'mp4',
'title': self._og_search_title(webpage),
- 'display_id': display_id,
'thumbnail': self._og_search_thumbnail(webpage),
'description': self._og_search_description(webpage),
'upload_date': upload_date,
diff --git a/youtube_dl/extractor/bet.py b/youtube_dl/extractor/bet.py
new file mode 100644
index 000000000..d2abd4d77
--- /dev/null
+++ b/youtube_dl/extractor/bet.py
@@ -0,0 +1,107 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse
+from ..utils import (
+ xpath_text,
+ xpath_with_ns,
+ int_or_none,
+ parse_iso8601,
+)
+
+
+class BetIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?bet\.com/(?:[^/]+/)+(?P<id>.+?)\.html'
+ _TESTS = [
+ {
+ 'url': 'http://www.bet.com/news/politics/2014/12/08/in-bet-exclusive-obama-talks-race-and-racism.html',
+ 'info_dict': {
+ 'id': '740ab250-bb94-4a8a-8787-fe0de7c74471',
+ 'display_id': 'in-bet-exclusive-obama-talks-race-and-racism',
+ 'ext': 'flv',
+ 'title': 'BET News Presents: A Conversation With President Obama',
+ 'description': 'md5:5a88d8ae912c1b33e090290af7ec33c6',
+ 'duration': 1534,
+ 'timestamp': 1418075340,
+ 'upload_date': '20141208',
+ 'uploader': 'admin',
+ 'thumbnail': 're:(?i)^https?://.*\.jpg$',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.bet.com/video/news/national/2014/justice-for-ferguson-a-community-reacts.html',
+ 'info_dict': {
+ 'id': 'bcd1b1df-673a-42cf-8d01-b282db608f2d',
+ 'display_id': 'justice-for-ferguson-a-community-reacts',
+ 'ext': 'flv',
+ 'title': 'Justice for Ferguson: A Community Reacts',
+ 'description': 'A BET News special.',
+ 'duration': 1696,
+ 'timestamp': 1416942360,
+ 'upload_date': '20141125',
+ 'uploader': 'admin',
+ 'thumbnail': 're:(?i)^https?://.*\.jpg$',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }
+ ]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+
+ media_url = compat_urllib_parse.unquote(self._search_regex(
+ [r'mediaURL\s*:\s*"([^"]+)"', r"var\s+mrssMediaUrl\s*=\s*'([^']+)'"],
+ webpage, 'media URL'))
+
+ mrss = self._download_xml(media_url, display_id)
+
+ item = mrss.find('./channel/item')
+
+ NS_MAP = {
+ 'dc': 'http://purl.org/dc/elements/1.1/',
+ 'media': 'http://search.yahoo.com/mrss/',
+ 'ka': 'http://kickapps.com/karss',
+ }
+
+ title = xpath_text(item, './title', 'title')
+ description = xpath_text(
+ item, './description', 'description', fatal=False)
+
+ video_id = xpath_text(item, './guid', 'video id', fatal=False)
+
+ timestamp = parse_iso8601(xpath_text(
+ item, xpath_with_ns('./dc:date', NS_MAP),
+ 'upload date', fatal=False))
+ uploader = xpath_text(
+ item, xpath_with_ns('./dc:creator', NS_MAP),
+ 'uploader', fatal=False)
+
+ media_content = item.find(
+ xpath_with_ns('./media:content', NS_MAP))
+ duration = int_or_none(media_content.get('duration'))
+ smil_url = media_content.get('url')
+
+ thumbnail = media_content.find(
+ xpath_with_ns('./media:thumbnail', NS_MAP)).get('url')
+
+ formats = self._extract_smil_formats(smil_url, display_id)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'timestamp': timestamp,
+ 'uploader': uploader,
+ 'duration': duration,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/bild.py b/youtube_dl/extractor/bild.py
new file mode 100644
index 000000000..77b562d99
--- /dev/null
+++ b/youtube_dl/extractor/bild.py
@@ -0,0 +1,39 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class BildIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?bild\.de/(?:[^/]+/)+(?P<display_id>[^/]+)-(?P<id>\d+)(?:,auto=true)?\.bild\.html'
+ IE_DESC = 'Bild.de'
+ _TEST = {
+ 'url': 'http://www.bild.de/video/clip/apple-ipad-air/das-koennen-die-neuen-ipads-38184146.bild.html',
+ 'md5': 'dd495cbd99f2413502a1713a1156ac8a',
+ 'info_dict': {
+ 'id': '38184146',
+ 'ext': 'mp4',
+ 'title': 'BILD hat sie getestet',
+ 'thumbnail': 'http://bilder.bild.de/fotos/stand-das-koennen-die-neuen-ipads-38184138/Bild/1.bild.jpg',
+ 'duration': 196,
+ 'description': 'Mit dem iPad Air 2 und dem iPad Mini 3 hat Apple zwei neue Tablet-Modelle präsentiert. BILD-Reporter Sven Stein durfte die Geräte bereits testen. ',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ xml_url = url.split(".bild.html")[0] + ",view=xml.bild.xml"
+ doc = self._download_xml(xml_url, video_id)
+
+ duration = int_or_none(doc.attrib.get('duration'), scale=1000)
+
+ return {
+ 'id': video_id,
+ 'title': doc.attrib['ueberschrift'],
+ 'description': doc.attrib.get('text'),
+ 'url': doc.attrib['src'],
+ 'thumbnail': doc.attrib.get('img'),
+ 'duration': duration,
+ }
diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py
index 0d5889f5d..75d744852 100644
--- a/youtube_dl/extractor/bilibili.py
+++ b/youtube_dl/extractor/bilibili.py
@@ -5,8 +5,6 @@ import re
from .common import InfoExtractor
from ..utils import (
- compat_parse_qs,
- ExtractorError,
int_or_none,
unified_strdate,
)
@@ -29,10 +27,9 @@ class BiliBiliIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
+
video_code = self._search_regex(
r'(?s)<div itemprop="video".*?>(.*?)</div>', webpage, 'video code')
@@ -55,45 +52,38 @@ class BiliBiliIE(InfoExtractor):
thumbnail = self._html_search_meta(
'thumbnailUrl', video_code, 'thumbnail', fatal=False)
- player_params = compat_parse_qs(self._html_search_regex(
- r'<iframe .*?class="player" src="https://secure\.bilibili\.(?:tv|com)/secure,([^"]+)"',
- webpage, 'player params'))
+ cid = self._search_regex(r'cid=(\d+)', webpage, 'cid')
- if 'cid' in player_params:
- cid = player_params['cid'][0]
+ lq_doc = self._download_xml(
+ 'http://interface.bilibili.com/v_cdn_play?appkey=1&cid=%s' % cid,
+ video_id,
+ note='Downloading LQ video info'
+ )
+ lq_durl = lq_doc.find('./durl')
+ formats = [{
+ 'format_id': 'lq',
+ 'quality': 1,
+ 'url': lq_durl.find('./url').text,
+ 'filesize': int_or_none(
+ lq_durl.find('./size'), get_attr='text'),
+ }]
- lq_doc = self._download_xml(
- 'http://interface.bilibili.cn/v_cdn_play?cid=%s' % cid,
- video_id,
- note='Downloading LQ video info'
- )
- lq_durl = lq_doc.find('.//durl')
- formats = [{
- 'format_id': 'lq',
- 'quality': 1,
- 'url': lq_durl.find('./url').text,
+ hq_doc = self._download_xml(
+ 'http://interface.bilibili.com/playurl?appkey=1&cid=%s' % cid,
+ video_id,
+ note='Downloading HQ video info',
+ fatal=False,
+ )
+ if hq_doc is not False:
+ hq_durl = hq_doc.find('./durl')
+ formats.append({
+ 'format_id': 'hq',
+ 'quality': 2,
+ 'ext': 'flv',
+ 'url': hq_durl.find('./url').text,
'filesize': int_or_none(
- lq_durl.find('./size'), get_attr='text'),
- }]
-
- hq_doc = self._download_xml(
- 'http://interface.bilibili.cn/playurl?cid=%s' % cid,
- video_id,
- note='Downloading HQ video info',
- fatal=False,
- )
- if hq_doc is not False:
- hq_durl = hq_doc.find('.//durl')
- formats.append({
- 'format_id': 'hq',
- 'quality': 2,
- 'ext': 'flv',
- 'url': hq_durl.find('./url').text,
- 'filesize': int_or_none(
- hq_durl.find('./size'), get_attr='text'),
- })
- else:
- raise ExtractorError('Unsupported player parameters: %r' % (player_params,))
+ hq_durl.find('./size'), get_attr='text'),
+ })
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py
index 57d17bea3..436cc5155 100644
--- a/youtube_dl/extractor/bliptv.py
+++ b/youtube_dl/extractor/bliptv.py
@@ -4,13 +4,17 @@ import re
from .common import InfoExtractor
from .subtitles import SubtitlesInfoExtractor
-from ..utils import (
+
+from ..compat import (
+ compat_str,
compat_urllib_request,
- unescapeHTML,
- parse_iso8601,
compat_urlparse,
+)
+from ..utils import (
clean_html,
- compat_str,
+ int_or_none,
+ parse_iso8601,
+ unescapeHTML,
)
@@ -64,20 +68,55 @@ class BlipTVIE(SubtitlesInfoExtractor):
'uploader': 'redvsblue',
'uploader_id': '792887',
}
- }
+ },
+ {
+ 'url': 'http://blip.tv/play/gbk766dkj4Yn',
+ 'md5': 'fe0a33f022d49399a241e84a8ea8b8e3',
+ 'info_dict': {
+ 'id': '1749452',
+ 'ext': 'mp4',
+ 'upload_date': '20090208',
+ 'description': 'Witness the first appearance of the Nostalgia Critic character, as Doug reviews the movie Transformers.',
+ 'title': 'Nostalgia Critic: Transformers',
+ 'timestamp': 1234068723,
+ 'uploader': 'NostalgiaCritic',
+ 'uploader_id': '246467',
+ }
+ },
+ {
+ # https://github.com/rg3/youtube-dl/pull/4404
+ 'note': 'Audio only',
+ 'url': 'http://blip.tv/hilarios-productions/weekly-manga-recap-kingdom-7119982',
+ 'md5': '76c0a56f24e769ceaab21fbb6416a351',
+ 'info_dict': {
+ 'id': '7103299',
+ 'ext': 'flv',
+ 'title': 'Weekly Manga Recap: Kingdom',
+ 'description': 'And then Shin breaks the enemy line, and he&apos;s all like HWAH! And then he slices a guy and it&apos;s all like FWASHING! And... it&apos;s really hard to describe the best parts of this series without breaking down into sound effects, okay?',
+ 'timestamp': 1417660321,
+ 'upload_date': '20141204',
+ 'uploader': 'The Rollo T',
+ 'uploader_id': '407429',
+ 'duration': 7251,
+ 'vcodec': 'none',
+ }
+ },
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
lookup_id = mobj.group('lookup_id')
- # See https://github.com/rg3/youtube-dl/issues/857
+ # See https://github.com/rg3/youtube-dl/issues/857 and
+ # https://github.com/rg3/youtube-dl/issues/4197
if lookup_id:
- info_page = self._download_webpage(
- 'http://blip.tv/play/%s.x?p=1' % lookup_id, lookup_id, 'Resolving lookup id')
- video_id = self._search_regex(r'data-episode-id="([0-9]+)', info_page, 'video_id')
- else:
- video_id = mobj.group('id')
+ urlh = self._request_webpage(
+ 'http://blip.tv/play/%s' % lookup_id, lookup_id, 'Resolving lookup id')
+ url = compat_urlparse.urlparse(urlh.geturl())
+ qs = compat_urlparse.parse_qs(url.query)
+ mobj = re.match(self._VALID_URL, qs['file'][0])
+
+ video_id = mobj.group('id')
rss = self._download_xml('http://blip.tv/rss/flash/%s' % video_id, video_id, 'Downloading video RSS')
@@ -113,7 +152,7 @@ class BlipTVIE(SubtitlesInfoExtractor):
msg = self._download_webpage(
url + '?showplayer=20140425131715&referrer=http://blip.tv&mask=7&skin=flashvars&view=url',
video_id, 'Resolving URL for %s' % role)
- real_url = compat_urlparse.parse_qs(msg)['message'][0]
+ real_url = compat_urlparse.parse_qs(msg.strip())['message'][0]
media_type = media_content.get('type')
if media_type == 'text/srt' or url.endswith('.srt'):
@@ -128,11 +167,11 @@ class BlipTVIE(SubtitlesInfoExtractor):
'url': real_url,
'format_id': role,
'format_note': media_type,
- 'vcodec': media_content.get(blip('vcodec')),
+ 'vcodec': media_content.get(blip('vcodec')) or 'none',
'acodec': media_content.get(blip('acodec')),
'filesize': media_content.get('filesize'),
- 'width': int(media_content.get('width')),
- 'height': int(media_content.get('height')),
+ 'width': int_or_none(media_content.get('width')),
+ 'height': int_or_none(media_content.get('height')),
})
self._sort_formats(formats)
@@ -160,14 +199,22 @@ class BlipTVIE(SubtitlesInfoExtractor):
# For some weird reason, blip.tv serves a video instead of subtitles
# when we request with a common UA
req = compat_urllib_request.Request(url)
- req.add_header('Youtubedl-user-agent', 'youtube-dl')
+ req.add_header('User-Agent', 'youtube-dl')
return self._download_webpage(req, None, note=False)
class BlipTVUserIE(InfoExtractor):
- _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)(?!api\.swf)([^/]+)/*$'
+ _VALID_URL = r'(?:(?:https?://(?:\w+\.)?blip\.tv/)|bliptvuser:)(?!api\.swf)([^/]+)/*$'
_PAGE_SIZE = 12
IE_NAME = 'blip.tv:user'
+ _TEST = {
+ 'url': 'http://blip.tv/actone',
+ 'info_dict': {
+ 'id': 'actone',
+ 'title': 'Act One: The Series',
+ },
+ 'playlist_count': 5,
+ }
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -178,6 +225,7 @@ class BlipTVUserIE(InfoExtractor):
page = self._download_webpage(url, username, 'Downloading user page')
mobj = re.search(r'data-users-id="([^"]+)"', page)
page_base = page_base % mobj.group(1)
+ title = self._og_search_title(page)
# Download video ids using BlipTV Ajax calls. Result size per
# query is limited (currently to 12 videos) so we need to query
@@ -214,4 +262,5 @@ class BlipTVUserIE(InfoExtractor):
urls = ['http://blip.tv/%s' % video_id for video_id in video_ids]
url_entries = [self.url_result(vurl, 'BlipTV') for vurl in urls]
- return [self.playlist_result(url_entries, playlist_title=username)]
+ return self.playlist_result(
+ url_entries, playlist_title=title, playlist_id=username)
diff --git a/youtube_dl/extractor/bpb.py b/youtube_dl/extractor/bpb.py
new file mode 100644
index 000000000..510813f76
--- /dev/null
+++ b/youtube_dl/extractor/bpb.py
@@ -0,0 +1,37 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class BpbIE(InfoExtractor):
+ IE_DESC = 'Bundeszentrale für politische Bildung'
+ _VALID_URL = r'http://www\.bpb\.de/mediathek/(?P<id>[0-9]+)/'
+
+ _TEST = {
+ 'url': 'http://www.bpb.de/mediathek/297/joachim-gauck-zu-1989-und-die-erinnerung-an-die-ddr',
+ 'md5': '0792086e8e2bfbac9cdf27835d5f2093',
+ 'info_dict': {
+ 'id': '297',
+ 'ext': 'mp4',
+ 'title': 'Joachim Gauck zu 1989 und die Erinnerung an die DDR',
+ 'description': 'Joachim Gauck, erster Beauftragter für die Stasi-Unterlagen, spricht auf dem Geschichtsforum über die friedliche Revolution 1989 und eine "gewisse Traurigkeit" im Umgang mit der DDR-Vergangenheit.'
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._html_search_regex(
+ r'<h2 class="white">(.*?)</h2>', webpage, 'title')
+ video_url = self._html_search_regex(
+ r'(http://film\.bpb\.de/player/dokument_[0-9]+\.mp4)',
+ webpage, 'video URL')
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': title,
+ 'description': self._og_search_description(webpage),
+ }
diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py
index 2e277c8c3..45ba51732 100644
--- a/youtube_dl/extractor/br.py
+++ b/youtube_dl/extractor/br.py
@@ -1,8 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..utils import (
ExtractorError,
diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py
index 2c0e5eea2..4bcc897c9 100644
--- a/youtube_dl/extractor/breakcom.py
+++ b/youtube_dl/extractor/breakcom.py
@@ -14,7 +14,6 @@ class BreakIE(InfoExtractor):
_VALID_URL = r'http://(?:www\.)?break\.com/video/(?:[^/]+/)*.+-(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.break.com/video/when-girls-act-like-guys-2468056',
- 'md5': '33aa4ff477ecd124d18d7b5d23b87ce5',
'info_dict': {
'id': '2468056',
'ext': 'mp4',
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index ad22cbafd..003152c4e 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -6,24 +6,26 @@ import json
import xml.etree.ElementTree
from .common import InfoExtractor
-from ..utils import (
- compat_urllib_parse,
- find_xpath_attr,
- fix_xml_ampersands,
- compat_urlparse,
+from ..compat import (
+ compat_parse_qs,
compat_str,
+ compat_urllib_parse,
+ compat_urllib_parse_urlparse,
compat_urllib_request,
- compat_parse_qs,
-
+ compat_urlparse,
+)
+from ..utils import (
determine_ext,
ExtractorError,
- unsmuggle_url,
+ find_xpath_attr,
+ fix_xml_ampersands,
unescapeHTML,
+ unsmuggle_url,
)
class BrightcoveIE(InfoExtractor):
- _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*\?(?P<query>.*)'
+ _VALID_URL = r'(?:https?://.*brightcove\.com/(services|viewer).*?\?|brightcove:)(?P<query>.*)'
_FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s'
_TESTS = [
@@ -110,6 +112,8 @@ class BrightcoveIE(InfoExtractor):
lambda m: m.group(1) + '/>', object_str)
# Fix up some stupid XML, see https://github.com/rg3/youtube-dl/issues/1608
object_str = object_str.replace('<--', '<!--')
+ # remove namespace to simplify extraction
+ object_str = re.sub(r'(<object[^>]*)(xmlns=".*?")', r'\1', object_str)
object_str = fix_xml_ampersands(object_str)
object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8'))
@@ -218,7 +222,7 @@ class BrightcoveIE(InfoExtractor):
webpage = self._download_webpage(req, video_id)
error_msg = self._html_search_regex(
- r"<h1>We're sorry.</h1>\s*<p>(.*?)</p>", webpage,
+ r"<h1>We're sorry.</h1>([\s\n]*<p>.*?</p>)+", webpage,
'error message', default=None)
if error_msg is not None:
raise ExtractorError(
@@ -260,12 +264,21 @@ class BrightcoveIE(InfoExtractor):
formats = []
for rend in renditions:
url = rend['defaultURL']
+ if not url:
+ continue
+ ext = None
if rend['remote']:
- # This type of renditions are served through akamaihd.net,
- # but they don't use f4m manifests
- url = url.replace('control/', '') + '?&v=3.3.0&fp=13&r=FEEFJ&g=RTSJIMBMPFPB'
- ext = 'flv'
- else:
+ url_comp = compat_urllib_parse_urlparse(url)
+ if url_comp.path.endswith('.m3u8'):
+ formats.extend(
+ self._extract_m3u8_formats(url, info['id'], 'mp4'))
+ continue
+ elif 'akamaihd.net' in url_comp.netloc:
+ # This type of renditions are served through
+ # akamaihd.net, but they don't use f4m manifests
+ url = url.replace('control/', '') + '?&v=3.3.0&fp=13&r=FEEFJ&g=RTSJIMBMPFPB'
+ ext = 'flv'
+ if ext is None:
ext = determine_ext(url)
size = rend.get('size')
formats.append({
diff --git a/youtube_dl/extractor/buzzfeed.py b/youtube_dl/extractor/buzzfeed.py
new file mode 100644
index 000000000..a5d2af174
--- /dev/null
+++ b/youtube_dl/extractor/buzzfeed.py
@@ -0,0 +1,74 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+
+
+class BuzzFeedIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?buzzfeed\.com/[^?#]*?/(?P<id>[^?#]+)'
+ _TESTS = [{
+ 'url': 'http://www.buzzfeed.com/abagg/this-angry-ram-destroys-a-punching-bag-like-a-boss?utm_term=4ldqpia',
+ 'info_dict': {
+ 'id': 'this-angry-ram-destroys-a-punching-bag-like-a-boss',
+ 'title': 'This Angry Ram Destroys A Punching Bag Like A Boss',
+ 'description': 'Rambro!',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': 'aVCR29aE_OQ',
+ 'ext': 'mp4',
+ 'upload_date': '20141024',
+ 'uploader_id': 'Buddhanz1',
+ 'description': 'He likes to stay in shape with his heavy bag, he wont stop until its on the ground\n\nFollow Angry Ram on Facebook for regular updates -\nhttps://www.facebook.com/pages/Angry-Ram/1436897249899558?ref=hl',
+ 'uploader': 'Buddhanz',
+ 'title': 'Angry Ram destroys a punching bag',
+ }
+ }]
+ }, {
+ 'url': 'http://www.buzzfeed.com/sheridanwatson/look-at-this-cute-dog-omg?utm_term=4ldqpia',
+ 'params': {
+ 'skip_download': True, # Got enough YouTube download tests
+ },
+ 'info_dict': {
+ 'description': 're:Munchkin the Teddy Bear is back ?!',
+ 'title': 'You Need To Stop What You\'re Doing And Watching This Dog Walk On A Treadmill',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': 'mVmBL8B-In0',
+ 'ext': 'mp4',
+ 'upload_date': '20141124',
+ 'uploader_id': 'CindysMunchkin',
+ 'description': 're:© 2014 Munchkin the Shih Tzu',
+ 'uploader': 'Munchkin the Shih Tzu',
+ 'title': 're:Munchkin the Teddy Bear gets her exercise',
+ },
+ }]
+ }]
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+ webpage = self._download_webpage(url, playlist_id)
+
+ all_buckets = re.findall(
+ r'(?s)<div class="video-embed[^"]*"..*?rel:bf_bucket_data=\'([^\']+)\'',
+ webpage)
+
+ entries = []
+ for bd_json in all_buckets:
+ bd = json.loads(bd_json)
+ video = bd.get('video') or bd.get('progload_video')
+ if not video:
+ continue
+ entries.append(self.url_result(video['url']))
+
+ return {
+ '_type': 'playlist',
+ 'id': playlist_id,
+ 'title': self._og_search_title(webpage),
+ 'description': self._og_search_description(webpage),
+ 'entries': entries,
+ }
diff --git a/youtube_dl/extractor/byutv.py b/youtube_dl/extractor/byutv.py
index cf19b7b0c..6252be05b 100644
--- a/youtube_dl/extractor/byutv.py
+++ b/youtube_dl/extractor/byutv.py
@@ -10,12 +10,12 @@ from ..utils import ExtractorError
class BYUtvIE(InfoExtractor):
_VALID_URL = r'^https?://(?:www\.)?byutv.org/watch/[0-9a-f-]+/(?P<video_id>[^/?#]+)'
_TEST = {
- 'url': 'http://www.byutv.org/watch/44e80f7b-e3ba-43ba-8c51-b1fd96c94a79/granite-flats-talking',
+ 'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d/studio-c-season-5-episode-5',
'info_dict': {
- 'id': 'granite-flats-talking',
+ 'id': 'studio-c-season-5-episode-5',
'ext': 'mp4',
- 'description': 'md5:4e9a7ce60f209a33eca0ac65b4918e1c',
- 'title': 'Talking',
+ 'description': 'md5:5438d33774b6bdc662f9485a340401cc',
+ 'title': 'Season 5 Episode 5',
'thumbnail': 're:^https?://.*promo.*'
},
'params': {
diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py
index 0202078b0..11d18d74a 100644
--- a/youtube_dl/extractor/canalplus.py
+++ b/youtube_dl/extractor/canalplus.py
@@ -5,17 +5,25 @@ import re
from .common import InfoExtractor
from ..utils import (
+ ExtractorError,
+ HEADRequest,
unified_strdate,
url_basename,
+ qualities,
)
class CanalplusIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.canalplus\.fr/.*?/(?P<path>.*)|player\.canalplus\.fr/#/(?P<id>[0-9]+))'
- _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s'
- IE_NAME = 'canalplus.fr'
+ IE_DESC = 'canalplus.fr, piwiplus.fr and d8.tv'
+ _VALID_URL = r'https?://(?:www\.(?P<site>canalplus\.fr|piwiplus\.fr|d8\.tv)/.*?/(?P<path>.*)|player\.canalplus\.fr/#/(?P<id>[0-9]+))'
+ _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/%s/%s'
+ _SITE_ID_MAP = {
+ 'canalplus.fr': 'cplus',
+ 'piwiplus.fr': 'teletoon',
+ 'd8.tv': 'd8',
+ }
- _TEST = {
+ _TESTS = [{
'url': 'http://www.canalplus.fr/c-infos-documentaires/pid1830-c-zapping.html?vid=922470',
'md5': '3db39fb48b9685438ecf33a1078023e4',
'info_dict': {
@@ -25,36 +33,83 @@ class CanalplusIE(InfoExtractor):
'description': 'Le meilleur de toutes les chaînes, tous les jours.\nEmission du 26 août 2013',
'upload_date': '20130826',
},
- }
+ }, {
+ 'url': 'http://www.piwiplus.fr/videos-piwi/pid1405-le-labyrinthe-boing-super-ranger.html?vid=1108190',
+ 'info_dict': {
+ 'id': '1108190',
+ 'ext': 'flv',
+ 'title': 'Le labyrinthe - Boing super ranger',
+ 'description': 'md5:4cea7a37153be42c1ba2c1d3064376ff',
+ 'upload_date': '20140724',
+ },
+ 'skip': 'Only works from France',
+ }, {
+ 'url': 'http://www.d8.tv/d8-docs-mags/pid6589-d8-campagne-intime.html',
+ 'info_dict': {
+ 'id': '966289',
+ 'ext': 'flv',
+ 'title': 'Campagne intime - Documentaire exceptionnel',
+ 'description': 'md5:d2643b799fb190846ae09c61e59a859f',
+ 'upload_date': '20131108',
+ },
+ 'skip': 'videos get deleted after a while',
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.groupdict().get('id')
+ site_id = self._SITE_ID_MAP[mobj.group('site') or 'canal']
+
# Beware, some subclasses do not define an id group
display_id = url_basename(mobj.group('path'))
if video_id is None:
webpage = self._download_webpage(url, display_id)
- video_id = self._search_regex(r'<canal:player videoId="(\d+)"', webpage, 'video id')
+ video_id = self._search_regex(
+ r'<canal:player[^>]+?videoId="(\d+)"', webpage, 'video id')
- info_url = self._VIDEO_INFO_TEMPLATE % video_id
+ info_url = self._VIDEO_INFO_TEMPLATE % (site_id, video_id)
doc = self._download_xml(info_url, video_id, 'Downloading video XML')
video_info = [video for video in doc if video.find('ID').text == video_id][0]
media = video_info.find('MEDIA')
infos = video_info.find('INFOS')
- preferences = ['MOBILE', 'BAS_DEBIT', 'HAUT_DEBIT', 'HD', 'HLS', 'HDS']
+ preference = qualities(['MOBILE', 'BAS_DEBIT', 'HAUT_DEBIT', 'HD', 'HLS', 'HDS'])
+
+ fmt_url = next(iter(media.find('VIDEOS'))).text
+ if '/geo' in fmt_url.lower():
+ response = self._request_webpage(
+ HEADRequest(fmt_url), video_id,
+ 'Checking if the video is georestricted')
+ if '/blocage' in response.geturl():
+ raise ExtractorError(
+ 'The video is not available in your country',
+ expected=True)
- formats = [
- {
- 'url': fmt.text + '?hdcore=2.11.3' if fmt.tag == 'HDS' else fmt.text,
- 'format_id': fmt.tag,
- 'ext': 'mp4' if fmt.tag == 'HLS' else 'flv',
- 'preference': preferences.index(fmt.tag) if fmt.tag in preferences else -1,
- } for fmt in media.find('VIDEOS') if fmt.text
- ]
+ formats = []
+ for fmt in media.find('VIDEOS'):
+ format_url = fmt.text
+ if not format_url:
+ continue
+ format_id = fmt.tag
+ if format_id == 'HLS':
+ hls_formats = self._extract_m3u8_formats(format_url, video_id, 'flv')
+ for fmt in hls_formats:
+ fmt['preference'] = preference(format_id)
+ formats.extend(hls_formats)
+ elif format_id == 'HDS':
+ hds_formats = self._extract_f4m_formats(format_url + '?hdcore=2.11.3', video_id)
+ for fmt in hds_formats:
+ fmt['preference'] = preference(format_id)
+ formats.extend(hds_formats)
+ else:
+ formats.append({
+ 'url': format_url,
+ 'format_id': format_id,
+ 'preference': preference(format_id),
+ })
self._sort_formats(formats)
return {
@@ -69,4 +124,4 @@ class CanalplusIE(InfoExtractor):
'like_count': int(infos.find('NB_LIKES').text),
'comment_count': int(infos.find('NB_COMMENTS').text),
'formats': formats,
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py
index db48dc24f..e43756ec6 100644
--- a/youtube_dl/extractor/cbs.py
+++ b/youtube_dl/extractor/cbs.py
@@ -45,4 +45,4 @@ class CBSIE(InfoExtractor):
real_id = self._search_regex(
r"video\.settings\.pid\s*=\s*'([^']+)';",
webpage, 'real video ID')
- return self.url_result(u'theplatform:%s' % real_id)
+ return self.url_result('theplatform:%s' % real_id)
diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py
index 0bce7937f..7e47960ab 100644
--- a/youtube_dl/extractor/cbsnews.py
+++ b/youtube_dl/extractor/cbsnews.py
@@ -84,4 +84,4 @@ class CBSNewsIE(InfoExtractor):
'thumbnail': thumbnail,
'duration': duration,
'formats': formats,
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py
index 90a3dddb9..f70e090bb 100644
--- a/youtube_dl/extractor/ceskatelevize.py
+++ b/youtube_dl/extractor/ceskatelevize.py
@@ -3,55 +3,50 @@ from __future__ import unicode_literals
import re
-from .common import InfoExtractor
-from ..utils import (
+from .subtitles import SubtitlesInfoExtractor
+from ..compat import (
compat_urllib_request,
compat_urllib_parse,
compat_urllib_parse_urlparse,
+)
+from ..utils import (
ExtractorError,
+ float_or_none,
)
-class CeskaTelevizeIE(InfoExtractor):
+class CeskaTelevizeIE(SubtitlesInfoExtractor):
_VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(.+/)?(?P<id>[^?#]+)'
_TESTS = [
{
- 'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/213512120230004-spanelska-chripka',
- 'info_dict': {
- 'id': '213512120230004',
- 'ext': 'flv',
- 'title': 'První republika: Španělská chřipka',
- 'duration': 3107.4,
- },
- 'params': {
- 'skip_download': True, # requires rtmpdump
- },
- 'skip': 'Works only from Czech Republic.',
- },
- {
- 'url': 'http://www.ceskatelevize.cz/ivysilani/1030584952-tsatsiki-maminka-a-policajt',
+ 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220',
'info_dict': {
- 'id': '20138143440',
- 'ext': 'flv',
- 'title': 'Tsatsiki, maminka a policajt',
- 'duration': 6754.1,
+ 'id': '214411058091220',
+ 'ext': 'mp4',
+ 'title': 'Hyde Park Civilizace',
+ 'description': 'Věda a současná civilizace. Interaktivní pořad - prostor pro vaše otázky a komentáře',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 3350,
},
'params': {
- 'skip_download': True, # requires rtmpdump
+ # m3u8 download
+ 'skip_download': True,
},
- 'skip': 'Works only from Czech Republic.',
},
{
'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/bonus/14716-zpevacka-z-duparny-bobina',
'info_dict': {
'id': '14716',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'První republika: Zpěvačka z Dupárny Bobina',
- 'duration': 90,
+ 'description': 'Sága mapující atmosféru první republiky od r. 1918 do r. 1945.',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 88.4,
},
'params': {
- 'skip_download': True, # requires rtmpdump
+ # m3u8 download
+ 'skip_download': True,
},
},
]
@@ -78,8 +73,9 @@ class CeskaTelevizeIE(InfoExtractor):
'requestSource': 'iVysilani',
}
- req = compat_urllib_request.Request('http://www.ceskatelevize.cz/ivysilani/ajax/get-playlist-url',
- data=compat_urllib_parse.urlencode(data))
+ req = compat_urllib_request.Request(
+ 'http://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist',
+ data=compat_urllib_parse.urlencode(data))
req.add_header('Content-type', 'application/x-www-form-urlencoded')
req.add_header('x-addr', '127.0.0.1')
@@ -88,39 +84,72 @@ class CeskaTelevizeIE(InfoExtractor):
playlistpage = self._download_json(req, video_id)
- req = compat_urllib_request.Request(compat_urllib_parse.unquote(playlistpage['url']))
+ playlist_url = playlistpage['url']
+ if playlist_url == 'error_region':
+ raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)
+
+ req = compat_urllib_request.Request(compat_urllib_parse.unquote(playlist_url))
req.add_header('Referer', url)
- playlist = self._download_xml(req, video_id)
-
- formats = []
- for i in playlist.find('smilRoot/body'):
- if 'AD' not in i.attrib['id']:
- base_url = i.attrib['base']
- parsedurl = compat_urllib_parse_urlparse(base_url)
- duration = i.attrib['duration']
-
- for video in i.findall('video'):
- if video.attrib['label'] != 'AD':
- format_id = video.attrib['label']
- play_path = video.attrib['src']
- vbr = int(video.attrib['system-bitrate'])
-
- formats.append({
- 'format_id': format_id,
- 'url': base_url,
- 'vbr': vbr,
- 'play_path': play_path,
- 'app': parsedurl.path[1:] + '?' + parsedurl.query,
- 'rtmp_live': True,
- 'ext': 'flv',
- })
+ playlist = self._download_json(req, video_id)
+ item = playlist['playlist'][0]
+ formats = []
+ for format_id, stream_url in item['streamUrls'].items():
+ formats.extend(self._extract_m3u8_formats(stream_url, video_id, 'mp4'))
self._sort_formats(formats)
+ title = self._og_search_title(webpage)
+ description = self._og_search_description(webpage)
+ duration = float_or_none(item.get('duration'))
+ thumbnail = item.get('previewImageUrl')
+
+ subtitles = {}
+ subs = item.get('subtitles')
+ if subs:
+ subtitles['cs'] = subs[0]['url']
+
+ if self._downloader.params.get('listsubtitles', False):
+ self._list_available_subtitles(video_id, subtitles)
+ return
+
+ subtitles = self._fix_subtitles(self.extract_subtitles(video_id, subtitles))
+
return {
'id': episode_id,
- 'title': self._html_search_regex(r'<title>(.+?) — iVysílání — Česká televize</title>', webpage, 'title'),
- 'duration': float(duration),
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
'formats': formats,
+ 'subtitles': subtitles,
}
+
+ @staticmethod
+ def _fix_subtitles(subtitles):
+ """ Convert millisecond-based subtitles to SRT """
+ if subtitles is None:
+ return subtitles # subtitles not requested
+
+ def _msectotimecode(msec):
+ """ Helper utility to convert milliseconds to timecode """
+ components = []
+ for divider in [1000, 60, 60, 100]:
+ components.append(msec % divider)
+ msec //= divider
+ return "{3:02}:{2:02}:{1:02},{0:03}".format(*components)
+
+ def _fix_subtitle(subtitle):
+ for line in subtitle.splitlines():
+ m = re.match(r"^\s*([0-9]+);\s*([0-9]+)\s+([0-9]+)\s*$", line)
+ if m:
+ yield m.group(1)
+ start, stop = (_msectotimecode(int(t)) for t in m.groups()[1:])
+ yield "{0} --> {1}".format(start, stop)
+ else:
+ yield line
+
+ fixed_subtitles = {}
+ for k, v in subtitles.items():
+ fixed_subtitles[k] = "\r\n".join(_fix_subtitle(v))
+ return fixed_subtitles
diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py
index 4f000292b..3dfc24f5b 100644
--- a/youtube_dl/extractor/channel9.py
+++ b/youtube_dl/extractor/channel9.py
@@ -5,6 +5,7 @@ import re
from .common import InfoExtractor
from ..utils import ExtractorError
+
class Channel9IE(InfoExtractor):
'''
Common extractor for channel9.msdn.com.
@@ -27,11 +28,11 @@ class Channel9IE(InfoExtractor):
'title': 'Developer Kick-Off Session: Stuff We Love',
'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
'duration': 4576,
- 'thumbnail': 'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
+ 'thumbnail': 'http://video.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
'session_code': 'KOS002',
'session_day': 'Day 1',
'session_room': 'Arena 1A',
- 'session_speakers': [ 'Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen' ],
+ 'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'],
},
},
{
@@ -43,8 +44,8 @@ class Channel9IE(InfoExtractor):
'title': 'Self-service BI with Power BI - nuclear testing',
'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
'duration': 1540,
- 'thumbnail': 'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
- 'authors': [ 'Mike Wilmot' ],
+ 'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
+ 'authors': ['Mike Wilmot'],
},
}
]
@@ -83,7 +84,7 @@ class Channel9IE(InfoExtractor):
'format_id': x.group('quality'),
'format_note': x.group('note'),
'format': '%s (%s)' % (x.group('quality'), x.group('note')),
- 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate
+ 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate
'preference': self._known_formats.index(x.group('quality')),
'vcodec': 'none' if x.group('note') == 'Audio only' else None,
} for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]
@@ -94,7 +95,7 @@ class Channel9IE(InfoExtractor):
def _extract_title(self, html):
title = self._html_search_meta('title', html, 'title')
- if title is None:
+ if title is None:
title = self._og_search_title(html)
TITLE_SUFFIX = ' (Channel 9)'
if title is not None and title.endswith(TITLE_SUFFIX):
@@ -115,7 +116,7 @@ class Channel9IE(InfoExtractor):
return self._html_search_meta('description', html, 'description')
def _extract_duration(self, html):
- m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
+ m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None
def _extract_slides(self, html):
@@ -167,7 +168,7 @@ class Channel9IE(InfoExtractor):
return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)
def _extract_content(self, html, content_path):
- # Look for downloadable content
+ # Look for downloadable content
formats = self._formats_from_html(html)
slides = self._extract_slides(html)
zip_ = self._extract_zip(html)
@@ -187,32 +188,33 @@ class Channel9IE(InfoExtractor):
view_count = self._extract_view_count(html)
comment_count = self._extract_comment_count(html)
- common = {'_type': 'video',
- 'id': content_path,
- 'description': description,
- 'thumbnail': thumbnail,
- 'duration': duration,
- 'avg_rating': avg_rating,
- 'rating_count': rating_count,
- 'view_count': view_count,
- 'comment_count': comment_count,
- }
+ common = {
+ '_type': 'video',
+ 'id': content_path,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'avg_rating': avg_rating,
+ 'rating_count': rating_count,
+ 'view_count': view_count,
+ 'comment_count': comment_count,
+ }
result = []
if slides is not None:
d = common.copy()
- d.update({ 'title': title + '-Slides', 'url': slides })
+ d.update({'title': title + '-Slides', 'url': slides})
result.append(d)
if zip_ is not None:
d = common.copy()
- d.update({ 'title': title + '-Zip', 'url': zip_ })
+ d.update({'title': title + '-Zip', 'url': zip_})
result.append(d)
if len(formats) > 0:
d = common.copy()
- d.update({ 'title': title, 'formats': formats })
+ d.update({'title': title, 'formats': formats})
result.append(d)
return result
@@ -234,16 +236,17 @@ class Channel9IE(InfoExtractor):
if contents is None:
return contents
- session_meta = {'session_code': self._extract_session_code(html),
- 'session_day': self._extract_session_day(html),
- 'session_room': self._extract_session_room(html),
- 'session_speakers': self._extract_session_speakers(html),
- }
+ session_meta = {
+ 'session_code': self._extract_session_code(html),
+ 'session_day': self._extract_session_day(html),
+ 'session_room': self._extract_session_room(html),
+ 'session_speakers': self._extract_session_speakers(html),
+ }
for content in contents:
content.update(session_meta)
- return contents
+ return self.playlist_result(contents)
def _extract_list(self, content_path):
rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
@@ -258,16 +261,17 @@ class Channel9IE(InfoExtractor):
webpage = self._download_webpage(url, content_path, 'Downloading web page')
- page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage)
- if page_type_m is None:
- raise ExtractorError('Search.PageType not found, don\'t know how to process this page', expected=True)
-
- page_type = page_type_m.group('pagetype')
- if page_type == 'List': # List page, may contain list of 'item'-like objects
+ page_type_m = re.search(r'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage)
+ if page_type_m is not None:
+ page_type = page_type_m.group('pagetype')
+ if page_type == 'Entry': # Any 'item'-like page, may contain downloadable content
+ return self._extract_entry_item(webpage, content_path)
+ elif page_type == 'Session': # Event session page, may contain downloadable content
+ return self._extract_session(webpage, content_path)
+ elif page_type == 'Event':
+ return self._extract_list(content_path)
+ else:
+ raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)
+
+ else: # Assuming list
return self._extract_list(content_path)
- elif page_type == 'Entry.Item': # Any 'item'-like page, may contain downloadable content
- return self._extract_entry_item(webpage, content_path)
- elif page_type == 'Session': # Event session page, may contain downloadable content
- return self._extract_session(webpage, content_path)
- else:
- raise ExtractorError('Unexpected Search.PageType %s' % page_type, expected=True) \ No newline at end of file
diff --git a/youtube_dl/extractor/cinchcast.py b/youtube_dl/extractor/cinchcast.py
new file mode 100644
index 000000000..0c9a24bef
--- /dev/null
+++ b/youtube_dl/extractor/cinchcast.py
@@ -0,0 +1,52 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ unified_strdate,
+ xpath_text,
+)
+
+
+class CinchcastIE(InfoExtractor):
+ _VALID_URL = r'https?://player\.cinchcast\.com/.*?assetId=(?P<id>[0-9]+)'
+ _TEST = {
+ # Actual test is run in generic, look for undergroundwellness
+ 'url': 'http://player.cinchcast.com/?platformId=1&#038;assetType=single&#038;assetId=7141703',
+ 'only_matching': True,
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ doc = self._download_xml(
+ 'http://www.blogtalkradio.com/playerasset/mrss?assetType=single&assetId=%s' % video_id,
+ video_id)
+
+ item = doc.find('.//item')
+ title = xpath_text(item, './title', fatal=True)
+ date_str = xpath_text(
+ item, './{http://developer.longtailvideo.com/trac/}date')
+ upload_date = unified_strdate(date_str, day_first=False)
+ # duration is present but wrong
+ formats = []
+ formats.append({
+ 'format_id': 'main',
+ 'url': item.find(
+ './{http://search.yahoo.com/mrss/}content').attrib['url'],
+ })
+ backup_url = xpath_text(
+ item, './{http://developer.longtailvideo.com/trac/}backupContent')
+ if backup_url:
+ formats.append({
+ 'preference': 2, # seems to be more reliable
+ 'format_id': 'backup',
+ 'url': backup_url,
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'upload_date': upload_date,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py
deleted file mode 100644
index 496271be4..000000000
--- a/youtube_dl/extractor/cinemassacre.py
+++ /dev/null
@@ -1,100 +0,0 @@
-# encoding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
- int_or_none,
-)
-
-
-class CinemassacreIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?cinemassacre\.com/(?P<date_Y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/(?P<display_id>[^?#/]+)'
- _TESTS = [
- {
- 'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/',
- 'md5': 'fde81fbafaee331785f58cd6c0d46190',
- 'info_dict': {
- 'id': '19911',
- 'ext': 'mp4',
- 'upload_date': '20121110',
- 'title': '“Angry Video Game Nerd: The Movie” – Trailer',
- 'description': 'md5:fb87405fcb42a331742a0dce2708560b',
- },
- },
- {
- 'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940',
- 'md5': 'd72f10cd39eac4215048f62ab477a511',
- 'info_dict': {
- 'id': '521be8ef82b16',
- 'ext': 'mp4',
- 'upload_date': '20131002',
- 'title': 'The Mummy’s Hand (1940)',
- },
- }
- ]
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- display_id = mobj.group('display_id')
-
- webpage = self._download_webpage(url, display_id)
- video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d')
- mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?id=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage)
- if not mobj:
- raise ExtractorError('Can\'t extract embed url and video id')
- playerdata_url = mobj.group('embed_url')
- video_id = mobj.group('video_id')
-
- video_title = self._html_search_regex(
- r'<title>(?P<title>.+?)\|', webpage, 'title')
- video_description = self._html_search_regex(
- r'<div class="entry-content">(?P<description>.+?)</div>',
- webpage, 'description', flags=re.DOTALL, fatal=False)
-
- playerdata = self._download_webpage(playerdata_url, video_id, 'Downloading player webpage')
- video_thumbnail = self._search_regex(
- r'image: \'(?P<thumbnail>[^\']+)\'', playerdata, 'thumbnail', fatal=False)
- sd_url = self._search_regex(r'file: \'([^\']+)\', label: \'SD\'', playerdata, 'sd_file')
- videolist_url = self._search_regex(r'file: \'([^\']+\.smil)\'}', playerdata, 'videolist_url')
-
- videolist = self._download_xml(videolist_url, video_id, 'Downloading videolist XML')
-
- formats = []
- baseurl = sd_url[:sd_url.rfind('/')+1]
- for video in videolist.findall('.//video'):
- src = video.get('src')
- if not src:
- continue
- file_ = src.partition(':')[-1]
- width = int_or_none(video.get('width'))
- height = int_or_none(video.get('height'))
- bitrate = int_or_none(video.get('system-bitrate'))
- format = {
- 'url': baseurl + file_,
- 'format_id': src.rpartition('.')[0].rpartition('_')[-1],
- }
- if width or height:
- format.update({
- 'tbr': bitrate // 1000 if bitrate else None,
- 'width': width,
- 'height': height,
- })
- else:
- format.update({
- 'abr': bitrate // 1000 if bitrate else None,
- 'vcodec': 'none',
- })
- formats.append(format)
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'title': video_title,
- 'formats': formats,
- 'description': video_description,
- 'upload_date': video_date,
- 'thumbnail': video_thumbnail,
- }
diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py
index 669919a2c..a5c3cb7c6 100644
--- a/youtube_dl/extractor/clipfish.py
+++ b/youtube_dl/extractor/clipfish.py
@@ -24,7 +24,7 @@ class ClipfishIE(InfoExtractor):
'title': 'FIFA 14 - E3 2013 Trailer',
'duration': 82,
},
- u'skip': 'Blocked in the US'
+ 'skip': 'Blocked in the US'
}
def _real_extract(self, url):
@@ -34,7 +34,7 @@ class ClipfishIE(InfoExtractor):
info_url = ('http://www.clipfish.de/devxml/videoinfo/%s?ts=%d' %
(video_id, int(time.time())))
doc = self._download_xml(
- info_url, video_id, note=u'Downloading info page')
+ info_url, video_id, note='Downloading info page')
title = doc.find('title').text
video_url = doc.find('filename').text
if video_url is None:
diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py
index d4227e6eb..d46592cc5 100644
--- a/youtube_dl/extractor/cliphunter.py
+++ b/youtube_dl/extractor/cliphunter.py
@@ -1,10 +1,7 @@
from __future__ import unicode_literals
-import json
-import re
-
from .common import InfoExtractor
-from ..utils import int_or_none
+from ..utils import determine_ext
_translation_table = {
@@ -28,10 +25,10 @@ class CliphunterIE(InfoExtractor):
'''
_TEST = {
'url': 'http://www.cliphunter.com/w/1012420/Fun_Jynx_Maze_solo',
- 'md5': 'a2ba71eebf523859fe527a61018f723e',
+ 'md5': 'b7c9bbd4eb3a226ab91093714dcaa480',
'info_dict': {
'id': '1012420',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 'Fun Jynx Maze solo',
'thumbnail': 're:^https?://.*\.jpg$',
'age_limit': 18,
@@ -39,47 +36,37 @@ class CliphunterIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
video_title = self._search_regex(
r'mediaTitle = "([^"]+)"', webpage, 'title')
- pl_fiji = self._search_regex(
- r'pl_fiji = \'([^\']+)\'', webpage, 'video data')
- pl_c_qual = self._search_regex(
- r'pl_c_qual = "(.)"', webpage, 'video quality')
- video_url = _decode(pl_fiji)
- formats = [{
- 'url': video_url,
- 'format_id': 'default-%s' % pl_c_qual,
- }]
-
- qualities_json = self._search_regex(
- r'var pl_qualities\s*=\s*(.*?);\n', webpage, 'quality info')
- qualities_data = json.loads(qualities_json)
-
- for i, t in enumerate(
- re.findall(r"pl_fiji_([a-z0-9]+)\s*=\s*'([^']+')", webpage)):
- quality_id, crypted_url = t
- video_url = _decode(crypted_url)
+ fmts = {}
+ for fmt in ('mp4', 'flv'):
+ fmt_list = self._parse_json(self._search_regex(
+ r'var %sjson\s*=\s*(\[.*?\]);' % fmt, webpage, '%s formats' % fmt), video_id)
+ for f in fmt_list:
+ fmts[f['fname']] = _decode(f['sUrl'])
+
+ qualities = self._parse_json(self._search_regex(
+ r'var player_btns\s*=\s*(.*?);\n', webpage, 'quality info'), video_id)
+
+ formats = []
+ for fname, url in fmts.items():
f = {
- 'format_id': quality_id,
- 'url': video_url,
- 'quality': i,
+ 'url': url,
}
- if quality_id in qualities_data:
- qd = qualities_data[quality_id]
- m = re.match(
- r'''(?x)<b>(?P<width>[0-9]+)x(?P<height>[0-9]+)<\\/b>
- \s*\(\s*(?P<tbr>[0-9]+)\s*kb\\/s''', qd)
- if m:
- f['width'] = int(m.group('width'))
- f['height'] = int(m.group('height'))
- f['tbr'] = int(m.group('tbr'))
+ if fname in qualities:
+ qual = qualities[fname]
+ f.update({
+ 'format_id': '%s_%sp' % (determine_ext(url), qual['h']),
+ 'width': qual['w'],
+ 'height': qual['h'],
+ 'tbr': qual['br'],
+ })
formats.append(f)
+
self._sort_formats(formats)
thumbnail = self._search_regex(
diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py
index 02a1667fa..d07d544ea 100644
--- a/youtube_dl/extractor/clipsyndicate.py
+++ b/youtube_dl/extractor/clipsyndicate.py
@@ -39,6 +39,7 @@ class ClipsyndicateIE(InfoExtractor):
transform_source=fix_xml_ampersands)
track_doc = pdoc.find('trackList/track')
+
def find_param(name):
node = find_xpath_attr(track_doc, './/param', 'name', name)
if node is not None:
diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py
index 386f080d2..abf8cc280 100644
--- a/youtube_dl/extractor/cloudy.py
+++ b/youtube_dl/extractor/cloudy.py
@@ -4,14 +4,16 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
+from ..compat import (
compat_parse_qs,
compat_urllib_parse,
- remove_end,
- HEADRequest,
compat_HTTPError,
)
+from ..utils import (
+ ExtractorError,
+ HEADRequest,
+ remove_end,
+)
class CloudyIE(InfoExtractor):
diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py
index 710d5009b..3145b3051 100644
--- a/youtube_dl/extractor/cnet.py
+++ b/youtube_dl/extractor/cnet.py
@@ -2,12 +2,10 @@
from __future__ import unicode_literals
import json
-import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
- int_or_none,
)
@@ -15,23 +13,24 @@ class CNETIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?cnet\.com/videos/(?P<id>[^/]+)/'
_TEST = {
'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/',
- 'md5': '041233212a0d06b179c87cbcca1577b8',
'info_dict': {
'id': '56f4ea68-bd21-4852-b08c-4de5b8354c60',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 'Hands-on with Microsoft Windows 8.1 Update',
'description': 'The new update to the Windows 8 OS brings improved performance for mouse and keyboard users.',
'thumbnail': 're:^http://.*/flmswindows8.jpg$',
- 'uploader_id': 'sarah.mitroff@cbsinteractive.com',
+ 'uploader_id': '6085384d-619e-11e3-b231-14feb5ca9861',
'uploader': 'Sarah Mitroff',
+ },
+ 'params': {
+ 'skip_download': 'requires rtmpdump',
}
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- display_id = mobj.group('id')
-
+ display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
+
data_json = self._html_search_regex(
r"<div class=\"cnetVideoPlayer\"\s+.*?data-cnet-video-options='([^']+)'",
webpage, 'data json')
@@ -42,37 +41,31 @@ class CNETIE(InfoExtractor):
if not vdata:
raise ExtractorError('Cannot find video data')
+ mpx_account = data['config']['players']['default']['mpx_account']
+ vid = vdata['files']['rtmp']
+ tp_link = 'http://link.theplatform.com/s/%s/%s' % (mpx_account, vid)
+
video_id = vdata['id']
title = vdata.get('headline')
if title is None:
title = vdata.get('title')
if title is None:
raise ExtractorError('Cannot find title!')
- description = vdata.get('dek')
thumbnail = vdata.get('image', {}).get('path')
author = vdata.get('author')
if author:
uploader = '%s %s' % (author['firstName'], author['lastName'])
- uploader_id = author.get('email')
+ uploader_id = author.get('id')
else:
uploader = None
uploader_id = None
- formats = [{
- 'format_id': '%s-%s-%s' % (
- f['type'], f['format'],
- int_or_none(f.get('bitrate'), 1000, default='')),
- 'url': f['uri'],
- 'tbr': int_or_none(f.get('bitrate'), 1000),
- } for f in vdata['files']['data']]
- self._sort_formats(formats)
-
return {
+ '_type': 'url_transparent',
+ 'url': tp_link,
'id': video_id,
'display_id': display_id,
'title': title,
- 'formats': formats,
- 'description': description,
'uploader': uploader,
'uploader_id': uploader_id,
'thumbnail': thumbnail,
diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py
index 78877b1cf..90ea07438 100644
--- a/youtube_dl/extractor/cnn.py
+++ b/youtube_dl/extractor/cnn.py
@@ -11,36 +11,47 @@ from ..utils import (
class CNNIE(InfoExtractor):
- _VALID_URL = r'''(?x)https?://((edition|www)\.)?cnn\.com/video/(data/.+?|\?)/
- (?P<path>.+?/(?P<title>[^/]+?)(?:\.cnn(-ap)?|(?=&)))'''
+ _VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/
+ (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:cnn|hln)(?:-ap)?|(?=&)))'''
_TESTS = [{
'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn',
- 'file': 'sports_2013_06_09_nadal-1-on-1.cnn.mp4',
'md5': '3e6121ea48df7e2259fe73a0628605c4',
'info_dict': {
+ 'id': 'sports/2013/06/09/nadal-1-on-1.cnn',
+ 'ext': 'mp4',
'title': 'Nadal wins 8th French Open title',
'description': 'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.',
'duration': 135,
'upload_date': '20130609',
},
- },
- {
+ }, {
"url": "http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29",
- "file": "us_2013_08_21_sot-student-gives-epic-speech.georgia-institute-of-technology.mp4",
"md5": "b5cc60c60a3477d185af8f19a2a26f4e",
"info_dict": {
+ 'id': 'us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology',
+ 'ext': 'mp4',
"title": "Student's epic speech stuns new freshmen",
"description": "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\"",
"upload_date": "20130821",
}
+ }, {
+ 'url': 'http://www.cnn.com/video/data/2.0/video/living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln.html',
+ 'md5': 'f14d02ebd264df951feb2400e2c25a1b',
+ 'info_dict': {
+ 'id': 'living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln',
+ 'ext': 'mp4',
+ 'title': 'Nashville Ep. 1: Hand crafted skateboards',
+ 'description': 'md5:e7223a503315c9f150acac52e76de086',
+ 'upload_date': '20141222',
+ }
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
path = mobj.group('path')
page_title = mobj.group('title')
- info_url = 'http://cnn.com/video/data/3.0/%s/index.xml' % path
+ info_url = 'http://edition.cnn.com/video/data/3.0/%s/index.xml' % path
info = self._download_xml(info_url, page_title)
formats = []
@@ -126,3 +137,28 @@ class CNNBlogsIE(InfoExtractor):
'url': cnn_url,
'ie_key': CNNIE.ie_key(),
}
+
+
+class CNNArticleIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!video/)'
+ _TEST = {
+ 'url': 'http://www.cnn.com/2014/12/21/politics/obama-north-koreas-hack-not-war-but-cyber-vandalism/',
+ 'md5': '689034c2a3d9c6dc4aa72d65a81efd01',
+ 'info_dict': {
+ 'id': 'bestoftv/2014/12/21/ip-north-korea-obama.cnn',
+ 'ext': 'mp4',
+ 'title': 'Obama: Cyberattack not an act of war',
+ 'description': 'md5:51ce6750450603795cad0cdfbd7d05c5',
+ 'upload_date': '20141221',
+ },
+ 'add_ie': ['CNN'],
+ }
+
+ def _real_extract(self, url):
+ webpage = self._download_webpage(url, url_basename(url))
+ cnn_url = self._html_search_regex(r"video:\s*'([^']+)'", webpage, 'cnn url')
+ return {
+ '_type': 'url',
+ 'url': 'http://cnn.com/video/?/video/' + cnn_url,
+ 'ie_key': CNNIE.ie_key(),
+ }
diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py
index 6f866e7fc..002b24037 100644
--- a/youtube_dl/extractor/collegehumor.py
+++ b/youtube_dl/extractor/collegehumor.py
@@ -10,47 +10,46 @@ from ..utils import int_or_none
class CollegeHumorIE(InfoExtractor):
_VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/(video|embed|e)/(?P<videoid>[0-9]+)/?(?P<shorttitle>.*)$'
- _TESTS = [{
- 'url': 'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe',
- 'md5': 'dcc0f5c1c8be98dc33889a191f4c26bd',
- 'info_dict': {
- 'id': '6902724',
- 'ext': 'mp4',
- 'title': 'Comic-Con Cosplay Catastrophe',
- 'description': "Fans get creative this year at San Diego. Too creative. And yes, that's really Joss Whedon.",
- 'age_limit': 13,
- 'duration': 187,
+ _TESTS = [
+ {
+ 'url': 'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe',
+ 'md5': 'dcc0f5c1c8be98dc33889a191f4c26bd',
+ 'info_dict': {
+ 'id': '6902724',
+ 'ext': 'mp4',
+ 'title': 'Comic-Con Cosplay Catastrophe',
+ 'description': "Fans get creative this year at San Diego. Too creative. And yes, that's really Joss Whedon.",
+ 'age_limit': 13,
+ 'duration': 187,
+ },
+ }, {
+ 'url': 'http://www.collegehumor.com/video/3505939/font-conference',
+ 'md5': '72fa701d8ef38664a4dbb9e2ab721816',
+ 'info_dict': {
+ 'id': '3505939',
+ 'ext': 'mp4',
+ 'title': 'Font Conference',
+ 'description': "This video wasn't long enough, so we made it double-spaced.",
+ 'age_limit': 10,
+ 'duration': 179,
+ },
+ }, {
+ # embedded youtube video
+ 'url': 'http://www.collegehumor.com/embed/6950306',
+ 'info_dict': {
+ 'id': 'Z-bao9fg6Yc',
+ 'ext': 'mp4',
+ 'title': 'Young Americans Think President John F. Kennedy Died THIS MORNING IN A CAR ACCIDENT!!!',
+ 'uploader': 'Mark Dice',
+ 'uploader_id': 'MarkDice',
+ 'description': 'md5:62c3dab9351fac7bb44b53b69511d87f',
+ 'upload_date': '20140127',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'add_ie': ['Youtube'],
},
- },
- {
- 'url': 'http://www.collegehumor.com/video/3505939/font-conference',
- 'md5': '72fa701d8ef38664a4dbb9e2ab721816',
- 'info_dict': {
- 'id': '3505939',
- 'ext': 'mp4',
- 'title': 'Font Conference',
- 'description': "This video wasn't long enough, so we made it double-spaced.",
- 'age_limit': 10,
- 'duration': 179,
- },
- },
- # embedded youtube video
- {
- 'url': 'http://www.collegehumor.com/embed/6950306',
- 'info_dict': {
- 'id': 'Z-bao9fg6Yc',
- 'ext': 'mp4',
- 'title': 'Young Americans Think President John F. Kennedy Died THIS MORNING IN A CAR ACCIDENT!!!',
- 'uploader': 'Mark Dice',
- 'uploader_id': 'MarkDice',
- 'description': 'md5:62c3dab9351fac7bb44b53b69511d87f',
- 'upload_date': '20140127',
- },
- 'params': {
- 'skip_download': True,
- },
- 'add_ie': ['Youtube'],
- },
]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/collegerama.py b/youtube_dl/extractor/collegerama.py
new file mode 100644
index 000000000..fedd48490
--- /dev/null
+++ b/youtube_dl/extractor/collegerama.py
@@ -0,0 +1,92 @@
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_request
+from ..utils import (
+ float_or_none,
+ int_or_none,
+)
+
+
+class CollegeRamaIE(InfoExtractor):
+ _VALID_URL = r'https?://collegerama\.tudelft\.nl/Mediasite/Play/(?P<id>[\da-f]+)'
+ _TESTS = [
+ {
+ 'url': 'https://collegerama.tudelft.nl/Mediasite/Play/585a43626e544bdd97aeb71a0ec907a01d',
+ 'md5': '481fda1c11f67588c0d9d8fbdced4e39',
+ 'info_dict': {
+ 'id': '585a43626e544bdd97aeb71a0ec907a01d',
+ 'ext': 'mp4',
+ 'title': 'Een nieuwe wereld: waarden, bewustzijn en techniek van de mensheid 2.0.',
+ 'description': '',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 7713.088,
+ 'timestamp': 1413309600,
+ 'upload_date': '20141014',
+ },
+ },
+ {
+ 'url': 'https://collegerama.tudelft.nl/Mediasite/Play/86a9ea9f53e149079fbdb4202b521ed21d?catalog=fd32fd35-6c99-466c-89d4-cd3c431bc8a4',
+ 'md5': 'ef1fdded95bdf19b12c5999949419c92',
+ 'info_dict': {
+ 'id': '86a9ea9f53e149079fbdb4202b521ed21d',
+ 'ext': 'wmv',
+ 'title': '64ste Vakantiecursus: Afvalwater',
+ 'description': 'md5:7fd774865cc69d972f542b157c328305',
+ 'duration': 10853,
+ 'timestamp': 1326446400,
+ 'upload_date': '20120113',
+ },
+ },
+ ]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ player_options_request = {
+ "getPlayerOptionsRequest": {
+ "ResourceId": video_id,
+ "QueryString": "",
+ }
+ }
+
+ request = compat_urllib_request.Request(
+ 'http://collegerama.tudelft.nl/Mediasite/PlayerService/PlayerService.svc/json/GetPlayerOptions',
+ json.dumps(player_options_request))
+ request.add_header('Content-Type', 'application/json')
+
+ player_options = self._download_json(request, video_id)
+
+ presentation = player_options['d']['Presentation']
+ title = presentation['Title']
+ description = presentation.get('Description')
+ thumbnail = None
+ duration = float_or_none(presentation.get('Duration'), 1000)
+ timestamp = int_or_none(presentation.get('UnixTime'), 1000)
+
+ formats = []
+ for stream in presentation['Streams']:
+ for video in stream['VideoUrls']:
+ thumbnail_url = stream.get('ThumbnailUrl')
+ if thumbnail_url:
+ thumbnail = 'http://collegerama.tudelft.nl' + thumbnail_url
+ format_id = video['MediaType']
+ if format_id == 'SS':
+ continue
+ formats.append({
+ 'url': video['Location'],
+ 'format_id': format_id,
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/comcarcoff.py b/youtube_dl/extractor/comcarcoff.py
new file mode 100644
index 000000000..9c25b2223
--- /dev/null
+++ b/youtube_dl/extractor/comcarcoff.py
@@ -0,0 +1,57 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+from ..utils import parse_iso8601
+
+
+class ComCarCoffIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?comediansincarsgettingcoffee\.com/(?P<id>[a-z0-9\-]*)'
+ _TESTS = [{
+ 'url': 'http://comediansincarsgettingcoffee.com/miranda-sings-happy-thanksgiving-miranda/',
+ 'info_dict': {
+ 'id': 'miranda-sings-happy-thanksgiving-miranda',
+ 'ext': 'mp4',
+ 'upload_date': '20141127',
+ 'timestamp': 1417107600,
+ 'title': 'Happy Thanksgiving Miranda',
+ 'description': 'Jerry Seinfeld and his special guest Miranda Sings cruise around town in search of coffee, complaining and apologizing along the way.',
+ 'thumbnail': 'http://ccc.crackle.com/images/s5e4_thumb.jpg',
+ },
+ 'params': {
+ 'skip_download': 'requires ffmpeg',
+ }
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ if not display_id:
+ display_id = 'comediansincarsgettingcoffee.com'
+ webpage = self._download_webpage(url, display_id)
+
+ full_data = json.loads(self._search_regex(
+ r'<script type="application/json" id="videoData">(?P<json>.+?)</script>',
+ webpage, 'full data json'))
+
+ video_id = full_data['activeVideo']['video']
+ video_data = full_data['videos'][video_id]
+ thumbnails = [{
+ 'url': video_data['images']['thumb'],
+ }, {
+ 'url': video_data['images']['poster'],
+ }]
+ formats = self._extract_m3u8_formats(
+ video_data['mediaUrl'], video_id, ext='mp4')
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': video_data['title'],
+ 'description': video_data.get('description'),
+ 'timestamp': parse_iso8601(video_data.get('pubDate')),
+ 'thumbnails': thumbnails,
+ 'formats': formats,
+ 'webpage_url': 'http://comediansincarsgettingcoffee.com/%s' % (video_data.get('urlSlug', video_data.get('slug'))),
+ }
diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py
index 035046120..15ca361f0 100644
--- a/youtube_dl/extractor/comedycentral.py
+++ b/youtube_dl/extractor/comedycentral.py
@@ -2,11 +2,12 @@ from __future__ import unicode_literals
import re
-from .common import InfoExtractor
from .mtv import MTVServicesInfoExtractor
-from ..utils import (
+from ..compat import (
compat_str,
compat_urllib_parse,
+)
+from ..utils import (
ExtractorError,
float_or_none,
unified_strdate,
@@ -31,14 +32,14 @@ class ComedyCentralIE(MTVServicesInfoExtractor):
}
-class ComedyCentralShowsIE(InfoExtractor):
+class ComedyCentralShowsIE(MTVServicesInfoExtractor):
IE_DESC = 'The Daily Show / The Colbert Report'
- # urls can be abbreviations like :thedailyshow or :colbert
+ # urls can be abbreviations like :thedailyshow
# urls for episodes like:
# or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
# or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
# or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
- _VALID_URL = r'''(?x)^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
+ _VALID_URL = r'''(?x)^(:(?P<shortname>tds|thedailyshow)
|https?://(:www\.)?
(?P<showname>thedailyshow|thecolbertreport)\.(?:cc\.)?com/
((?:full-)?episodes/(?:[0-9a-z]{6}/)?(?P<episode>.*)|
@@ -49,7 +50,7 @@ class ComedyCentralShowsIE(InfoExtractor):
)|
(?P<interview>
extended-interviews/(?P<interID>[0-9a-z]+)/(?:playlist_tds_extended_)?(?P<interview_title>.*?)(/.*?)?)))
- (?:[?#].*|$)'''
+ '''
_TESTS = [{
'url': 'http://thedailyshow.cc.com/watch/thu-december-13-2012/kristen-stewart',
'md5': '4e2f5cb088a83cd8cdb7756132f9739d',
@@ -83,6 +84,9 @@ class ComedyCentralShowsIE(InfoExtractor):
'url': 'http://thedailyshow.cc.com/video-playlists/npde3s/the-daily-show-19088-highlights',
'only_matching': True,
}, {
+ 'url': 'http://thedailyshow.cc.com/video-playlists/t6d9sg/the-daily-show-20038-highlights/be3cwo',
+ 'only_matching': True,
+ }, {
'url': 'http://thedailyshow.cc.com/special-editions/2l8fdb/special-edition---a-look-back-at-food',
'only_matching': True,
}, {
@@ -109,18 +113,8 @@ class ComedyCentralShowsIE(InfoExtractor):
'400': (384, 216),
}
- @staticmethod
- def _transform_rtmp_url(rtmp_video_url):
- m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\.comedystor/.*)$', rtmp_video_url)
- if not m:
- raise ExtractorError('Cannot transform RTMP url')
- base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
- return base + m.group('finalid')
-
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url, re.VERBOSE)
- if mobj is None:
- raise ExtractorError('Invalid URL: %s' % url)
+ mobj = re.match(self._VALID_URL, url)
if mobj.group('shortname'):
if mobj.group('shortname') in ('tds', 'thedailyshow'):
@@ -212,9 +206,6 @@ class ComedyCentralShowsIE(InfoExtractor):
'ext': self._video_extensions.get(format, 'mp4'),
'height': h,
'width': w,
-
- 'format_note': 'HTTP 400 at the moment (patches welcome!)',
- 'preference': -100,
})
formats.append({
'format_id': 'rtmp-%s' % format,
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index e8366f7f9..7b7a832dc 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -12,13 +12,16 @@ import sys
import time
import xml.etree.ElementTree
-from ..utils import (
+from ..compat import (
+ compat_cookiejar,
compat_http_client,
compat_urllib_error,
compat_urllib_parse_urlparse,
compat_urlparse,
compat_str,
-
+)
+from ..utils import (
+ age_restricted,
clean_html,
compiled_regex_type,
ExtractorError,
@@ -38,11 +41,15 @@ class InfoExtractor(object):
information about the video (or videos) the URL refers to. This
information includes the real video URL, the video title, author and
others. The information is stored in a dictionary which is then
- passed to the FileDownloader. The FileDownloader processes this
+ passed to the YoutubeDL. The YoutubeDL processes this
information possibly downloading the video to the file system, among
other possible outcomes.
- The dictionaries must include the following fields:
+ The _type field determines the type of the result.
+ By far the most common value (and the default if _type is missing) is
+ "video", which indicates a single video.
+
+ For a video, the dictionaries must include the following fields:
id: Video identifier.
title: Video title, unescaped.
@@ -72,6 +79,7 @@ class InfoExtractor(object):
* acodec Name of the audio codec in use
* asr Audio sampling rate in Hertz
* vbr Average video bitrate in KBit/s
+ * fps Frame rate
* vcodec Name of the video codec in use
* container Name of the container format
* filesize The number of bytes, if known in advance
@@ -85,16 +93,29 @@ class InfoExtractor(object):
by this field, regardless of all other values.
-1 for default (order by other properties),
-2 or smaller for less than default.
+ < -1000 to hide the format (if there is
+ another one which is strictly better)
+ * language_preference Is this in the correct requested
+ language?
+ 10 if it's what the URL is about,
+ -1 for default (don't know),
+ -10 otherwise, other values reserved for now.
* quality Order number of the video quality of this
format, irrespective of the file format.
-1 for default (order by other properties),
-2 or smaller for less than default.
- * http_referer HTTP Referer header value to set.
+ * source_preference Order number for this video source
+ (quality takes higher priority)
+ -1 for default (order by other properties),
+ -2 or smaller for less than default.
* http_method HTTP method to use for the download.
* http_headers A dictionary of additional HTTP headers
to add to the request.
* http_post_data Additional data to send with a POST
request.
+ * stretched_ratio If given and not 1, indicates that the
+ video's pixels are not square.
+ width : height ratio as float.
url: Final video URL.
ext: Video filename extension.
format: The video format, defaults to ext (used for --get-format)
@@ -102,18 +123,21 @@ class InfoExtractor(object):
The following fields are optional:
+ alt_title: A secondary title of the video.
display_id An alternative identifier for the video, not necessarily
unique, but available before title. Typically, id is
something like "4234987", title "Dancing naked mole rats",
and display_id "dancing-naked-mole-rats"
thumbnails: A list of dictionaries, with the following entries:
+ * "id" (optional, string) - Thumbnail format ID
* "url"
+ * "preference" (optional, int) - quality of the image
* "width" (optional, int)
* "height" (optional, int)
* "resolution" (optional, string "{width}x{height"},
deprecated)
thumbnail: Full URL to a video thumbnail image.
- description: One-line video description.
+ description: Full video description.
uploader: Full name of the video uploader.
timestamp: UNIX timestamp of the moment the video became available.
upload_date: Video upload date (YYYYMMDD).
@@ -127,6 +151,17 @@ class InfoExtractor(object):
like_count: Number of positive ratings of the video
dislike_count: Number of negative ratings of the video
comment_count: Number of comments on the video
+ comments: A list of comments, each with one or more of the following
+ properties (all but one of text or html optional):
+ * "author" - human-readable name of the comment author
+ * "author_id" - user ID of the comment author
+ * "id" - Comment ID
+ * "html" - Comment as HTML
+ * "text" - Plain text of the comment
+ * "timestamp" - UNIX timestamp of comment
+ * "parent" - ID of the comment this one is replying to.
+ Set to "root" to indicate that this is a
+ comment to the original video.
age_limit: Age restriction for the video, as an integer (years)
webpage_url: The url to the video webpage, if given to youtube-dl it
should allow to get the same result again. (It will be set
@@ -140,6 +175,39 @@ class InfoExtractor(object):
Unless mentioned otherwise, None is equivalent to absence of information.
+
+ _type "playlist" indicates multiple videos.
+ There must be a key "entries", which is a list, an iterable, or a PagedList
+ object, each element of which is a valid dictionary by this specification.
+
+ Additionally, playlists can have "title" and "id" attributes with the same
+ semantics as videos (see above).
+
+
+ _type "multi_video" indicates that there are multiple videos that
+ form a single show, for example multiple acts of an opera or TV episode.
+ It must have an entries key like a playlist and contain all the keys
+ required for a video at the same time.
+
+
+ _type "url" indicates that the video must be extracted from another
+ location, possibly by a different extractor. Its only required key is:
+ "url" - the next URL to extract.
+ The key "ie_key" can be set to the class name (minus the trailing "IE",
+ e.g. "Youtube") if the extractor class is known in advance.
+ Additionally, the dictionary may have any properties of the resolved entity
+ known in advance, for example "title" if the title of the referred video is
+ known ahead of time.
+
+
+ _type "url_transparent" entities have the same specification as "url", but
+ indicate that the given additional information is more precise than the one
+ associated with the resolved URL.
+ This is useful when a site employs a video service that hosts the video and
+ its technical metadata, but that video service does not embed a useful
+ title, description etc.
+
+
Subclasses of this one should re-define the _real_initialize() and
_real_extract() methods and define a _VALID_URL regexp.
Probably, they should also be added to the list of extractors.
@@ -238,7 +306,6 @@ class InfoExtractor(object):
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
""" Returns a tuple (page content as string, URL handle) """
-
# Strip hashes from the URL (#1038)
if isinstance(url_or_request, (compat_str, str)):
url_or_request = url_or_request.partition('#')[0]
@@ -247,8 +314,14 @@ class InfoExtractor(object):
if urlh is False:
assert not fatal
return False
+ content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
+ return (content, urlh)
+
+ def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None):
content_type = urlh.headers.get('Content-Type', '')
webpage_bytes = urlh.read()
+ if prefix is not None:
+ webpage_bytes = prefix + webpage_bytes
m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
if m:
encoding = m.group(1)
@@ -305,11 +378,21 @@ class InfoExtractor(object):
msg += ' Visit %s for more details' % blocked_iframe
raise ExtractorError(msg, expected=True)
- return (content, urlh)
+ return content
- def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
+ def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5):
""" Returns the data of the page as a string """
- res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
+ success = False
+ try_count = 0
+ while success is False:
+ try:
+ res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
+ success = True
+ except compat_http_client.IncompleteRead as e:
+ try_count += 1
+ if try_count >= tries:
+ raise e
+ self._sleep(timeout, video_id)
if res is False:
return res
else:
@@ -337,6 +420,10 @@ class InfoExtractor(object):
url_or_request, video_id, note, errnote, fatal=fatal)
if (not fatal) and json_string is False:
return None
+ return self._parse_json(
+ json_string, video_id, transform_source=transform_source, fatal=fatal)
+
+ def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
if transform_source:
json_string = transform_source(json_string)
try:
@@ -373,19 +460,20 @@ class InfoExtractor(object):
"""Report attempt to log in."""
self.to_screen('Logging in')
- #Methods for following #608
+ # Methods for following #608
@staticmethod
def url_result(url, ie=None, video_id=None):
"""Returns a url that points to a page that should be processed"""
- #TODO: ie should be the class used for getting the info
+ # TODO: ie should be the class used for getting the info
video_info = {'_type': 'url',
'url': url,
'ie_key': ie}
if video_id is not None:
video_info['id'] = video_id
return video_info
+
@staticmethod
- def playlist_result(entries, playlist_id=None, playlist_title=None):
+ def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
"""Returns a playlist"""
video_info = {'_type': 'playlist',
'entries': entries}
@@ -393,9 +481,11 @@ class InfoExtractor(object):
video_info['id'] = playlist_id
if playlist_title:
video_info['title'] = playlist_title
+ if playlist_description:
+ video_info['description'] = playlist_description
return video_info
- def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
+ def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
"""
Perform a regex search on the given string, using a single or a list of
patterns returning the first matching group.
@@ -416,22 +506,25 @@ class InfoExtractor(object):
_name = name
if mobj:
- # return the first matching group
- return next(g for g in mobj.groups() if g is not None)
+ if group is None:
+ # return the first matching group
+ return next(g for g in mobj.groups() if g is not None)
+ else:
+ return mobj.group(group)
elif default is not _NO_DEFAULT:
return default
elif fatal:
raise RegexNotFoundError('Unable to extract %s' % _name)
else:
self._downloader.report_warning('unable to extract %s; '
- 'please report this issue on http://yt-dl.org/bug' % _name)
+ 'please report this issue on http://yt-dl.org/bug' % _name)
return None
- def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
+ def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
"""
Like _search_regex, but strips HTML tags and unescapes entities.
"""
- res = self._search_regex(pattern, string, name, default, fatal, flags)
+ res = self._search_regex(pattern, string, name, default, fatal, flags, group)
if res:
return clean_html(res).strip()
else:
@@ -464,7 +557,7 @@ class InfoExtractor(object):
raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
except (IOError, netrc.NetrcParseError) as err:
self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
-
+
return (username, password)
def _get_tfa_info(self):
@@ -524,10 +617,10 @@ class InfoExtractor(object):
if display_name is None:
display_name = name
return self._html_search_regex(
- r'''(?ix)<meta
- (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
- [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
- html, display_name, fatal=fatal, **kwargs)
+ r'''(?isx)<meta
+ (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
+ [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name),
+ html, display_name, fatal=fatal, group='content', **kwargs)
def _dc_search_uploader(self, html):
return self._html_search_meta('dc.creator', html, 'uploader')
@@ -558,7 +651,7 @@ class InfoExtractor(object):
def _twitter_search_player(self, html):
return self._html_search_meta('twitter:player', html,
- 'twitter card player')
+ 'twitter card player')
def _sort_formats(self, formats):
if not formats:
@@ -603,6 +696,7 @@ class InfoExtractor(object):
return (
preference,
+ f.get('language_preference') if f.get('language_preference') is not None else -1,
f.get('quality') if f.get('quality') is not None else -1,
f.get('height') if f.get('height') is not None else -1,
f.get('width') if f.get('width') is not None else -1,
@@ -611,14 +705,16 @@ class InfoExtractor(object):
f.get('vbr') if f.get('vbr') is not None else -1,
f.get('abr') if f.get('abr') is not None else -1,
audio_ext_preference,
+ f.get('fps') if f.get('fps') is not None else -1,
f.get('filesize') if f.get('filesize') is not None else -1,
f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
+ f.get('source_preference') if f.get('source_preference') is not None else -1,
f.get('format_id'),
)
formats.sort(key=_formats_key)
def http_scheme(self):
- """ Either "https:" or "https:", depending on the user's preferences """
+ """ Either "http:" or "https:", depending on the user's preferences """
return (
'http:'
if self._downloader.params.get('prefer_insecure', False)
@@ -647,8 +743,14 @@ class InfoExtractor(object):
'Unable to download f4m manifest')
formats = []
+ manifest_version = '1.0'
media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
+ if not media_nodes:
+ manifest_version = '2.0'
+ media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
for i, media_el in enumerate(media_nodes):
+ if manifest_version == '2.0':
+ manifest_url = '/'.join(manifest_url.split('/')[:-1]) + '/' + media_el.attrib.get('href')
tbr = int_or_none(media_el.attrib.get('bitrate'))
format_id = 'f4m-%d' % (i if tbr is None else tbr)
formats.append({
@@ -681,7 +783,10 @@ class InfoExtractor(object):
if re.match(r'^https?://', u)
else compat_urlparse.urljoin(m3u8_url, u))
- m3u8_doc = self._download_webpage(m3u8_url, video_id)
+ m3u8_doc = self._download_webpage(
+ m3u8_url, video_id,
+ note='Downloading m3u8 information',
+ errnote='Failed to download m3u8 information')
last_info = None
kv_rex = re.compile(
r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
@@ -727,6 +832,49 @@ class InfoExtractor(object):
self._sort_formats(formats)
return formats
+ # TODO: improve extraction
+ def _extract_smil_formats(self, smil_url, video_id):
+ smil = self._download_xml(
+ smil_url, video_id, 'Downloading SMIL file',
+ 'Unable to download SMIL file')
+
+ base = smil.find('./head/meta').get('base')
+
+ formats = []
+ rtmp_count = 0
+ for video in smil.findall('./body/switch/video'):
+ src = video.get('src')
+ if not src:
+ continue
+ bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
+ width = int_or_none(video.get('width'))
+ height = int_or_none(video.get('height'))
+ proto = video.get('proto')
+ if not proto:
+ if base:
+ if base.startswith('rtmp'):
+ proto = 'rtmp'
+ elif base.startswith('http'):
+ proto = 'http'
+ ext = video.get('ext')
+ if proto == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(src, video_id, ext))
+ elif proto == 'rtmp':
+ rtmp_count += 1
+ streamer = video.get('streamer') or base
+ formats.append({
+ 'url': streamer,
+ 'play_path': src,
+ 'ext': 'flv',
+ 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
+ 'tbr': bitrate,
+ 'width': width,
+ 'height': height,
+ })
+ self._sort_formats(formats)
+
+ return formats
+
def _live_title(self, name):
""" Generate the title for a live video """
now = datetime.datetime.now()
@@ -755,6 +903,41 @@ class InfoExtractor(object):
self._downloader.report_warning(msg)
return res
+ def _set_cookie(self, domain, name, value, expire_time=None):
+ cookie = compat_cookiejar.Cookie(
+ 0, name, value, None, None, domain, None,
+ None, '/', True, False, expire_time, '', None, None, None)
+ self._downloader.cookiejar.set_cookie(cookie)
+
+ def get_testcases(self, include_onlymatching=False):
+ t = getattr(self, '_TEST', None)
+ if t:
+ assert not hasattr(self, '_TESTS'), \
+ '%s has _TEST and _TESTS' % type(self).__name__
+ tests = [t]
+ else:
+ tests = getattr(self, '_TESTS', [])
+ for t in tests:
+ if not include_onlymatching and t.get('only_matching', False):
+ continue
+ t['name'] = type(self).__name__[:-len('IE')]
+ yield t
+
+ def is_suitable(self, age_limit):
+ """ Test whether the extractor is generally suitable for the given
+ age limit (i.e. pornographic sites are not, all others usually are) """
+
+ any_restricted = False
+ for tc in self.get_testcases(include_onlymatching=False):
+ if 'playlist' in tc:
+ tc = tc['playlist'][0]
+ is_restricted = age_restricted(
+ tc.get('info_dict', {}).get('age_limit'), age_limit)
+ if not is_restricted:
+ return True
+ any_restricted = any_restricted or is_restricted
+ return not any_restricted
+
class SearchInfoExtractor(InfoExtractor):
"""
diff --git a/youtube_dl/extractor/commonmistakes.py b/youtube_dl/extractor/commonmistakes.py
new file mode 100644
index 000000000..75c06903f
--- /dev/null
+++ b/youtube_dl/extractor/commonmistakes.py
@@ -0,0 +1,29 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class CommonMistakesIE(InfoExtractor):
+ IE_DESC = False # Do not list
+ _VALID_URL = r'''(?x)
+ (?:url|URL)
+ '''
+
+ _TESTS = [{
+ 'url': 'url',
+ 'only_matching': True,
+ }, {
+ 'url': 'URL',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ msg = (
+ 'You\'ve asked youtube-dl to download the URL "%s". '
+ 'That doesn\'t make any sense. '
+ 'Simply remove the parameter in your command or configuration.'
+ ) % url
+ if self._downloader.params.get('verbose'):
+ msg += ' Add -v to the command line to see what arguments and configuration youtube-dl got.'
+ raise ExtractorError(msg, expected=True)
diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py
index 7a7e79360..3db4db4e4 100644
--- a/youtube_dl/extractor/condenast.py
+++ b/youtube_dl/extractor/condenast.py
@@ -5,12 +5,14 @@ import re
import json
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
- orderedSet,
compat_urllib_parse_urlparse,
compat_urlparse,
)
+from ..utils import (
+ orderedSet,
+)
class CondeNastIE(InfoExtractor):
diff --git a/youtube_dl/extractor/cracked.py b/youtube_dl/extractor/cracked.py
index 74b880ffc..cf763ee7e 100644
--- a/youtube_dl/extractor/cracked.py
+++ b/youtube_dl/extractor/cracked.py
@@ -54,7 +54,7 @@ class CrackedIE(InfoExtractor):
return {
'id': video_id,
- 'url':video_url,
+ 'url': video_url,
'title': title,
'description': description,
'timestamp': timestamp,
@@ -62,4 +62,4 @@ class CrackedIE(InfoExtractor):
'comment_count': comment_count,
'height': height,
'width': width,
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
index f99888ecc..1680f532f 100644
--- a/youtube_dl/extractor/crunchyroll.py
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -10,27 +10,28 @@ import xml.etree.ElementTree
from hashlib import sha1
from math import pow, sqrt, floor
from .subtitles import SubtitlesInfoExtractor
-from ..utils import (
- ExtractorError,
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
+)
+from ..utils import (
+ ExtractorError,
bytes_to_intlist,
intlist_to_bytes,
unified_strdate,
- clean_html,
urlencode_postdata,
)
from ..aes import (
aes_cbc_decrypt,
inc,
)
+from .common import InfoExtractor
class CrunchyrollIE(SubtitlesInfoExtractor):
- _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)'
- _TEST = {
+ _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)'
+ _TESTS = [{
'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
- #'md5': 'b1639fd6ddfaa43788c85f6d1dddd412',
'info_dict': {
'id': '645513',
'ext': 'flv',
@@ -39,12 +40,16 @@ class CrunchyrollIE(SubtitlesInfoExtractor):
'thumbnail': 'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg',
'uploader': 'Yomiuri Telecasting Corporation (YTV)',
'upload_date': '20131013',
+ 'url': 're:(?!.*&amp)',
},
'params': {
# rtmp
'skip_download': True,
},
- }
+ }, {
+ 'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697',
+ 'only_matching': True,
+ }]
_FORMAT_IDS = {
'360': ('60', '106'),
@@ -68,11 +73,9 @@ class CrunchyrollIE(SubtitlesInfoExtractor):
login_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
self._download_webpage(login_request, None, False, 'Wrong login info')
-
def _real_initialize(self):
self._login()
-
def _decrypt_subtitles(self, data, iv, id):
data = bytes_to_intlist(data)
iv = bytes_to_intlist(iv)
@@ -98,8 +101,10 @@ class CrunchyrollIE(SubtitlesInfoExtractor):
return shaHash + [0] * 12
key = obfuscate_key(id)
+
class Counter:
__value = iv
+
def next_value(self):
temp = self.__value
self.__value = inc(self.__value)
@@ -107,19 +112,17 @@ class CrunchyrollIE(SubtitlesInfoExtractor):
decrypted_data = intlist_to_bytes(aes_cbc_decrypt(data, key, iv))
return zlib.decompress(decrypted_data)
- def _convert_subtitles_to_srt(self, subtitles):
+ def _convert_subtitles_to_srt(self, sub_root):
output = ''
- for i, (start, end, text) in enumerate(re.findall(r'<event [^>]*?start="([^"]+)" [^>]*?end="([^"]+)" [^>]*?text="([^"]+)"[^>]*?>', subtitles), 1):
- start = start.replace('.', ',')
- end = end.replace('.', ',')
- text = clean_html(text)
- text = text.replace('\\N', '\n')
- if not text:
- continue
+
+ for i, event in enumerate(sub_root.findall('./events/event'), 1):
+ start = event.attrib['start'].replace('.', ',')
+ end = event.attrib['end'].replace('.', ',')
+ text = event.attrib['text'].replace('\\N', '\n')
output += '%d\n%s --> %s\n%s\n\n' % (i, start, end, text)
return output
- def _convert_subtitles_to_ass(self, subtitles):
+ def _convert_subtitles_to_ass(self, sub_root):
output = ''
def ass_bool(strvalue):
@@ -128,10 +131,6 @@ class CrunchyrollIE(SubtitlesInfoExtractor):
assvalue = '-1'
return assvalue
- sub_root = xml.etree.ElementTree.fromstring(subtitles)
- if not sub_root:
- return output
-
output = '[Script Info]\n'
output += 'Title: %s\n' % sub_root.attrib["title"]
output += 'ScriptType: v4.00+\n'
@@ -188,7 +187,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
return output
- def _real_extract(self,url):
+ def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('video_id')
@@ -229,20 +228,22 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
video_thumbnail = self._search_regex(r'<episode_image_url>([^<]+)', playerdata, 'thumbnail', fatal=False)
formats = []
- for fmt in re.findall(r'\?p([0-9]{3,4})=1', webpage):
+ for fmt in re.findall(r'showmedia\.([0-9]{3,4})p', webpage):
stream_quality, stream_format = self._FORMAT_IDS[fmt]
- video_format = fmt+'p'
+ video_format = fmt + 'p'
streamdata_req = compat_urllib_request.Request('http://www.crunchyroll.com/xml/')
# urlencode doesn't work!
- streamdata_req.data = 'req=RpcApiVideoEncode%5FGetStreamInfo&video%5Fencode%5Fquality='+stream_quality+'&media%5Fid='+stream_id+'&video%5Fformat='+stream_format
+ streamdata_req.data = 'req=RpcApiVideoEncode%5FGetStreamInfo&video%5Fencode%5Fquality=' + stream_quality + '&media%5Fid=' + stream_id + '&video%5Fformat=' + stream_format
streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
streamdata_req.add_header('Content-Length', str(len(streamdata_req.data)))
- streamdata = self._download_webpage(streamdata_req, video_id, note='Downloading media info for '+video_format)
- video_url = self._search_regex(r'<host>([^<]+)', streamdata, 'video_url')
- video_play_path = self._search_regex(r'<file>([^<]+)', streamdata, 'video_play_path')
+ streamdata = self._download_xml(
+ streamdata_req, video_id,
+ note='Downloading media info for %s' % video_format)
+ video_url = streamdata.find('.//host').text
+ video_play_path = streamdata.find('.//file').text
formats.append({
'url': video_url,
- 'play_path': video_play_path,
+ 'play_path': video_play_path,
'ext': 'flv',
'format': video_format,
'format_id': video_format,
@@ -251,8 +252,9 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
subtitles = {}
sub_format = self._downloader.params.get('subtitlesformat', 'srt')
for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage):
- sub_page = self._download_webpage('http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id='+sub_id,\
- video_id, note='Downloading subtitles for '+sub_name)
+ sub_page = self._download_webpage(
+ 'http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id=' + sub_id,
+ video_id, note='Downloading subtitles for ' + sub_name)
id = self._search_regex(r'id=\'([0-9]+)', sub_page, 'subtitle_id', fatal=False)
iv = self._search_regex(r'<iv>([^<]+)', sub_page, 'subtitle_iv', fatal=False)
data = self._search_regex(r'<data>([^<]+)', sub_page, 'subtitle_data', fatal=False)
@@ -266,22 +268,60 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False)
if not lang_code:
continue
+ sub_root = xml.etree.ElementTree.fromstring(subtitle)
if sub_format == 'ass':
- subtitles[lang_code] = self._convert_subtitles_to_ass(subtitle)
+ subtitles[lang_code] = self._convert_subtitles_to_ass(sub_root)
else:
- subtitles[lang_code] = self._convert_subtitles_to_srt(subtitle)
+ subtitles[lang_code] = self._convert_subtitles_to_srt(sub_root)
if self._downloader.params.get('listsubtitles', False):
self._list_available_subtitles(video_id, subtitles)
return
return {
- 'id': video_id,
- 'title': video_title,
+ 'id': video_id,
+ 'title': video_title,
'description': video_description,
- 'thumbnail': video_thumbnail,
- 'uploader': video_uploader,
+ 'thumbnail': video_thumbnail,
+ 'uploader': video_uploader,
'upload_date': video_upload_date,
- 'subtitles': subtitles,
- 'formats': formats,
+ 'subtitles': subtitles,
+ 'formats': formats,
+ }
+
+
+class CrunchyrollShowPlaylistIE(InfoExtractor):
+ IE_NAME = "crunchyroll:playlist"
+ _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P<id>[\w\-]+))/?$'
+
+ _TESTS = [{
+ 'url': 'http://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi',
+ 'info_dict': {
+ 'id': 'a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi',
+ 'title': 'A Bridge to the Starry Skies - Hoshizora e Kakaru Hashi'
+ },
+ 'playlist_count': 13,
+ }]
+
+ def _real_extract(self, url):
+ show_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, show_id)
+ title = self._html_search_regex(
+ r'(?s)<h1[^>]*>\s*<span itemprop="name">(.*?)</span>',
+ webpage, 'title')
+ episode_paths = re.findall(
+ r'(?s)<li id="showview_videos_media_[0-9]+"[^>]+>.*?<a href="([^"]+)"',
+ webpage)
+ entries = [
+ self.url_result('http://www.crunchyroll.com' + ep, 'Crunchyroll')
+ for ep in episode_paths
+ ]
+ entries.reverse()
+
+ return {
+ '_type': 'playlist',
+ 'id': show_id,
+ 'title': title,
+ 'entries': entries,
}
diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py
index 541106684..955119d40 100644
--- a/youtube_dl/extractor/cspan.py
+++ b/youtube_dl/extractor/cspan.py
@@ -27,7 +27,6 @@ class CSpanIE(InfoExtractor):
'url': 'http://www.c-span.org/video/?c4486943/cspan-international-health-care-models',
# For whatever reason, the served video alternates between
# two different ones
- #'md5': 'dbb0f047376d457f2ab8b3929cbb2d0c',
'info_dict': {
'id': '340723',
'ext': 'mp4',
diff --git a/youtube_dl/extractor/d8.py b/youtube_dl/extractor/d8.py
deleted file mode 100644
index 6b26ff2e3..000000000
--- a/youtube_dl/extractor/d8.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# encoding: utf-8
-from __future__ import unicode_literals
-
-from .canalplus import CanalplusIE
-
-
-class D8IE(CanalplusIE):
- _VALID_URL = r'https?://www\.d8\.tv/.*?/(?P<path>.*)'
- _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/d8/%s'
- IE_NAME = 'd8.tv'
-
- _TEST = {
- 'url': 'http://www.d8.tv/d8-docs-mags/pid6589-d8-campagne-intime.html',
- 'file': '966289.flv',
- 'info_dict': {
- 'title': 'Campagne intime - Documentaire exceptionnel',
- 'description': 'md5:d2643b799fb190846ae09c61e59a859f',
- 'upload_date': '20131108',
- },
- 'params': {
- # rtmp
- 'skip_download': True,
- },
- 'skip': 'videos get deleted after a while',
- }
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index dbcf5d6a7..cf5841a7c 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -1,4 +1,4 @@
-#coding: utf-8
+# coding: utf-8
from __future__ import unicode_literals
import re
@@ -8,16 +8,19 @@ import itertools
from .common import InfoExtractor
from .subtitles import SubtitlesInfoExtractor
-from ..utils import (
- compat_urllib_request,
+from ..compat import (
compat_str,
+ compat_urllib_request,
+)
+from ..utils import (
+ ExtractorError,
+ int_or_none,
orderedSet,
str_to_int,
- int_or_none,
- ExtractorError,
unescapeHTML,
)
+
class DailymotionBaseInfoExtractor(InfoExtractor):
@staticmethod
def _build_request(url):
@@ -27,6 +30,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor):
request.add_header('Cookie', 'ff=off')
return request
+
class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
"""Information Extractor for Dailymotion"""
@@ -94,7 +98,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
# It may just embed a vevo video:
m_vevo = re.search(
- r'<link rel="video_src" href="[^"]*?vevo.com[^"]*?videoId=(?P<id>[\w]*)',
+ r'<link rel="video_src" href="[^"]*?vevo.com[^"]*?video=(?P<id>[\w]*)',
webpage)
if m_vevo is not None:
vevo_id = m_vevo.group('id')
@@ -112,7 +116,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
embed_page = self._download_webpage(embed_url, video_id,
'Downloading embed page')
info = self._search_regex(r'var info = ({.*?}),$', embed_page,
- 'video info', flags=re.MULTILINE)
+ 'video info', flags=re.MULTILINE)
info = json.loads(info)
if info.get('error') is not None:
msg = 'Couldn\'t get video, Dailymotion says: %s' % info['error']['title']
@@ -206,7 +210,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
if re.search(self._MORE_PAGES_INDICATOR, webpage) is None:
break
return [self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion')
- for video_id in orderedSet(video_ids)]
+ for video_id in orderedSet(video_ids)]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py
index 45d66e2e6..934da765e 100644
--- a/youtube_dl/extractor/daum.py
+++ b/youtube_dl/extractor/daum.py
@@ -5,7 +5,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
)
@@ -38,7 +38,7 @@ class DaumIE(InfoExtractor):
canonical_url = 'http://tvpot.daum.net/v/%s' % video_id
webpage = self._download_webpage(canonical_url, video_id)
full_id = self._search_regex(
- r'<iframe src="http://videofarm.daum.net/controller/video/viewer/Video.html\?.*?vid=(.+?)[&"]',
+ r'src=["\']http://videofarm\.daum\.net/controller/video/viewer/Video\.html\?.*?vid=(.+?)[&"\']',
webpage, 'full id')
query = compat_urllib_parse.urlencode({'vid': full_id})
info = self._download_xml(
diff --git a/youtube_dl/extractor/dbtv.py b/youtube_dl/extractor/dbtv.py
index 1d3e2ff08..212217625 100644
--- a/youtube_dl/extractor/dbtv.py
+++ b/youtube_dl/extractor/dbtv.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_str
from ..utils import (
float_or_none,
int_or_none,
@@ -61,7 +62,7 @@ class DBTVIE(InfoExtractor):
self._sort_formats(formats)
return {
- 'id': video['id'],
+ 'id': compat_str(video['id']),
'display_id': display_id,
'title': video['title'],
'description': clean_html(video['desc']),
diff --git a/youtube_dl/extractor/defense.py b/youtube_dl/extractor/defense.py
index c5529f8d4..5e50c63d9 100644
--- a/youtube_dl/extractor/defense.py
+++ b/youtube_dl/extractor/defense.py
@@ -9,7 +9,7 @@ from .common import InfoExtractor
class DefenseGouvFrIE(InfoExtractor):
IE_NAME = 'defense.gouv.fr'
_VALID_URL = (r'http://.*?\.defense\.gouv\.fr/layout/set/'
- r'ligthboxvideo/base-de-medias/webtv/(.*)')
+ r'ligthboxvideo/base-de-medias/webtv/(.*)')
_TEST = {
'url': 'http://www.defense.gouv.fr/layout/set/ligthboxvideo/base-de-medias/webtv/attaque-chimique-syrienne-du-21-aout-2013-1',
@@ -26,13 +26,13 @@ class DefenseGouvFrIE(InfoExtractor):
video_id = self._search_regex(
r"flashvars.pvg_id=\"(\d+)\";",
webpage, 'ID')
-
+
json_url = ('http://static.videos.gouv.fr/brightcovehub/export/json/'
- + video_id)
+ + video_id)
info = self._download_webpage(json_url, title,
- 'Downloading JSON config')
+ 'Downloading JSON config')
video_url = json.loads(info)['renditions'][0]['url']
-
+
return {'id': video_id,
'ext': 'mp4',
'url': video_url,
diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py
index 554df6735..d3e667528 100644
--- a/youtube_dl/extractor/discovery.py
+++ b/youtube_dl/extractor/discovery.py
@@ -1,47 +1,45 @@
from __future__ import unicode_literals
-import re
-import json
-
from .common import InfoExtractor
+from ..utils import (
+ parse_iso8601,
+ int_or_none,
+)
class DiscoveryIE(InfoExtractor):
- _VALID_URL = r'http://www\.discovery\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9\-]*)(.htm)?'
+ _VALID_URL = r'http://www\.discovery\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9_\-]*)(?:\.htm)?'
_TEST = {
'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mission-impossible-outtakes.htm',
- 'md5': 'e12614f9ee303a6ccef415cb0793eba2',
+ 'md5': '3c69d77d9b0d82bfd5e5932a60f26504',
'info_dict': {
- 'id': '614784',
- 'ext': 'mp4',
- 'title': 'MythBusters: Mission Impossible Outtakes',
+ 'id': 'mission-impossible-outtakes',
+ 'ext': 'flv',
+ 'title': 'Mission Impossible Outtakes',
'description': ('Watch Jamie Hyneman and Adam Savage practice being'
- ' each other -- to the point of confusing Jamie\'s dog -- and '
- 'don\'t miss Adam moon-walking as Jamie ... behind Jamie\'s'
- ' back.'),
+ ' each other -- to the point of confusing Jamie\'s dog -- and '
+ 'don\'t miss Adam moon-walking as Jamie ... behind Jamie\'s'
+ ' back.'),
'duration': 156,
+ 'timestamp': 1303099200,
+ 'upload_date': '20110418',
},
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- video_list_json = self._search_regex(r'var videoListJSON = ({.*?});',
- webpage, 'video list', flags=re.DOTALL)
- video_list = json.loads(video_list_json)
- info = video_list['clips'][0]
- formats = []
- for f in info['mp4']:
- formats.append(
- {'url': f['src'], 'ext': 'mp4', 'tbr': int(f['bitrate'][:-1])})
+ info = self._parse_json(self._search_regex(
+ r'(?s)<script type="application/ld\+json">(.*?)</script>',
+ webpage, 'video info'), video_id)
return {
- 'id': info['contentId'],
- 'title': video_list['name'],
- 'formats': formats,
- 'description': info['videoCaption'],
- 'thumbnail': info.get('videoStillURL') or info.get('thumbnailURL'),
- 'duration': info['duration'],
+ 'id': video_id,
+ 'title': info['name'],
+ 'url': info['contentURL'],
+ 'description': info.get('description'),
+ 'thumbnail': info.get('thumbnailUrl'),
+ 'timestamp': parse_iso8601(info.get('uploadDate')),
+ 'duration': int_or_none(info.get('duration')),
}
diff --git a/youtube_dl/extractor/dotsub.py b/youtube_dl/extractor/dotsub.py
index 5ae0ad5b6..638bb33cd 100644
--- a/youtube_dl/extractor/dotsub.py
+++ b/youtube_dl/extractor/dotsub.py
@@ -27,7 +27,7 @@ class DotsubIE(InfoExtractor):
video_id = mobj.group('id')
info_url = "https://dotsub.com/api/media/%s/metadata" % video_id
info = self._download_json(info_url, video_id)
- date = time.gmtime(info['dateCreated']/1000) # The timestamp is in miliseconds
+ date = time.gmtime(info['dateCreated'] / 1000) # The timestamp is in miliseconds
return {
'id': video_id,
diff --git a/youtube_dl/extractor/drbonanza.py b/youtube_dl/extractor/drbonanza.py
new file mode 100644
index 000000000..7626219ba
--- /dev/null
+++ b/youtube_dl/extractor/drbonanza.py
@@ -0,0 +1,131 @@
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ parse_iso8601,
+)
+
+
+class DRBonanzaIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?dr\.dk/bonanza/(?:[^/]+/)+(?:[^/])+?(?:assetId=(?P<id>\d+))?(?:[#&]|$)'
+
+ _TESTS = [{
+ 'url': 'http://www.dr.dk/bonanza/serie/portraetter/Talkshowet.htm?assetId=65517',
+ 'md5': 'fe330252ddea607635cf2eb2c99a0af3',
+ 'info_dict': {
+ 'id': '65517',
+ 'ext': 'mp4',
+ 'title': 'Talkshowet - Leonard Cohen',
+ 'description': 'md5:8f34194fb30cd8c8a30ad8b27b70c0ca',
+ 'thumbnail': 're:^https?://.*\.(?:gif|jpg)$',
+ 'timestamp': 1295537932,
+ 'upload_date': '20110120',
+ 'duration': 3664,
+ },
+ }, {
+ 'url': 'http://www.dr.dk/bonanza/radio/serie/sport/fodbold.htm?assetId=59410',
+ 'md5': '6dfe039417e76795fb783c52da3de11d',
+ 'info_dict': {
+ 'id': '59410',
+ 'ext': 'mp3',
+ 'title': 'EM fodbold 1992 Danmark - Tyskland finale Transmission',
+ 'description': 'md5:501e5a195749480552e214fbbed16c4e',
+ 'thumbnail': 're:^https?://.*\.(?:gif|jpg)$',
+ 'timestamp': 1223274900,
+ 'upload_date': '20081006',
+ 'duration': 7369,
+ },
+ }]
+
+ def _real_extract(self, url):
+ url_id = self._match_id(url)
+ webpage = self._download_webpage(url, url_id)
+
+ if url_id:
+ info = json.loads(self._html_search_regex(r'({.*?%s.*})' % url_id, webpage, 'json'))
+ else:
+ # Just fetch the first video on that page
+ info = json.loads(self._html_search_regex(r'bonanzaFunctions.newPlaylist\(({.*})\)', webpage, 'json'))
+
+ asset_id = str(info['AssetId'])
+ title = info['Title'].rstrip(' \'\"-,.:;!?')
+ duration = int_or_none(info.get('Duration'), scale=1000)
+ # First published online. "FirstPublished" contains the date for original airing.
+ timestamp = parse_iso8601(
+ re.sub(r'\.\d+$', '', info['Created']))
+
+ def parse_filename_info(url):
+ match = re.search(r'/\d+_(?P<width>\d+)x(?P<height>\d+)x(?P<bitrate>\d+)K\.(?P<ext>\w+)$', url)
+ if match:
+ return {
+ 'width': int(match.group('width')),
+ 'height': int(match.group('height')),
+ 'vbr': int(match.group('bitrate')),
+ 'ext': match.group('ext')
+ }
+ match = re.search(r'/\d+_(?P<bitrate>\d+)K\.(?P<ext>\w+)$', url)
+ if match:
+ return {
+ 'vbr': int(match.group('bitrate')),
+ 'ext': match.group(2)
+ }
+ return {}
+
+ video_types = ['VideoHigh', 'VideoMid', 'VideoLow']
+ preferencemap = {
+ 'VideoHigh': -1,
+ 'VideoMid': -2,
+ 'VideoLow': -3,
+ 'Audio': -4,
+ }
+
+ formats = []
+ for file in info['Files']:
+ if info['Type'] == "Video":
+ if file['Type'] in video_types:
+ format = parse_filename_info(file['Location'])
+ format.update({
+ 'url': file['Location'],
+ 'format_id': file['Type'].replace('Video', ''),
+ 'preference': preferencemap.get(file['Type'], -10),
+ })
+ formats.append(format)
+ elif file['Type'] == "Thumb":
+ thumbnail = file['Location']
+ elif info['Type'] == "Audio":
+ if file['Type'] == "Audio":
+ format = parse_filename_info(file['Location'])
+ format.update({
+ 'url': file['Location'],
+ 'format_id': file['Type'],
+ 'vcodec': 'none',
+ })
+ formats.append(format)
+ elif file['Type'] == "Thumb":
+ thumbnail = file['Location']
+
+ description = '%s\n%s\n%s\n' % (
+ info['Description'], info['Actors'], info['Colophon'])
+
+ for f in formats:
+ f['url'] = f['url'].replace('rtmp://vod-bonanza.gss.dr.dk/bonanza/', 'http://vodfiles.dr.dk/')
+ f['url'] = f['url'].replace('mp4:bonanza', 'bonanza')
+ self._sort_formats(formats)
+
+ display_id = re.sub(r'[^\w\d-]', '', re.sub(r' ', '-', title.lower())) + '-' + asset_id
+ display_id = re.sub(r'-+', '-', display_id)
+
+ return {
+ 'id': asset_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'formats': formats,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ }
diff --git a/youtube_dl/extractor/dropbox.py b/youtube_dl/extractor/dropbox.py
index 5f24ac721..14b6c00b0 100644
--- a/youtube_dl/extractor/dropbox.py
+++ b/youtube_dl/extractor/dropbox.py
@@ -5,23 +5,24 @@ import os.path
import re
from .common import InfoExtractor
-from ..utils import compat_urllib_parse_unquote, url_basename
+from ..compat import compat_urllib_parse_unquote
+from ..utils import url_basename
class DropboxIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?dropbox[.]com/sh?/(?P<id>[a-zA-Z0-9]{15})/.*'
- _TESTS = [{
- 'url': 'https://www.dropbox.com/s/nelirfsxnmcfbfh/youtube-dl%20test%20video%20%27%C3%A4%22BaW_jenozKc.mp4?dl=0',
- 'info_dict': {
- 'id': 'nelirfsxnmcfbfh',
- 'ext': 'mp4',
- 'title': 'youtube-dl test video \'ä"BaW_jenozKc'
- }
- },
- {
- 'url': 'https://www.dropbox.com/sh/662glsejgzoj9sr/AAByil3FGH9KFNZ13e08eSa1a/Pregame%20Ceremony%20Program%20PA%2020140518.m4v',
- 'only_matching': True,
- },
+ _TESTS = [
+ {
+ 'url': 'https://www.dropbox.com/s/nelirfsxnmcfbfh/youtube-dl%20test%20video%20%27%C3%A4%22BaW_jenozKc.mp4?dl=0',
+ 'info_dict': {
+ 'id': 'nelirfsxnmcfbfh',
+ 'ext': 'mp4',
+ 'title': 'youtube-dl test video \'ä"BaW_jenozKc'
+ }
+ }, {
+ 'url': 'https://www.dropbox.com/sh/662glsejgzoj9sr/AAByil3FGH9KFNZ13e08eSa1a/Pregame%20Ceremony%20Program%20PA%2020140518.m4v',
+ 'only_matching': True,
+ },
]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py
index 9d6ce1f48..c44adb109 100644
--- a/youtube_dl/extractor/drtv.py
+++ b/youtube_dl/extractor/drtv.py
@@ -1,14 +1,12 @@
from __future__ import unicode_literals
-import re
-
from .subtitles import SubtitlesInfoExtractor
from .common import ExtractorError
from ..utils import parse_iso8601
class DRTVIE(SubtitlesInfoExtractor):
- _VALID_URL = r'http://(?:www\.)?dr\.dk/tv/se/(?:[^/]+/)+(?P<id>[\da-z-]+)(?:[/#?]|$)'
+ _VALID_URL = r'https?://(?:www\.)?dr\.dk/tv/se/(?:[^/]+/)+(?P<id>[\da-z-]+)(?:[/#?]|$)'
_TEST = {
'url': 'http://www.dr.dk/tv/se/partiets-mand/partiets-mand-7-8',
@@ -25,8 +23,7 @@ class DRTVIE(SubtitlesInfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
programcard = self._download_json(
'http://www.dr.dk/mu/programcard/expanded/%s' % video_id, video_id, 'Downloading video JSON')
@@ -35,7 +32,7 @@ class DRTVIE(SubtitlesInfoExtractor):
title = data['Title']
description = data['Description']
- timestamp = parse_iso8601(data['CreatedTime'][:-5])
+ timestamp = parse_iso8601(data['CreatedTime'])
thumbnail = None
duration = None
diff --git a/youtube_dl/extractor/dvtv.py b/youtube_dl/extractor/dvtv.py
new file mode 100644
index 000000000..c1a4bc757
--- /dev/null
+++ b/youtube_dl/extractor/dvtv.py
@@ -0,0 +1,125 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ js_to_json,
+ unescapeHTML,
+ ExtractorError,
+)
+
+
+class DVTVIE(InfoExtractor):
+ IE_NAME = 'dvtv'
+ IE_DESC = 'http://video.aktualne.cz/'
+
+ _VALID_URL = r'http://video\.aktualne\.cz/(?:[^/]+/)+r~(?P<id>[0-9a-f]{32})'
+
+ _TESTS = [{
+ 'url': 'http://video.aktualne.cz/dvtv/vondra-o-ceskem-stoleti-pri-pohledu-na-havla-mi-bylo-trapne/r~e5efe9ca855511e4833a0025900fea04/',
+ 'md5': '67cb83e4a955d36e1b5d31993134a0c2',
+ 'info_dict': {
+ 'id': 'dc0768de855511e49e4b0025900fea04',
+ 'ext': 'mp4',
+ 'title': 'Vondra o Českém století: Při pohledu na Havla mi bylo trapně',
+ }
+ }, {
+ 'url': 'http://video.aktualne.cz/dvtv/stropnicky-policie-vrbetice-preventivne-nekontrolovala/r~82ed4322849211e4a10c0025900fea04/',
+ 'md5': '6388f1941b48537dbd28791f712af8bf',
+ 'info_dict': {
+ 'id': '72c02230849211e49f60002590604f2e',
+ 'ext': 'mp4',
+ 'title': 'Stropnický: Policie Vrbětice preventivně nekontrolovala',
+ }
+ }, {
+ 'url': 'http://video.aktualne.cz/dvtv/dvtv-16-12-2014-utok-talibanu-boj-o-kliniku-uprchlici/r~973eb3bc854e11e498be002590604f2e/',
+ 'info_dict': {
+ 'title': 'DVTV 16. 12. 2014: útok Talibanu, boj o kliniku, uprchlíci',
+ 'id': '973eb3bc854e11e498be002590604f2e',
+ },
+ 'playlist': [{
+ 'md5': 'da7ca6be4935532241fa9520b3ad91e4',
+ 'info_dict': {
+ 'id': 'b0b40906854d11e4bdad0025900fea04',
+ 'ext': 'mp4',
+ 'title': 'Drtinová Veselovský TV 16. 12. 2014: Témata dne'
+ }
+ }, {
+ 'md5': '5f7652a08b05009c1292317b449ffea2',
+ 'info_dict': {
+ 'id': '420ad9ec854a11e4bdad0025900fea04',
+ 'ext': 'mp4',
+ 'title': 'Školní masakr možná změní boj s Talibanem, říká novinářka'
+ }
+ }, {
+ 'md5': '498eb9dfa97169f409126c617e2a3d64',
+ 'info_dict': {
+ 'id': '95d35580846a11e4b6d20025900fea04',
+ 'ext': 'mp4',
+ 'title': 'Boj o kliniku: Veřejný zájem, nebo právo na majetek?'
+ }
+ }, {
+ 'md5': 'b8dc6b744844032dab6ba3781a7274b9',
+ 'info_dict': {
+ 'id': '6fe14d66853511e4833a0025900fea04',
+ 'ext': 'mp4',
+ 'title': 'Pánek: Odmítání syrských uprchlíků je ostudou české vlády'
+ }
+ }],
+ }, {
+ 'url': 'http://video.aktualne.cz/v-cechach-poprve-zazni-zelenkova-zrestaurovana-mse/r~45b4b00483ec11e4883b002590604f2e/',
+ 'only_matching': True,
+ }]
+
+ def _parse_video_metadata(self, js, video_id):
+ metadata = self._parse_json(js, video_id, transform_source=js_to_json)
+
+ formats = []
+ for video in metadata['sources']:
+ ext = video['type'][6:]
+ formats.append({
+ 'url': video['file'],
+ 'ext': ext,
+ 'format_id': '%s-%s' % (ext, video['label']),
+ 'height': int(video['label'].rstrip('p')),
+ 'fps': 25,
+ })
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': metadata['mediaid'],
+ 'title': unescapeHTML(metadata['title']),
+ 'thumbnail': self._proto_relative_url(metadata['image'], 'http:'),
+ 'formats': formats
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ # single video
+ item = self._search_regex(
+ r"(?s)embedData[0-9a-f]{32}\['asset'\]\s*=\s*(\{.+?\});",
+ webpage, 'video', default=None, fatal=False)
+
+ if item:
+ return self._parse_video_metadata(item, video_id)
+
+ # playlist
+ items = re.findall(
+ r"(?s)BBX\.context\.assets\['[0-9a-f]{32}'\]\.push\(({.+?})\);",
+ webpage)
+
+ if items:
+ return {
+ '_type': 'playlist',
+ 'id': video_id,
+ 'title': self._og_search_title(webpage),
+ 'entries': [self._parse_video_metadata(i, video_id) for i in items]
+ }
+
+ raise ExtractorError('Could not find neither video nor playlist')
diff --git a/youtube_dl/extractor/ebaumsworld.py b/youtube_dl/extractor/ebaumsworld.py
index 63c2549d3..b6bfd2b2d 100644
--- a/youtube_dl/extractor/ebaumsworld.py
+++ b/youtube_dl/extractor/ebaumsworld.py
@@ -1,7 +1,5 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
@@ -20,8 +18,7 @@ class EbaumsWorldIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
config = self._download_xml(
'http://www.ebaumsworld.com/video/player/%s' % video_id, video_id)
video_url = config.find('file').text
diff --git a/youtube_dl/extractor/echomsk.py b/youtube_dl/extractor/echomsk.py
new file mode 100644
index 000000000..d2d94049d
--- /dev/null
+++ b/youtube_dl/extractor/echomsk.py
@@ -0,0 +1,46 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class EchoMskIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?echo\.msk\.ru/sounds/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://www.echo.msk.ru/sounds/1464134.html',
+ 'md5': '2e44b3b78daff5b458e4dbc37f191f7c',
+ 'info_dict': {
+ 'id': '1464134',
+ 'ext': 'mp3',
+ 'title': 'Особое мнение - 29 декабря 2014, 19:08',
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ audio_url = self._search_regex(
+ r'<a rel="mp3" href="([^"]+)">', webpage, 'audio URL')
+
+ title = self._html_search_regex(
+ r'<a href="/programs/[^"]+" target="_blank">([^<]+)</a>',
+ webpage, 'title')
+
+ air_date = self._html_search_regex(
+ r'(?s)<div class="date">(.+?)</div>',
+ webpage, 'date', fatal=False, default=None)
+
+ if air_date:
+ air_date = re.sub(r'(\s)\1+', r'\1', air_date)
+ if air_date:
+ title = '%s - %s' % (title, air_date)
+
+ return {
+ 'id': video_id,
+ 'url': audio_url,
+ 'title': title,
+ }
diff --git a/youtube_dl/extractor/ehow.py b/youtube_dl/extractor/ehow.py
index f8f49a013..9cb1bf301 100644
--- a/youtube_dl/extractor/ehow.py
+++ b/youtube_dl/extractor/ehow.py
@@ -1,8 +1,6 @@
from __future__ import unicode_literals
-import re
-
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
)
from .common import InfoExtractor
@@ -24,11 +22,10 @@ class EHowIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- video_url = self._search_regex(r'(?:file|source)=(http[^\'"&]*)',
- webpage, 'video URL')
+ video_url = self._search_regex(
+ r'(?:file|source)=(http[^\'"&]*)', webpage, 'video URL')
final_url = compat_urllib_parse.unquote(video_url)
uploader = self._html_search_meta('uploader', webpage)
title = self._og_search_title(webpage).replace(' | eHow', '')
diff --git a/youtube_dl/extractor/eighttracks.py b/youtube_dl/extractor/eighttracks.py
index c1b4c729e..fb5dbbe2b 100644
--- a/youtube_dl/extractor/eighttracks.py
+++ b/youtube_dl/extractor/eighttracks.py
@@ -6,9 +6,12 @@ import random
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_str,
)
+from ..utils import (
+ ExtractorError,
+)
class EightTracksIE(InfoExtractor):
@@ -112,25 +115,41 @@ class EightTracksIE(InfoExtractor):
session = str(random.randint(0, 1000000000))
mix_id = data['id']
track_count = data['tracks_count']
+ duration = data['duration']
+ avg_song_duration = float(duration) / track_count
first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
next_url = first_url
entries = []
+
for i in range(track_count):
- api_json = self._download_webpage(
- next_url, playlist_id,
- note='Downloading song information %d/%d' % (i + 1, track_count),
- errnote='Failed to download song information')
+ api_json = None
+ download_tries = 0
+
+ while api_json is None:
+ try:
+ api_json = self._download_webpage(
+ next_url, playlist_id,
+ note='Downloading song information %d/%d' % (i + 1, track_count),
+ errnote='Failed to download song information')
+ except ExtractorError:
+ if download_tries > 3:
+ raise
+ else:
+ download_tries += 1
+ self._sleep(avg_song_duration, playlist_id)
+
api_data = json.loads(api_json)
track_data = api_data['set']['track']
info = {
'id': compat_str(track_data['id']),
'url': track_data['track_file_stream_url'],
- 'title': track_data['performer'] + u' - ' + track_data['name'],
+ 'title': track_data['performer'] + ' - ' + track_data['name'],
'raw_title': track_data['name'],
'uploader_id': data['user']['login'],
'ext': 'm4a',
}
entries.append(info)
+
next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (
session, mix_id, track_data['id'])
return {
diff --git a/youtube_dl/extractor/ellentv.py b/youtube_dl/extractor/ellentv.py
index 3e7923648..fc92ff825 100644
--- a/youtube_dl/extractor/ellentv.py
+++ b/youtube_dl/extractor/ellentv.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
import json
from .common import InfoExtractor
@@ -12,32 +11,49 @@ from ..utils import (
class EllenTVIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?ellentv\.com/videos/(?P<id>[a-z0-9_-]+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?(?:ellentv|ellentube)\.com/videos/(?P<id>[a-z0-9_-]+)'
+ _TESTS = [{
'url': 'http://www.ellentv.com/videos/0-7jqrsr18/',
'md5': 'e4af06f3bf0d5f471921a18db5764642',
'info_dict': {
'id': '0-7jqrsr18',
'ext': 'mp4',
'title': 'What\'s Wrong with These Photos? A Whole Lot',
+ 'description': 'md5:35f152dc66b587cf13e6d2cf4fa467f6',
'timestamp': 1406876400,
'upload_date': '20140801',
}
- }
+ }, {
+ 'url': 'http://ellentube.com/videos/0-dvzmabd5/',
+ 'md5': '98238118eaa2bbdf6ad7f708e3e4f4eb',
+ 'info_dict': {
+ 'id': '0-dvzmabd5',
+ 'ext': 'mp4',
+ 'title': '1 year old twin sister makes her brother laugh',
+ 'description': '1 year old twin sister makes her brother laugh',
+ 'timestamp': 1419542075,
+ 'upload_date': '20141225',
+ }
+ }]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
+ video_url = self._html_search_meta('VideoURL', webpage, 'url')
+ title = self._og_search_title(webpage, default=None) or self._search_regex(
+ r'pageName\s*=\s*"([^"]+)"', webpage, 'title')
+ description = self._html_search_meta(
+ 'description', webpage, 'description') or self._og_search_description(webpage)
timestamp = parse_iso8601(self._search_regex(
r'<span class="publish-date"><time datetime="([^"]+)">',
webpage, 'timestamp'))
return {
'id': video_id,
- 'title': self._og_search_title(webpage),
- 'url': self._html_search_meta('VideoURL', webpage, 'url'),
+ 'url': video_url,
+ 'title': title,
+ 'description': description,
'timestamp': timestamp,
}
@@ -55,8 +71,7 @@ class EllenTVClipsIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- playlist_id = mobj.group('id')
+ playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
playlist = self._extract_playlist(webpage)
diff --git a/youtube_dl/extractor/elpais.py b/youtube_dl/extractor/elpais.py
index 4277202a2..00a69e631 100644
--- a/youtube_dl/extractor/elpais.py
+++ b/youtube_dl/extractor/elpais.py
@@ -1,8 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..utils import unified_strdate
@@ -24,9 +22,7 @@ class ElPaisIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
prefix = self._html_search_regex(
diff --git a/youtube_dl/extractor/engadget.py b/youtube_dl/extractor/engadget.py
index 92ada81d2..4ea37ebd9 100644
--- a/youtube_dl/extractor/engadget.py
+++ b/youtube_dl/extractor/engadget.py
@@ -3,7 +3,6 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from .fivemin import FiveMinIE
from ..utils import (
url_basename,
)
@@ -27,11 +26,10 @@ class EngadgetIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
if video_id is not None:
- return FiveMinIE._build_result(video_id)
+ return self.url_result('5min:%s' % video_id)
else:
title = url_basename(url)
webpage = self._download_webpage(url, title)
@@ -39,5 +37,5 @@ class EngadgetIE(InfoExtractor):
return {
'_type': 'playlist',
'title': title,
- 'entries': [FiveMinIE._build_result(id) for id in ids]
+ 'entries': [self.url_result('5min:%s' % vid) for vid in ids]
}
diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py
index bb231ecb1..4de8d4bc5 100644
--- a/youtube_dl/extractor/eporner.py
+++ b/youtube_dl/extractor/eporner.py
@@ -20,7 +20,7 @@ class EpornerIE(InfoExtractor):
'display_id': 'Infamous-Tiffany-Teen-Strip-Tease-Video',
'ext': 'mp4',
'title': 'Infamous Tiffany Teen Strip Tease Video',
- 'duration': 194,
+ 'duration': 1838,
'view_count': int,
'age_limit': 18,
}
@@ -57,9 +57,7 @@ class EpornerIE(InfoExtractor):
formats.append(fmt)
self._sort_formats(formats)
- duration = parse_duration(self._search_regex(
- r'class="mbtim">([0-9:]+)</div>', webpage, 'duration',
- fatal=False))
+ duration = parse_duration(self._html_search_meta('duration', webpage))
view_count = str_to_int(self._search_regex(
r'id="cinemaviews">\s*([0-9,]+)\s*<small>views',
webpage, 'view count', fatal=False))
diff --git a/youtube_dl/extractor/eroprofile.py b/youtube_dl/extractor/eroprofile.py
new file mode 100644
index 000000000..79e2fbd39
--- /dev/null
+++ b/youtube_dl/extractor/eroprofile.py
@@ -0,0 +1,45 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class EroProfileIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?eroprofile\.com/m/videos/view/(?P<id>[^/]+)'
+ _TEST = {
+ 'url': 'http://www.eroprofile.com/m/videos/view/sexy-babe-softcore',
+ 'md5': 'c26f351332edf23e1ea28ce9ec9de32f',
+ 'info_dict': {
+ 'id': '3733775',
+ 'display_id': 'sexy-babe-softcore',
+ 'ext': 'm4v',
+ 'title': 'sexy babe softcore',
+ 'thumbnail': 're:https?://.*\.jpg',
+ 'age_limit': 18,
+ }
+ }
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+
+ video_id = self._search_regex(
+ [r"glbUpdViews\s*\('\d*','(\d+)'", r'p/report/video/(\d+)'],
+ webpage, 'video id', default=None)
+
+ video_url = self._search_regex(
+ r'<source src="([^"]+)', webpage, 'video url')
+ title = self._html_search_regex(
+ r'Title:</th><td>([^<]+)</td>', webpage, 'title')
+ thumbnail = self._search_regex(
+ r'onclick="showVideoPlayer\(\)"><img src="([^"]+)',
+ webpage, 'thumbnail', fatal=False)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'url': video_url,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'age_limit': 18,
+ }
diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py
index 476fc22b9..e240cb859 100644
--- a/youtube_dl/extractor/escapist.py
+++ b/youtube_dl/extractor/escapist.py
@@ -3,9 +3,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
-
+)
+from ..utils import (
ExtractorError,
)
diff --git a/youtube_dl/extractor/everyonesmixtape.py b/youtube_dl/extractor/everyonesmixtape.py
index d237a8281..d872d828f 100644
--- a/youtube_dl/extractor/everyonesmixtape.py
+++ b/youtube_dl/extractor/everyonesmixtape.py
@@ -3,8 +3,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_request,
+)
+from ..utils import (
ExtractorError,
)
diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py
index aacbf1414..36ba33128 100644
--- a/youtube_dl/extractor/extremetube.py
+++ b/youtube_dl/extractor/extremetube.py
@@ -3,16 +3,18 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse_urlparse,
compat_urllib_request,
compat_urllib_parse,
+)
+from ..utils import (
str_to_int,
)
class ExtremeTubeIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>extremetube\.com/.*?video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)'
+ _VALID_URL = r'https?://(?:www\.)?(?P<url>extremetube\.com/.*?video/.+?(?P<id>[0-9]+))(?:[/?&]|$)'
_TESTS = [{
'url': 'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431',
'md5': '1fb9228f5e3332ec8c057d6ac36f33e0',
@@ -31,7 +33,7 @@ class ExtremeTubeIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('videoid')
+ video_id = mobj.group('id')
url = 'http://www.' + mobj.group('url')
req = compat_urllib_request.Request(url)
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index 3ad993751..1ad4e77a8 100644
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -5,15 +5,18 @@ import re
import socket
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_http_client,
compat_str,
compat_urllib_error,
compat_urllib_parse,
compat_urllib_request,
- urlencode_postdata,
+)
+from ..utils import (
ExtractorError,
+ int_or_none,
limit_length,
+ urlencode_postdata,
)
@@ -34,7 +37,6 @@ class FacebookIE(InfoExtractor):
'info_dict': {
'id': '637842556329505',
'ext': 'mp4',
- 'duration': 38,
'title': 're:Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam',
}
}, {
@@ -58,8 +60,8 @@ class FacebookIE(InfoExtractor):
login_page_req = compat_urllib_request.Request(self._LOGIN_URL)
login_page_req.add_header('Cookie', 'locale=en_US')
login_page = self._download_webpage(login_page_req, None,
- note='Downloading login page',
- errnote='Unable to download login page')
+ note='Downloading login page',
+ errnote='Unable to download login page')
lsd = self._search_regex(
r'<input type="hidden" name="lsd" value="([^"]*)"',
login_page, 'lsd')
@@ -75,12 +77,12 @@ class FacebookIE(InfoExtractor):
'legacy_return': '1',
'timezone': '-60',
'trynum': '1',
- }
+ }
request = compat_urllib_request.Request(self._LOGIN_URL, urlencode_postdata(login_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
try:
login_results = self._download_webpage(request, None,
- note='Logging in', errnote='unable to fetch login page')
+ note='Logging in', errnote='unable to fetch login page')
if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
self._downloader.report_warning('unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
return
@@ -94,7 +96,7 @@ class FacebookIE(InfoExtractor):
check_req = compat_urllib_request.Request(self._CHECKPOINT_URL, urlencode_postdata(check_form))
check_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
check_response = self._download_webpage(check_req, None,
- note='Confirming login')
+ note='Confirming login')
if re.search(r'id="checkpointSubmitButton"', check_response) is not None:
self._downloader.report_warning('Unable to confirm login, you have to login in your brower and authorize the login.')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
@@ -105,9 +107,7 @@ class FacebookIE(InfoExtractor):
self._login()
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
webpage = self._download_webpage(url, video_id)
@@ -147,6 +147,6 @@ class FacebookIE(InfoExtractor):
'id': video_id,
'title': video_title,
'url': video_url,
- 'duration': int(video_data['video_duration']),
- 'thumbnail': video_data['thumbnail_src'],
+ 'duration': int_or_none(video_data.get('video_duration')),
+ 'thumbnail': video_data.get('thumbnail_src'),
}
diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py
index c6ab6952e..3c39ca451 100644
--- a/youtube_dl/extractor/faz.py
+++ b/youtube_dl/extractor/faz.py
@@ -1,49 +1,48 @@
# encoding: utf-8
-import re
+from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import (
- determine_ext,
-)
class FazIE(InfoExtractor):
- IE_NAME = u'faz.net'
+ IE_NAME = 'faz.net'
_VALID_URL = r'https?://www\.faz\.net/multimedia/videos/.*?-(?P<id>\d+)\.html'
_TEST = {
- u'url': u'http://www.faz.net/multimedia/videos/stockholm-chemie-nobelpreis-fuer-drei-amerikanische-forscher-12610585.html',
- u'file': u'12610585.mp4',
- u'info_dict': {
- u'title': u'Stockholm: Chemie-Nobelpreis für drei amerikanische Forscher',
- u'description': u'md5:1453fbf9a0d041d985a47306192ea253',
+ 'url': 'http://www.faz.net/multimedia/videos/stockholm-chemie-nobelpreis-fuer-drei-amerikanische-forscher-12610585.html',
+ 'info_dict': {
+ 'id': '12610585',
+ 'ext': 'mp4',
+ 'title': 'Stockholm: Chemie-Nobelpreis für drei amerikanische Forscher',
+ 'description': 'md5:1453fbf9a0d041d985a47306192ea253',
},
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- self.to_screen(video_id)
+ video_id = self._match_id(url)
+
webpage = self._download_webpage(url, video_id)
- config_xml_url = self._search_regex(r'writeFLV\(\'(.+?)\',', webpage,
- u'config xml url')
- config = self._download_xml(config_xml_url, video_id,
- u'Downloading config xml')
+ config_xml_url = self._search_regex(
+ r'writeFLV\(\'(.+?)\',', webpage, 'config xml url')
+ config = self._download_xml(
+ config_xml_url, video_id, 'Downloading config xml')
encodings = config.find('ENCODINGS')
formats = []
- for code in ['LOW', 'HIGH', 'HQ']:
+ for pref, code in enumerate(['LOW', 'HIGH', 'HQ']):
encoding = encodings.find(code)
if encoding is None:
continue
encoding_url = encoding.find('FILENAME').text
formats.append({
'url': encoding_url,
- 'ext': determine_ext(encoding_url),
'format_id': code.lower(),
+ 'quality': pref,
})
+ self._sort_formats(formats)
- descr = self._html_search_regex(r'<p class="Content Copy">(.*?)</p>', webpage, u'description')
+ descr = self._html_search_regex(
+ r'<p class="Content Copy">(.*?)</p>', webpage, 'description', fatal=False)
return {
'id': video_id,
'title': self._og_search_title(webpage),
diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py
index 3073ab0d4..1ccc1a964 100644
--- a/youtube_dl/extractor/fc2.py
+++ b/youtube_dl/extractor/fc2.py
@@ -1,23 +1,24 @@
#! -*- coding: utf-8 -*-
from __future__ import unicode_literals
-import re
import hashlib
from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
compat_urlparse,
)
+from ..utils import (
+ ExtractorError,
+)
class FC2IE(InfoExtractor):
- _VALID_URL = r'^http://video\.fc2\.com/((?P<lang>[^/]+)/)?(a/)?content/(?P<id>[^/]+)'
+ _VALID_URL = r'^http://video\.fc2\.com/(?:[^/]+/)?content/(?P<id>[^/]+)'
IE_NAME = 'fc2'
_NETRC_MACHINE = 'fc2'
- _TEST = {
+ _TESTS = [{
'url': 'http://video.fc2.com/en/content/20121103kUan1KHs',
'md5': 'a6ebe8ebe0396518689d963774a54eb7',
'info_dict': {
@@ -25,72 +26,79 @@ class FC2IE(InfoExtractor):
'ext': 'flv',
'title': 'Boxing again with Puff',
},
- }
-
- #def _real_initialize(self):
- # self._login()
+ }, {
+ 'url': 'http://video.fc2.com/en/content/20150125cEva0hDn/',
+ 'info_dict': {
+ 'id': '20150125cEva0hDn',
+ 'ext': 'mp4',
+ },
+ 'params': {
+ 'username': 'ytdl@yt-dl.org',
+ 'password': '(snip)',
+ 'skip': 'requires actual password'
+ }
+ }]
def _login(self):
(username, password) = self._get_login_info()
- if (username is None) or (password is None):
- self.to_screen('unable to log in: will be downloading in non authorized mode') # report_warning
- return False
+ if username is None or password is None:
+ return False
# Log in
login_form_strs = {
- 'email': username,
+ 'email': username,
'password': password,
- 'done': 'video',
- 'Submit': ' Login ',
+ 'done': 'video',
+ 'Submit': ' Login ',
}
# Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
# chokes on unicode
login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8')
- request = compat_urllib_request.Request(
+ request = compat_urllib_request.Request(
'https://secure.id.fc2.com/index.php?mode=login&switch_language=en', login_data)
login_results = self._download_webpage(request, None, note='Logging in', errnote='Unable to log in')
if 'mode=redirect&login=done' not in login_results:
- self.to_screen('unable to log in: bad username or password') # report_warning
+ self.report_warning('unable to log in: bad username or password')
return False
-
+
# this is also needed
login_redir = compat_urllib_request.Request('http://id.fc2.com/?mode=redirect&login=done')
- redir_res = self._download_webpage(login_redir, None, note='Login redirect', errnote='Something is not right')
+ self._download_webpage(
+ login_redir, None, note='Login redirect', errnote='Login redirect failed')
return True
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
+ self._login()
webpage = self._download_webpage(url, video_id)
self._downloader.cookiejar.clear_session_cookies() # must clear
self._login()
title = self._og_search_title(webpage)
thumbnail = self._og_search_thumbnail(webpage)
+ refer = url.replace('/content/', '/a/content/')
- refer = (url if '/a/content/' in url else url.replace('/content/', '/a/content/'));
mimi = hashlib.md5((video_id + '_gGddgPfeaf_gzyr').encode('utf-8')).hexdigest()
info_url = (
"http://video.fc2.com/ginfo.php?mimi={1:s}&href={2:s}&v={0:s}&fversion=WIN%2011%2C6%2C602%2C180&from=2&otag=0&upid={0:s}&tk=null&".
- format(video_id, mimi, compat_urllib_request.quote(refer, safe='').replace('.','%2E')))
+ format(video_id, mimi, compat_urllib_request.quote(refer, safe='').replace('.', '%2E')))
info_webpage = self._download_webpage(
info_url, video_id, note='Downloading info page')
info = compat_urlparse.parse_qs(info_webpage)
if 'err_code' in info:
- #raise ExtractorError('Error code: %s' % info['err_code'][0])
# most of the time we can still download wideo even if err_code is 403 or 602
- self.to_screen('Error code was: %s... but still trying' % info['err_code'][0]) # report_warning
+ self.report_warning(
+ 'Error code was: %s... but still trying' % info['err_code'][0])
if 'filepath' not in info:
- raise ExtractorError('Cannot download file. Are you logged?')
+ raise ExtractorError('Cannot download file. Are you logged in?')
video_url = info['filepath'][0] + '?mid=' + info['mid'][0]
title_info = info.get('title')
diff --git a/youtube_dl/extractor/firedrive.py b/youtube_dl/extractor/firedrive.py
index af439ccfe..3191116d9 100644
--- a/youtube_dl/extractor/firedrive.py
+++ b/youtube_dl/extractor/firedrive.py
@@ -4,11 +4,13 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
)
+from ..utils import (
+ ExtractorError,
+)
class FiredriveIE(InfoExtractor):
@@ -28,11 +30,8 @@ class FiredriveIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
url = 'http://firedrive.com/file/%s' % video_id
-
webpage = self._download_webpage(url, video_id)
if re.search(self._FILE_DELETED_REGEX, webpage) is not None:
diff --git a/youtube_dl/extractor/firsttv.py b/youtube_dl/extractor/firsttv.py
index c2e987ff7..08ceee4ed 100644
--- a/youtube_dl/extractor/firsttv.py
+++ b/youtube_dl/extractor/firsttv.py
@@ -44,9 +44,9 @@ class FirstTVIE(InfoExtractor):
duration = self._og_search_property('video:duration', webpage, 'video duration', fatal=False)
like_count = self._html_search_regex(r'title="Понравилось".*?/></label> \[(\d+)\]',
- webpage, 'like count', fatal=False)
+ webpage, 'like count', fatal=False)
dislike_count = self._html_search_regex(r'title="Не понравилось".*?/></label> \[(\d+)\]',
- webpage, 'dislike count', fatal=False)
+ webpage, 'dislike count', fatal=False)
return {
'id': video_id,
@@ -57,4 +57,4 @@ class FirstTVIE(InfoExtractor):
'duration': int_or_none(duration),
'like_count': int_or_none(like_count),
'dislike_count': int_or_none(dislike_count),
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/fivemin.py b/youtube_dl/extractor/fivemin.py
index 3a50bab5c..5b24b921c 100644
--- a/youtube_dl/extractor/fivemin.py
+++ b/youtube_dl/extractor/fivemin.py
@@ -1,11 +1,11 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_str,
compat_urllib_parse,
+)
+from ..utils import (
ExtractorError,
)
@@ -13,7 +13,7 @@ from ..utils import (
class FiveMinIE(InfoExtractor):
IE_NAME = '5min'
_VALID_URL = r'''(?x)
- (?:https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(.*?&)?playList=|
+ (?:https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(?:.*?&)?playList=|
5min:)
(?P<id>\d+)
'''
@@ -41,16 +41,11 @@ class FiveMinIE(InfoExtractor):
},
]
- @classmethod
- def _build_result(cls, video_id):
- return cls.url_result('5min:%s' % video_id, cls.ie_key())
-
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
embed_url = 'https://embed.5min.com/playerseed/?playList=%s' % video_id
embed_page = self._download_webpage(embed_url, video_id,
- 'Downloading embed page')
+ 'Downloading embed page')
sid = self._search_regex(r'sid=(\d+)', embed_page, 'sid')
query = compat_urllib_parse.urlencode({
'func': 'GetResults',
diff --git a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py
index d7048c8c1..190d9f9ad 100644
--- a/youtube_dl/extractor/fktv.py
+++ b/youtube_dl/extractor/fktv.py
@@ -1,64 +1,69 @@
+from __future__ import unicode_literals
+
import re
import random
import json
from .common import InfoExtractor
from ..utils import (
- determine_ext,
get_element_by_id,
clean_html,
)
class FKTVIE(InfoExtractor):
- IE_NAME = u'fernsehkritik.tv'
- _VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik\.tv/folge-(?P<ep>[0-9]+)(?:/.*)?'
+ IE_NAME = 'fernsehkritik.tv'
+ _VALID_URL = r'http://(?:www\.)?fernsehkritik\.tv/folge-(?P<id>[0-9]+)(?:/.*)?'
_TEST = {
- u'url': u'http://fernsehkritik.tv/folge-1',
- u'file': u'00011.flv',
- u'info_dict': {
- u'title': u'Folge 1 vom 10. April 2007',
- u'description': u'md5:fb4818139c7cfe6907d4b83412a6864f',
+ 'url': 'http://fernsehkritik.tv/folge-1',
+ 'info_dict': {
+ 'id': '00011',
+ 'ext': 'flv',
+ 'title': 'Folge 1 vom 10. April 2007',
+ 'description': 'md5:fb4818139c7cfe6907d4b83412a6864f',
},
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- episode = int(mobj.group('ep'))
+ episode = int(self._match_id(url))
- server = random.randint(2, 4)
- video_thumbnail = 'http://fernsehkritik.tv/images/magazin/folge%d.jpg' % episode
- start_webpage = self._download_webpage('http://fernsehkritik.tv/folge-%d/Start' % episode,
- episode)
+ video_thumbnail = 'http://fernsehkritik.tv/images/magazin/folge%s.jpg' % episode
+ start_webpage = self._download_webpage('http://fernsehkritik.tv/folge-%s/Start' % episode,
+ episode)
playlist = self._search_regex(r'playlist = (\[.*?\]);', start_webpage,
- u'playlist', flags=re.DOTALL)
+ 'playlist', flags=re.DOTALL)
files = json.loads(re.sub('{[^{}]*?}', '{}', playlist))
- # TODO: return a single multipart video
+
videos = []
for i, _ in enumerate(files, 1):
video_id = '%04d%d' % (episode, i)
- video_url = 'http://dl%d.fernsehkritik.tv/fernsehkritik%d%s.flv' % (server, episode, '' if i == 1 else '-%d' % i)
+ video_url = 'http://fernsehkritik.tv/js/directme.php?file=%s%s.flv' % (episode, '' if i == 1 else '-%d' % i)
videos.append({
+ 'ext': 'flv',
'id': video_id,
'url': video_url,
- 'ext': determine_ext(video_url),
'title': clean_html(get_element_by_id('eptitle', start_webpage)),
'description': clean_html(get_element_by_id('contentlist', start_webpage)),
'thumbnail': video_thumbnail
})
- return videos
+ return {
+ '_type': 'multi_video',
+ 'entries': videos,
+ 'id': 'folge-%s' % episode,
+ }
class FKTVPosteckeIE(InfoExtractor):
- IE_NAME = u'fernsehkritik.tv:postecke'
- _VALID_URL = r'(?:http://)?(?:www\.)?fernsehkritik\.tv/inline-video/postecke\.php\?(.*&)?ep=(?P<ep>[0-9]+)(&|$)'
+ IE_NAME = 'fernsehkritik.tv:postecke'
+ _VALID_URL = r'http://(?:www\.)?fernsehkritik\.tv/inline-video/postecke\.php\?(.*&)?ep=(?P<ep>[0-9]+)(&|$)'
_TEST = {
- u'url': u'http://fernsehkritik.tv/inline-video/postecke.php?iframe=true&width=625&height=440&ep=120',
- u'file': u'0120.flv',
- u'md5': u'262f0adbac80317412f7e57b4808e5c4',
- u'info_dict': {
- u"title": u"Postecke 120"
+ 'url': 'http://fernsehkritik.tv/inline-video/postecke.php?iframe=true&width=625&height=440&ep=120',
+ 'md5': '262f0adbac80317412f7e57b4808e5c4',
+ 'info_dict': {
+ 'id': '0120',
+ 'ext': 'flv',
+ 'title': 'Postecke 120',
}
}
@@ -71,8 +76,7 @@ class FKTVPosteckeIE(InfoExtractor):
video_url = 'http://dl%d.fernsehkritik.tv/postecke/postecke%d.flv' % (server, episode)
video_title = 'Postecke %d' % episode
return {
- 'id': video_id,
- 'url': video_url,
- 'ext': determine_ext(video_url),
- 'title': video_title,
+ 'id': video_id,
+ 'url': video_url,
+ 'title': video_title,
}
diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py
index e09982e88..0c858b654 100644
--- a/youtube_dl/extractor/flickr.py
+++ b/youtube_dl/extractor/flickr.py
@@ -17,8 +17,8 @@ class FlickrIE(InfoExtractor):
'info_dict': {
'id': '5645318632',
'ext': 'mp4',
- "description": "Waterfalls in the Springtime at Dark Hollow Waterfalls. These are located just off of Skyline Drive in Virginia. They are only about 6/10 of a mile hike but it is a pretty steep hill and a good climb back up.",
- "uploader_id": "forestwander-nature-pictures",
+ "description": "Waterfalls in the Springtime at Dark Hollow Waterfalls. These are located just off of Skyline Drive in Virginia. They are only about 6/10 of a mile hike but it is a pretty steep hill and a good climb back up.",
+ "uploader_id": "forestwander-nature-pictures",
"title": "Dark Hollow Waterfalls"
}
}
@@ -37,7 +37,7 @@ class FlickrIE(InfoExtractor):
first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
- first_xml, 'node_id')
+ first_xml, 'node_id')
second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
diff --git a/youtube_dl/extractor/folketinget.py b/youtube_dl/extractor/folketinget.py
new file mode 100644
index 000000000..68e2db943
--- /dev/null
+++ b/youtube_dl/extractor/folketinget.py
@@ -0,0 +1,75 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_parse_qs
+from ..utils import (
+ int_or_none,
+ parse_duration,
+ parse_iso8601,
+ xpath_text,
+)
+
+
+class FolketingetIE(InfoExtractor):
+ IE_DESC = 'Folketinget (ft.dk; Danish parliament)'
+ _VALID_URL = r'https?://(?:www\.)?ft\.dk/webtv/video/[^?#]*?\.(?P<id>[0-9]+)\.aspx'
+ _TEST = {
+ 'url': 'http://www.ft.dk/webtv/video/20141/eru/td.1165642.aspx?as=1#player',
+ 'info_dict': {
+ 'id': '1165642',
+ 'ext': 'mp4',
+ 'title': 'Åbent samråd i Erhvervsudvalget',
+ 'description': 'Åbent samråd med erhvervs- og vækstministeren om regeringens politik på teleområdet',
+ 'view_count': int,
+ 'width': 768,
+ 'height': 432,
+ 'tbr': 928000,
+ 'timestamp': 1416493800,
+ 'upload_date': '20141120',
+ 'duration': 3960,
+ },
+ 'params': {
+ 'skip_download': 'rtmpdump required',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._og_search_title(webpage)
+ description = self._html_search_regex(
+ r'(?s)<div class="video-item-agenda"[^>]*>(.*?)<',
+ webpage, 'description', fatal=False)
+
+ player_params = compat_parse_qs(self._search_regex(
+ r'<embed src="http://ft\.arkena\.tv/flash/ftplayer\.swf\?([^"]+)"',
+ webpage, 'player params'))
+ xml_url = player_params['xml'][0]
+ doc = self._download_xml(xml_url, video_id)
+
+ timestamp = parse_iso8601(xpath_text(doc, './/date'))
+ duration = parse_duration(xpath_text(doc, './/duration'))
+ width = int_or_none(xpath_text(doc, './/width'))
+ height = int_or_none(xpath_text(doc, './/height'))
+ view_count = int_or_none(xpath_text(doc, './/views'))
+
+ formats = [{
+ 'format_id': n.attrib['bitrate'],
+ 'url': xpath_text(n, './url', fatal=True),
+ 'tbr': int_or_none(n.attrib['bitrate']),
+ } for n in doc.findall('.//streams/stream')]
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'description': description,
+ 'timestamp': timestamp,
+ 'width': width,
+ 'height': height,
+ 'duration': duration,
+ 'view_count': view_count,
+ }
diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py
index 7d56b9be9..b2284ab01 100644
--- a/youtube_dl/extractor/fourtube.py
+++ b/youtube_dl/extractor/fourtube.py
@@ -3,12 +3,13 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_request,
- unified_strdate,
- str_to_int,
+)
+from ..utils import (
parse_duration,
- clean_html,
+ parse_iso8601,
+ str_to_int,
)
@@ -26,70 +27,81 @@ class FourTubeIE(InfoExtractor):
'uploader': 'WCP Club',
'uploader_id': 'wcp-club',
'upload_date': '20131031',
+ 'timestamp': 1383263892,
'duration': 583,
+ 'view_count': int,
+ 'like_count': int,
+ 'categories': list,
}
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
- video_id = mobj.group('id')
- webpage_url = 'http://www.4tube.com/videos/' + video_id
- webpage = self._download_webpage(webpage_url, video_id)
+ title = self._html_search_meta('name', webpage)
+ timestamp = parse_iso8601(self._html_search_meta(
+ 'uploadDate', webpage))
+ thumbnail = self._html_search_meta('thumbnailUrl', webpage)
+ uploader_id = self._html_search_regex(
+ r'<a class="img-avatar" href="[^"]+/channels/([^/"]+)" title="Go to [^"]+ page">',
+ webpage, 'uploader id')
+ uploader = self._html_search_regex(
+ r'<a class="img-avatar" href="[^"]+/channels/[^/"]+" title="Go to ([^"]+) page">',
+ webpage, 'uploader')
- self.report_extraction(video_id)
+ categories_html = self._search_regex(
+ r'(?s)><i class="icon icon-tag"></i>\s*Categories / Tags\s*.*?<ul class="list">(.*?)</ul>',
+ webpage, 'categories', fatal=False)
+ categories = None
+ if categories_html:
+ categories = [
+ c.strip() for c in re.findall(
+ r'(?s)<li><a.*?>(.*?)</a>', categories_html)]
- playlist_json = self._html_search_regex(r'var playerConfigPlaylist\s+=\s+([^;]+)', webpage, 'Playlist')
- media_id = self._search_regex(r'idMedia:\s*(\d+)', playlist_json, 'Media Id')
- sources = self._search_regex(r'sources:\s*\[([^\]]*)\]', playlist_json, 'Sources').split(',')
- title = self._search_regex(r'title:\s*"([^"]*)', playlist_json, 'Title')
- thumbnail_url = self._search_regex(r'image:\s*"([^"]*)', playlist_json, 'Thumbnail', fatal=False)
+ view_count = str_to_int(self._search_regex(
+ r'<meta itemprop="interactionCount" content="UserPlays:([0-9,]+)">',
+ webpage, 'view count', fatal=False))
+ like_count = str_to_int(self._search_regex(
+ r'<meta itemprop="interactionCount" content="UserLikes:([0-9,]+)">',
+ webpage, 'like count', fatal=False))
+ duration = parse_duration(self._html_search_meta('duration', webpage))
- uploader_str = self._search_regex(r'<span>Uploaded by</span>(.*?)<span>', webpage, 'uploader', fatal=False)
- mobj = re.search(r'<a href="/sites/(?P<id>[^"]+)"><strong>(?P<name>[^<]+)</strong></a>', uploader_str)
- (uploader, uploader_id) = (mobj.group('name'), mobj.group('id')) if mobj else (clean_html(uploader_str), None)
+ params_js = self._search_regex(
+ r'\$\.ajax\(url,\ opts\);\s*\}\s*\}\)\(([0-9,\[\] ]+)\)',
+ webpage, 'initialization parameters'
+ )
+ params = self._parse_json('[%s]' % params_js, video_id)
+ media_id = params[0]
+ sources = ['%s' % p for p in params[2]]
- upload_date = None
- view_count = None
- duration = None
- description = self._html_search_meta('description', webpage, 'description')
- if description:
- upload_date = self._search_regex(r'Published Date: (\d{2} [a-zA-Z]{3} \d{4})', description, 'upload date',
- fatal=False)
- if upload_date:
- upload_date = unified_strdate(upload_date)
- view_count = self._search_regex(r'Views: ([\d,\.]+)', description, 'view count', fatal=False)
- if view_count:
- view_count = str_to_int(view_count)
- duration = parse_duration(self._search_regex(r'Length: (\d+m\d+s)', description, 'duration', fatal=False))
-
- token_url = "http://tkn.4tube.com/{0}/desktop/{1}".format(media_id, "+".join(sources))
+ token_url = 'http://tkn.4tube.com/{0}/desktop/{1}'.format(
+ media_id, '+'.join(sources))
headers = {
- b'Content-Type': b'application/x-www-form-urlencoded',
- b'Origin': b'http://www.4tube.com',
- }
+ b'Content-Type': b'application/x-www-form-urlencoded',
+ b'Origin': b'http://www.4tube.com',
+ }
token_req = compat_urllib_request.Request(token_url, b'{}', headers)
tokens = self._download_json(token_req, video_id)
-
formats = [{
'url': tokens[format]['token'],
'format_id': format + 'p',
'resolution': format + 'p',
'quality': int(format),
- } for format in sources]
-
+ } for format in sources]
self._sort_formats(formats)
return {
'id': video_id,
'title': title,
'formats': formats,
- 'thumbnail': thumbnail_url,
+ 'categories': categories,
+ 'thumbnail': thumbnail,
'uploader': uploader,
'uploader_id': uploader_id,
- 'upload_date': upload_date,
+ 'timestamp': timestamp,
+ 'like_count': like_count,
'view_count': view_count,
'duration': duration,
'age_limit': 18,
- 'webpage_url': webpage_url,
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/foxgay.py b/youtube_dl/extractor/foxgay.py
new file mode 100644
index 000000000..08b8ea362
--- /dev/null
+++ b/youtube_dl/extractor/foxgay.py
@@ -0,0 +1,48 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class FoxgayIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?foxgay\.com/videos/(?:\S+-)?(?P<id>\d+)\.shtml'
+ _TEST = {
+ 'url': 'http://foxgay.com/videos/fuck-turkish-style-2582.shtml',
+ 'md5': '80d72beab5d04e1655a56ad37afe6841',
+ 'info_dict': {
+ 'id': '2582',
+ 'ext': 'mp4',
+ 'title': 'md5:6122f7ae0fc6b21ebdf59c5e083ce25a',
+ 'description': 'md5:5e51dc4405f1fd315f7927daed2ce5cf',
+ 'age_limit': 18,
+ 'thumbnail': 're:https?://.*\.jpg$',
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._html_search_regex(
+ r'<title>(?P<title>.*?)</title>',
+ webpage, 'title', fatal=False)
+ description = self._html_search_regex(
+ r'<div class="ico_desc"><h2>(?P<description>.*?)</h2>',
+ webpage, 'description', fatal=False)
+
+ # Find the URL for the iFrame which contains the actual video.
+ iframe = self._download_webpage(
+ self._html_search_regex(r'iframe src="(?P<frame>.*?)"', webpage, 'video frame'),
+ video_id)
+ video_url = self._html_search_regex(
+ r"v_path = '(?P<vid>http://.*?)'", iframe, 'url')
+ thumb_url = self._html_search_regex(
+ r"t_path = '(?P<thumb>http://.*?)'", iframe, 'thumbnail', fatal=False)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'url': video_url,
+ 'description': description,
+ 'thumbnail': thumb_url,
+ 'age_limit': 18,
+ }
diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py
new file mode 100644
index 000000000..917f76b1e
--- /dev/null
+++ b/youtube_dl/extractor/foxnews.py
@@ -0,0 +1,94 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_iso8601,
+ int_or_none,
+)
+
+
+class FoxNewsIE(InfoExtractor):
+ _VALID_URL = r'https?://video\.foxnews\.com/v/(?:video-embed\.html\?video_id=)?(?P<id>\d+)'
+ _TESTS = [
+ {
+ 'url': 'http://video.foxnews.com/v/3937480/frozen-in-time/#sp=show-clips',
+ 'md5': '32aaded6ba3ef0d1c04e238d01031e5e',
+ 'info_dict': {
+ 'id': '3937480',
+ 'ext': 'flv',
+ 'title': 'Frozen in Time',
+ 'description': 'Doctors baffled by 16-year-old girl that is the size of a toddler',
+ 'duration': 265,
+ 'timestamp': 1304411491,
+ 'upload_date': '20110503',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ },
+ {
+ 'url': 'http://video.foxnews.com/v/3922535568001/rep-luis-gutierrez-on-if-obamas-immigration-plan-is-legal/#sp=show-clips',
+ 'md5': '5846c64a1ea05ec78175421b8323e2df',
+ 'info_dict': {
+ 'id': '3922535568001',
+ 'ext': 'mp4',
+ 'title': "Rep. Luis Gutierrez on if Obama's immigration plan is legal",
+ 'description': "Congressman discusses the president's executive action",
+ 'duration': 292,
+ 'timestamp': 1417662047,
+ 'upload_date': '20141204',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ },
+ {
+ 'url': 'http://video.foxnews.com/v/video-embed.html?video_id=3937480&d=video.foxnews.com',
+ 'only_matching': True,
+ },
+ ]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ video = self._download_json(
+ 'http://video.foxnews.com/v/feed/video/%s.js?template=fox' % video_id, video_id)
+
+ item = video['channel']['item']
+ title = item['title']
+ description = item['description']
+ timestamp = parse_iso8601(item['dc-date'])
+
+ media_group = item['media-group']
+ duration = None
+ formats = []
+ for media in media_group['media-content']:
+ attributes = media['@attributes']
+ video_url = attributes['url']
+ if video_url.endswith('.f4m'):
+ formats.extend(self._extract_f4m_formats(video_url + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', video_id))
+ elif video_url.endswith('.m3u8'):
+ formats.extend(self._extract_m3u8_formats(video_url, video_id, 'flv'))
+ elif not video_url.endswith('.smil'):
+ duration = int_or_none(attributes.get('duration'))
+ formats.append({
+ 'url': video_url,
+ 'format_id': media['media-category']['@attributes']['label'],
+ 'preference': 1,
+ 'vbr': int_or_none(attributes.get('bitrate')),
+ 'filesize': int_or_none(attributes.get('fileSize'))
+ })
+ self._sort_formats(formats)
+
+ media_thumbnail = media_group['media-thumbnail']['@attributes']
+ thumbnails = [{
+ 'url': media_thumbnail['url'],
+ 'width': int_or_none(media_thumbnail.get('width')),
+ 'height': int_or_none(media_thumbnail.get('height')),
+ }] if media_thumbnail else []
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'formats': formats,
+ 'thumbnails': thumbnails,
+ }
diff --git a/youtube_dl/extractor/franceculture.py b/youtube_dl/extractor/franceculture.py
index 898e0dda7..0c2972162 100644
--- a/youtube_dl/extractor/franceculture.py
+++ b/youtube_dl/extractor/franceculture.py
@@ -5,7 +5,7 @@ import json
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_parse_qs,
compat_urlparse,
)
diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py
index 0b3374d97..bbc760a49 100644
--- a/youtube_dl/extractor/francetv.py
+++ b/youtube_dl/extractor/francetv.py
@@ -6,13 +6,15 @@ import re
import json
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
+ compat_urllib_parse_urlparse,
compat_urlparse,
- ExtractorError,
+)
+from ..utils import (
clean_html,
- parse_duration,
- compat_urllib_parse_urlparse,
+ ExtractorError,
int_or_none,
+ parse_duration,
)
@@ -26,6 +28,19 @@ class FranceTVBaseInfoExtractor(InfoExtractor):
if info.get('status') == 'NOK':
raise ExtractorError(
'%s returned error: %s' % (self.IE_NAME, info['message']), expected=True)
+ allowed_countries = info['videos'][0].get('geoblocage')
+ if allowed_countries:
+ georestricted = True
+ geo_info = self._download_json(
+ 'http://geo.francetv.fr/ws/edgescape.json', video_id,
+ 'Downloading geo restriction info')
+ country = geo_info['reponse']['geo_info']['country_code']
+ if country not in allowed_countries:
+ raise ExtractorError(
+ 'The video is not available from your location',
+ expected=True)
+ else:
+ georestricted = False
formats = []
for video in info['videos']:
@@ -36,6 +51,10 @@ class FranceTVBaseInfoExtractor(InfoExtractor):
continue
format_id = video['format']
if video_url.endswith('.f4m'):
+ if georestricted:
+ # See https://github.com/rg3/youtube-dl/issues/3963
+ # m3u8 urls work fine
+ continue
video_url_parsed = compat_urllib_parse_urlparse(video_url)
f4m_url = self._download_webpage(
'http://hdfauth.francetv.fr/esi/urltokengen2.html?url=%s' % video_url_parsed.path,
@@ -46,7 +65,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor):
f4m_format['preference'] = 1
formats.extend(f4m_formats)
elif video_url.endswith('.m3u8'):
- formats.extend(self._extract_m3u8_formats(video_url, video_id))
+ formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4'))
elif video_url.startswith('rtmp'):
formats.append({
'url': video_url,
@@ -58,7 +77,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor):
formats.append({
'url': video_url,
'format_id': format_id,
- 'preference': 2,
+ 'preference': -1,
})
self._sort_formats(formats)
@@ -93,7 +112,6 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
_TESTS = [{
'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html',
- 'md5': '9cecf35f99c4079c199e9817882a9a1c',
'info_dict': {
'id': '84981923',
'ext': 'flv',
@@ -235,7 +253,7 @@ class GenerationQuoiIE(InfoExtractor):
info_json = self._download_webpage(info_url, name)
info = json.loads(info_json)
return self.url_result('http://www.dailymotion.com/video/%s' % info['id'],
- ie='Dailymotion')
+ ie='Dailymotion')
class CultureboxIE(FranceTVBaseInfoExtractor):
diff --git a/youtube_dl/extractor/freevideo.py b/youtube_dl/extractor/freevideo.py
new file mode 100644
index 000000000..f755e3c4a
--- /dev/null
+++ b/youtube_dl/extractor/freevideo.py
@@ -0,0 +1,38 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class FreeVideoIE(InfoExtractor):
+ _VALID_URL = r'^http://www.freevideo.cz/vase-videa/(?P<id>[^.]+)\.html(?:$|[?#])'
+
+ _TEST = {
+ 'url': 'http://www.freevideo.cz/vase-videa/vysukany-zadecek-22033.html',
+ 'info_dict': {
+ 'id': 'vysukany-zadecek-22033',
+ 'ext': 'mp4',
+ "title": "vysukany-zadecek-22033",
+ "age_limit": 18,
+ },
+ 'skip': 'Blocked outside .cz',
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage, handle = self._download_webpage_handle(url, video_id)
+ if '//www.czechav.com/' in handle.geturl():
+ raise ExtractorError(
+ 'Access to freevideo is blocked from your location',
+ expected=True)
+
+ video_url = self._search_regex(
+ r'\s+url: "(http://[a-z0-9-]+.cdn.freevideo.cz/stream/.*?/video.mp4)"',
+ webpage, 'video URL')
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': video_id,
+ 'age_limit': 18,
+ }
diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py
index d966e8403..a49fc1151 100644
--- a/youtube_dl/extractor/funnyordie.py
+++ b/youtube_dl/extractor/funnyordie.py
@@ -8,7 +8,7 @@ from ..utils import ExtractorError
class FunnyOrDieIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?funnyordie\.com/(?P<type>embed|videos)/(?P<id>[0-9a-f]+)(?:$|[?#/])'
+ _VALID_URL = r'https?://(?:www\.)?funnyordie\.com/(?P<type>embed|articles|videos)/(?P<id>[0-9a-f]+)(?:$|[?#/])'
_TESTS = [{
'url': 'http://www.funnyordie.com/videos/0732f586d7/heart-shaped-box-literal-video-version',
'md5': 'bcd81e0c4f26189ee09be362ad6e6ba9',
@@ -21,7 +21,6 @@ class FunnyOrDieIE(InfoExtractor):
},
}, {
'url': 'http://www.funnyordie.com/embed/e402820827',
- 'md5': '29f4c5e5a61ca39dfd7e8348a75d0aad',
'info_dict': {
'id': 'e402820827',
'ext': 'mp4',
@@ -29,6 +28,9 @@ class FunnyOrDieIE(InfoExtractor):
'description': 'Please use this to sell something. www.jonlajoie.com',
'thumbnail': 're:^http:.*\.jpg$',
},
+ }, {
+ 'url': 'http://www.funnyordie.com/articles/ebf5e34fc8/10-hours-of-walking-in-nyc-as-a-man',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -37,7 +39,7 @@ class FunnyOrDieIE(InfoExtractor):
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
- links = re.findall(r'<source src="([^"]+/v)\d+\.([^"]+)" type=\'video', webpage)
+ links = re.findall(r'<source src="([^"]+/v)[^"]+\.([^"]+)" type=\'video', webpage)
if not links:
raise ExtractorError('No media links available for %s' % video_id)
diff --git a/youtube_dl/extractor/gamekings.py b/youtube_dl/extractor/gamekings.py
index 11fee3d31..cf8e90d7d 100644
--- a/youtube_dl/extractor/gamekings.py
+++ b/youtube_dl/extractor/gamekings.py
@@ -11,7 +11,7 @@ class GamekingsIE(InfoExtractor):
'url': 'http://www.gamekings.tv/videos/phoenix-wright-ace-attorney-dual-destinies-review/',
# MD5 is flaky, seems to change regularly
# 'md5': '2f32b1f7b80fdc5cb616efb4f387f8a3',
- u'info_dict': {
+ 'info_dict': {
'id': '20130811',
'ext': 'mp4',
'title': 'Phoenix Wright: Ace Attorney \u2013 Dual Destinies Review',
diff --git a/youtube_dl/extractor/gameone.py b/youtube_dl/extractor/gameone.py
index 3022f539d..a07d69841 100644
--- a/youtube_dl/extractor/gameone.py
+++ b/youtube_dl/extractor/gameone.py
@@ -6,7 +6,9 @@ import re
from .common import InfoExtractor
from ..utils import (
xpath_with_ns,
- parse_iso8601
+ parse_iso8601,
+ float_or_none,
+ int_or_none,
)
NAMESPACE_MAP = {
@@ -21,25 +23,41 @@ RAW_MP4_URL = 'http://cdn.riptide-mtvn.com/'
class GameOneIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?gameone\.de/tv/(?P<id>\d+)'
- _TEST = {
- 'url': 'http://www.gameone.de/tv/288',
- 'md5': '136656b7fb4c9cb4a8e2d500651c499b',
- 'info_dict': {
- 'id': '288',
- 'ext': 'mp4',
- 'title': 'Game One - Folge 288',
- 'duration': 1238,
- 'thumbnail': 'http://s3.gameone.de/gameone/assets/video_metas/teaser_images/000/643/636/big/640x360.jpg',
- 'description': 'FIFA-Pressepokal 2014, Star Citizen, Kingdom Come: Deliverance, Project Cars, Schöner Trants Nerdquiz Folge 2 Runde 1',
- 'age_limit': 16,
- 'upload_date': '20140513',
- 'timestamp': 1399980122,
+ _TESTS = [
+ {
+ 'url': 'http://www.gameone.de/tv/288',
+ 'md5': '136656b7fb4c9cb4a8e2d500651c499b',
+ 'info_dict': {
+ 'id': '288',
+ 'ext': 'mp4',
+ 'title': 'Game One - Folge 288',
+ 'duration': 1238,
+ 'thumbnail': 'http://s3.gameone.de/gameone/assets/video_metas/teaser_images/000/643/636/big/640x360.jpg',
+ 'description': 'FIFA-Pressepokal 2014, Star Citizen, Kingdom Come: Deliverance, Project Cars, Schöner Trants Nerdquiz Folge 2 Runde 1',
+ 'age_limit': 16,
+ 'upload_date': '20140513',
+ 'timestamp': 1399980122,
+ }
+ },
+ {
+ 'url': 'http://gameone.de/tv/220',
+ 'md5': '5227ca74c4ae6b5f74c0510a7c48839e',
+ 'info_dict': {
+ 'id': '220',
+ 'ext': 'mp4',
+ 'upload_date': '20120918',
+ 'description': 'Jet Set Radio HD, Tekken Tag Tournament 2, Source Filmmaker',
+ 'timestamp': 1347971451,
+ 'title': 'Game One - Folge 220',
+ 'duration': 896.62,
+ 'age_limit': 16,
+ }
}
- }
+
+ ]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
og_video = self._og_search_video_url(webpage, secure=False)
@@ -66,13 +84,13 @@ class GameOneIE(InfoExtractor):
video_id,
'Downloading media:content')
rendition_items = content.findall('.//rendition')
- duration = int(rendition_items[0].get('duration'))
+ duration = float_or_none(rendition_items[0].get('duration'))
formats = [
{
'url': re.sub(r'.*/(r2)', RAW_MP4_URL + r'\1', r.find('./src').text),
- 'width': int(r.get('width')),
- 'height': int(r.get('height')),
- 'tbr': int(r.get('bitrate')),
+ 'width': int_or_none(r.get('width')),
+ 'height': int_or_none(r.get('height')),
+ 'tbr': int_or_none(r.get('bitrate')),
}
for r in rendition_items
]
@@ -105,7 +123,8 @@ class GameOnePlaylistIE(InfoExtractor):
webpage = self._download_webpage('http://www.gameone.de/tv', 'TV')
max_id = max(map(int, re.findall(r'<a href="/tv/(\d+)"', webpage)))
entries = [
- self.url_result('http://www.gameone.de/tv/%d' % video_id, 'GameOne')
+ self.url_result('http://www.gameone.de/tv/%d' %
+ video_id, 'GameOne')
for video_id in range(max_id, 0, -1)]
return {
diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py
index 3d67b9d60..47373e215 100644
--- a/youtube_dl/extractor/gamespot.py
+++ b/youtube_dl/extractor/gamespot.py
@@ -4,16 +4,17 @@ import re
import json
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urlparse,
+)
+from ..utils import (
unescapeHTML,
- get_meta_content,
)
class GameSpotIE(InfoExtractor):
- _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<page_id>\d+)/?'
+ _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?'
_TEST = {
'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/',
'md5': 'b2a30deaa8654fcccd43713a6b6a4825',
@@ -26,10 +27,10 @@ class GameSpotIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- page_id = mobj.group('page_id')
+ page_id = self._match_id(url)
webpage = self._download_webpage(url, page_id)
- data_video_json = self._search_regex(r'data-video=["\'](.*?)["\']', webpage, 'data video')
+ data_video_json = self._search_regex(
+ r'data-video=["\'](.*?)["\']', webpage, 'data video')
data_video = json.loads(unescapeHTML(data_video_json))
# Transform the manifest url to a link to the mp4 files
@@ -41,7 +42,8 @@ class GameSpotIE(InfoExtractor):
http_path = f4m_path[1:].split('/', 1)[1]
http_template = re.sub(QUALITIES_RE, r'%s', http_path)
http_template = http_template.replace('.csmil/manifest.f4m', '')
- http_template = compat_urlparse.urljoin('http://video.gamespotcdn.com/', http_template)
+ http_template = compat_urlparse.urljoin(
+ 'http://video.gamespotcdn.com/', http_template)
formats = []
for q in qualities:
formats.append({
@@ -52,8 +54,9 @@ class GameSpotIE(InfoExtractor):
return {
'id': data_video['guid'],
+ 'display_id': page_id,
'title': compat_urllib_parse.unquote(data_video['title']),
'formats': formats,
- 'description': get_meta_content('description', webpage),
+ 'description': self._html_search_meta('description', webpage),
'thumbnail': self._og_search_thumbnail(webpage),
}
diff --git a/youtube_dl/extractor/gamestar.py b/youtube_dl/extractor/gamestar.py
index 50f8fc7e7..7591a151e 100644
--- a/youtube_dl/extractor/gamestar.py
+++ b/youtube_dl/extractor/gamestar.py
@@ -1,8 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..utils import (
int_or_none,
@@ -29,9 +27,7 @@ class GameStarIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
og_title = self._og_search_title(webpage)
diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py
index de14ae1fb..fed968f51 100644
--- a/youtube_dl/extractor/gdcvault.py
+++ b/youtube_dl/extractor/gdcvault.py
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
)
@@ -39,7 +39,8 @@ class GDCVaultIE(InfoExtractor):
'id': '1015301',
'ext': 'flv',
'title': 'Thexder Meets Windows 95, or Writing Great Games in the Windows 95 Environment',
- }
+ },
+ 'skip': 'Requires login',
}
]
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 9057a6beb..a028c4ed4 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -7,21 +7,24 @@ import re
from .common import InfoExtractor
from .youtube import YoutubeIE
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urlparse,
compat_xml_parse_error,
-
+)
+from ..utils import (
determine_ext,
ExtractorError,
float_or_none,
HEADRequest,
+ is_html,
orderedSet,
parse_xml,
smuggle_url,
unescapeHTML,
unified_strdate,
unsmuggle_url,
+ UnsupportedError,
url_basename,
)
from .brightcove import BrightcoveIE
@@ -99,6 +102,22 @@ class GenericIE(InfoExtractor):
'uploader': 'Championat',
},
},
+ {
+ # https://github.com/rg3/youtube-dl/issues/3541
+ 'add_ie': ['Brightcove'],
+ 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
+ 'info_dict': {
+ 'id': '3866516442001',
+ 'ext': 'mp4',
+ 'title': 'Leer mij vrouwen kennen: Aflevering 1',
+ 'description': 'Leer mij vrouwen kennen: Aflevering 1',
+ 'uploader': 'SBS Broadcasting',
+ },
+ 'skip': 'Restricted to Netherlands',
+ 'params': {
+ 'skip_download': True, # m3u8 download
+ },
+ },
# Direct link to a video
{
'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
@@ -113,12 +132,13 @@ class GenericIE(InfoExtractor):
# ooyala video
{
'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
- 'md5': '5644c6ca5d5782c1d0d350dad9bd840c',
+ 'md5': '166dd577b433b4d4ebfee10b0824d8ff',
'info_dict': {
'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
'ext': 'mp4',
'title': '2cc213299525360.mov', # that's what we get
},
+ 'add_ie': ['Ooyala'],
},
# google redirect
{
@@ -128,7 +148,7 @@ class GenericIE(InfoExtractor):
'ext': 'mp4',
'upload_date': '20130224',
'uploader_id': 'TheVerge',
- 'description': 'Chris Ziegler takes a look at the Alcatel OneTouch Fire and the ZTE Open; two of the first Firefox OS handsets to be officially announced.',
+ 'description': 're:^Chris Ziegler takes a look at the\.*',
'uploader': 'The Verge',
'title': 'First Firefox OS phones side-by-side',
},
@@ -163,6 +183,14 @@ class GenericIE(InfoExtractor):
'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
},
},
+ # BBC iPlayer embeds
+ {
+ 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
+ 'info_dict': {
+ 'title': 'BBC - Blogs - Adam Curtis - BUGGER',
+ },
+ 'playlist_mincount': 18,
+ },
# RUTV embed
{
'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
@@ -325,7 +353,7 @@ class GenericIE(InfoExtractor):
'ext': 'mp4',
'age_limit': 18,
'uploader': 'www.handjobhub.com',
- 'title': 'Busty Blonde Siri Tit Fuck While Wank at Handjob Hub',
+ 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
}
},
# RSS feed
@@ -380,6 +408,97 @@ class GenericIE(InfoExtractor):
'uploader': 'education-portal.com',
},
},
+ {
+ 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
+ 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
+ 'info_dict': {
+ 'id': 'uxjb0lwrcz',
+ 'ext': 'mp4',
+ 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
+ 'duration': 1715.0,
+ 'uploader': 'thoughtworks.wistia.com',
+ },
+ },
+ # Direct download with broken HEAD
+ {
+ 'url': 'http://ai-radio.org:8000/radio.opus',
+ 'info_dict': {
+ 'id': 'radio',
+ 'ext': 'opus',
+ 'title': 'radio',
+ },
+ 'params': {
+ 'skip_download': True, # infinite live stream
+ },
+ 'expected_warnings': [
+ r'501.*Not Implemented'
+ ],
+ },
+ # Soundcloud embed
+ {
+ 'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
+ 'info_dict': {
+ 'id': '174391317',
+ 'ext': 'mp3',
+ 'description': 'md5:ff867d6b555488ad3c52572bb33d432c',
+ 'uploader': 'Sophos Security',
+ 'title': 'Chet Chat 171 - Oct 29, 2014',
+ 'upload_date': '20141029',
+ }
+ },
+ # Livestream embed
+ {
+ 'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
+ 'info_dict': {
+ 'id': '67864563',
+ 'ext': 'flv',
+ 'upload_date': '20141112',
+ 'title': 'Rosetta #CometLanding webcast HL 10',
+ }
+ },
+ # LazyYT
+ {
+ 'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
+ 'info_dict': {
+ 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
+ },
+ 'playlist_mincount': 2,
+ },
+ # Direct link with incorrect MIME type
+ {
+ 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
+ 'md5': '4ccbebe5f36706d85221f204d7eb5913',
+ 'info_dict': {
+ 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
+ 'id': '5_Lennart_Poettering_-_Systemd',
+ 'ext': 'webm',
+ 'title': '5_Lennart_Poettering_-_Systemd',
+ 'upload_date': '20141120',
+ },
+ 'expected_warnings': [
+ 'URL could be a direct video link, returning it as such.'
+ ]
+ },
+ # Cinchcast embed
+ {
+ 'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
+ 'info_dict': {
+ 'id': '7141703',
+ 'ext': 'mp3',
+ 'upload_date': '20141126',
+ 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
+ }
+ },
+ # Cinerama player
+ {
+ 'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm',
+ 'info_dict': {
+ 'id': '730m_DandD_1901_512k',
+ 'ext': 'mp4',
+ 'uploader': 'www.abc.net.au',
+ 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015',
+ }
+ }
]
def report_following_redirect(self, new_url):
@@ -472,11 +591,12 @@ class GenericIE(InfoExtractor):
if default_search in ('error', 'fixup_error'):
raise ExtractorError(
- ('%r is not a valid URL. '
- 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube'
- ) % (url, url), expected=True)
+ '%r is not a valid URL. '
+ 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube'
+ % (url, url), expected=True)
else:
- assert ':' in default_search
+ if ':' not in default_search:
+ default_search += ':'
return self.url_result(default_search + url)
url, smuggled_data = unsmuggle_url(url)
@@ -491,14 +611,14 @@ class GenericIE(InfoExtractor):
self.to_screen('%s: Requesting header' % video_id)
head_req = HEADRequest(url)
- response = self._request_webpage(
+ head_response = self._request_webpage(
head_req, video_id,
note=False, errnote='Could not send HEAD request to %s' % url,
fatal=False)
- if response is not False:
+ if head_response is not False:
# Check for redirect
- new_url = response.geturl()
+ new_url = head_response.geturl()
if url != new_url:
self.report_following_redirect(new_url)
if force_videoid:
@@ -506,33 +626,53 @@ class GenericIE(InfoExtractor):
new_url, {'force_videoid': force_videoid})
return self.url_result(new_url)
- # Check for direct link to a video
- content_type = response.headers.get('Content-Type', '')
- m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
- if m:
- upload_date = response.headers.get('Last-Modified')
- if upload_date:
- upload_date = unified_strdate(upload_date)
- return {
- 'id': video_id,
- 'title': os.path.splitext(url_basename(url))[0],
- 'formats': [{
- 'format_id': m.group('format_id'),
- 'url': url,
- 'vcodec': 'none' if m.group('type') == 'audio' else None
- }],
- 'upload_date': upload_date,
- }
+ full_response = None
+ if head_response is False:
+ full_response = self._request_webpage(url, video_id)
+ head_response = full_response
+
+ # Check for direct link to a video
+ content_type = head_response.headers.get('Content-Type', '')
+ m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
+ if m:
+ upload_date = unified_strdate(
+ head_response.headers.get('Last-Modified'))
+ return {
+ 'id': video_id,
+ 'title': os.path.splitext(url_basename(url))[0],
+ 'direct': True,
+ 'formats': [{
+ 'format_id': m.group('format_id'),
+ 'url': url,
+ 'vcodec': 'none' if m.group('type') == 'audio' else None
+ }],
+ 'upload_date': upload_date,
+ }
if not self._downloader.params.get('test', False) and not is_intentional:
self._downloader.report_warning('Falling back on generic information extractor.')
- try:
- webpage = self._download_webpage(url, video_id)
- except ValueError:
- # since this is the last-resort InfoExtractor, if
- # this error is thrown, it'll be thrown here
- raise ExtractorError('Failed to download URL: %s' % url)
+ if not full_response:
+ full_response = self._request_webpage(url, video_id)
+
+ # Maybe it's a direct link to a video?
+ # Be careful not to download the whole thing!
+ first_bytes = full_response.read(512)
+ if not is_html(first_bytes):
+ self._downloader.report_warning(
+ 'URL could be a direct video link, returning it as such.')
+ upload_date = unified_strdate(
+ head_response.headers.get('Last-Modified'))
+ return {
+ 'id': video_id,
+ 'title': os.path.splitext(url_basename(url))[0],
+ 'direct': True,
+ 'url': url,
+ 'upload_date': upload_date,
+ }
+
+ webpage = self._webpage_read_content(
+ full_response, url, video_id, prefix=first_bytes)
self.report_extraction(video_id)
@@ -579,9 +719,9 @@ class GenericIE(InfoExtractor):
r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
# Helper method
- def _playlist_from_matches(matches, getter, ie=None):
+ def _playlist_from_matches(matches, getter=None, ie=None):
urlrs = orderedSet(
- self.url_result(self._proto_relative_url(getter(m)), ie)
+ self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
for m in matches)
return self.playlist_result(
urlrs, playlist_id=video_id, playlist_title=video_title)
@@ -623,7 +763,8 @@ class GenericIE(InfoExtractor):
<iframe[^>]+?src=|
data-video-url=|
<embed[^>]+?src=|
- embedSWF\(?:\s*
+ embedSWF\(?:\s*|
+ new\s+SWFObject\(
)
(["\'])
(?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
@@ -633,6 +774,12 @@ class GenericIE(InfoExtractor):
return _playlist_from_matches(
matches, lambda m: unescapeHTML(m[1]))
+ # Look for lazyYT YouTube embed
+ matches = re.findall(
+ r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
+ if matches:
+ return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
+
# Look for embedded Dailymotion player
matches = re.findall(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
@@ -652,7 +799,7 @@ class GenericIE(InfoExtractor):
# Look for embedded Wistia player
match = re.search(
- r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
+ r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
if match:
embed_url = self._proto_relative_url(
unescapeHTML(match.group('url')))
@@ -664,6 +811,7 @@ class GenericIE(InfoExtractor):
'title': video_title,
'id': video_id,
}
+
match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
if match:
return {
@@ -678,7 +826,7 @@ class GenericIE(InfoExtractor):
# Look for embedded blip.tv player
mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)
if mobj:
- return self.url_result('http://blip.tv/a/a-'+mobj.group(1), 'BlipTV')
+ return self.url_result('http://blip.tv/a/a-' + mobj.group(1), 'BlipTV')
mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage)
if mobj:
return self.url_result(mobj.group(1), 'BlipTV')
@@ -714,7 +862,7 @@ class GenericIE(InfoExtractor):
# Look for Ooyala videos
mobj = (re.search(r'player.ooyala.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
- re.search(r'OO.Player.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage))
+ re.search(r'OO.Player.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage))
if mobj is not None:
return OoyalaIE._build_url_result(mobj.group('ec'))
@@ -777,6 +925,11 @@ class GenericIE(InfoExtractor):
return _playlist_from_matches(
matches, getter=unescapeHTML, ie='FunnyOrDie')
+ # Look for BBC iPlayer embed
+ matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
+ if matches:
+ return _playlist_from_matches(matches, ie='BBCCoUk')
+
# Look for embedded RUTV player
rutv_url = RUTVIE._extract_url(webpage)
if rutv_url:
@@ -784,7 +937,7 @@ class GenericIE(InfoExtractor):
# Look for embedded TED player
mobj = re.search(
- r'<iframe[^>]+?src=(["\'])(?P<url>http://embed\.ted\.com/.+?)\1', webpage)
+ r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'TED')
@@ -808,7 +961,7 @@ class GenericIE(InfoExtractor):
# Look for embeded soundcloud player
mobj = re.search(
- r'<iframe src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
+ r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
webpage)
if mobj is not None:
url = unescapeHTML(mobj.group('url'))
@@ -844,8 +997,15 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(mobj.group('url'), 'SBS')
+ # Look for embedded Cinchcast player
mobj = re.search(
- r'<iframe[^>]+?src=(["\'])(?P<url>https?://m\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
+ r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
+ webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'Cinchcast')
+
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'MLB')
@@ -856,6 +1016,12 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
+ mobj = re.search(
+ r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
+ webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'Livestream')
+
def check_video(vurl):
vpath = compat_urlparse.urlparse(vurl).path
vext = determine_ext(vpath)
@@ -891,6 +1057,10 @@ class GenericIE(InfoExtractor):
["']?url["']?\s*:\s*["']([^"']+)["']
''', webpage))
if not found:
+ # Cinerama player
+ found = re.findall(
+ r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
+ if not found:
# Try to find twitter cards info
found = filter_video(re.findall(
r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
@@ -903,7 +1073,7 @@ class GenericIE(InfoExtractor):
found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
if not found:
# HTML5 video
- found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]+)? src="([^"]+)"', webpage)
+ found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
if not found:
found = re.search(
r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
@@ -917,7 +1087,7 @@ class GenericIE(InfoExtractor):
'url': new_url,
}
if not found:
- raise ExtractorError('Unsupported URL: %s' % url)
+ raise UnsupportedError(url)
entries = []
for video_url in found:
@@ -949,4 +1119,3 @@ class GenericIE(InfoExtractor):
'_type': 'playlist',
'entries': entries,
}
-
diff --git a/youtube_dl/extractor/giantbomb.py b/youtube_dl/extractor/giantbomb.py
new file mode 100644
index 000000000..87cd19147
--- /dev/null
+++ b/youtube_dl/extractor/giantbomb.py
@@ -0,0 +1,81 @@
+from __future__ import unicode_literals
+
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+ unescapeHTML,
+ qualities,
+ int_or_none,
+)
+
+
+class GiantBombIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?giantbomb\.com/videos/(?P<display_id>[^/]+)/(?P<id>\d+-\d+)'
+ _TEST = {
+ 'url': 'http://www.giantbomb.com/videos/quick-look-destiny-the-dark-below/2300-9782/',
+ 'md5': '57badeface303ecf6b98b812de1b9018',
+ 'info_dict': {
+ 'id': '2300-9782',
+ 'display_id': 'quick-look-destiny-the-dark-below',
+ 'ext': 'mp4',
+ 'title': 'Quick Look: Destiny: The Dark Below',
+ 'description': 'md5:0aa3aaf2772a41b91d44c63f30dfad24',
+ 'duration': 2399,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ display_id = mobj.group('display_id')
+
+ webpage = self._download_webpage(url, display_id)
+
+ title = self._og_search_title(webpage)
+ description = self._og_search_description(webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
+
+ video = json.loads(unescapeHTML(self._search_regex(
+ r'data-video="([^"]+)"', webpage, 'data-video')))
+
+ duration = int_or_none(video.get('lengthSeconds'))
+
+ quality = qualities([
+ 'f4m_low', 'progressive_low', 'f4m_high',
+ 'progressive_high', 'f4m_hd', 'progressive_hd'])
+
+ formats = []
+ for format_id, video_url in video['videoStreams'].items():
+ if format_id == 'f4m_stream':
+ continue
+ if video_url.endswith('.f4m'):
+ f4m_formats = self._extract_f4m_formats(video_url + '?hdcore=3.3.1', display_id)
+ if f4m_formats:
+ f4m_formats[0]['quality'] = quality(format_id)
+ formats.extend(f4m_formats)
+ else:
+ formats.append({
+ 'url': video_url,
+ 'format_id': format_id,
+ 'quality': quality(format_id),
+ })
+
+ if not formats:
+ youtube_id = video.get('youtubeID')
+ if youtube_id:
+ return self.url_result(youtube_id, 'Youtube')
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/giga.py b/youtube_dl/extractor/giga.py
new file mode 100644
index 000000000..775890112
--- /dev/null
+++ b/youtube_dl/extractor/giga.py
@@ -0,0 +1,101 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+
+from .common import InfoExtractor
+from ..utils import (
+ qualities,
+ compat_str,
+ parse_duration,
+ parse_iso8601,
+ str_to_int,
+)
+
+
+class GigaIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?giga\.de/(?:[^/]+/)*(?P<id>[^/]+)'
+ _TESTS = [{
+ 'url': 'http://www.giga.de/filme/anime-awesome/trailer/anime-awesome-chihiros-reise-ins-zauberland-das-beste-kommt-zum-schluss/',
+ 'md5': '6bc5535e945e724640664632055a584f',
+ 'info_dict': {
+ 'id': '2622086',
+ 'display_id': 'anime-awesome-chihiros-reise-ins-zauberland-das-beste-kommt-zum-schluss',
+ 'ext': 'mp4',
+ 'title': 'Anime Awesome: Chihiros Reise ins Zauberland – Das Beste kommt zum Schluss',
+ 'description': 'md5:afdf5862241aded4718a30dff6a57baf',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 578,
+ 'timestamp': 1414749706,
+ 'upload_date': '20141031',
+ 'uploader': 'Robin Schweiger',
+ 'view_count': int,
+ },
+ }, {
+ 'url': 'http://www.giga.de/games/channel/giga-top-montag/giga-topmontag-die-besten-serien-2014/',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.giga.de/extra/netzkultur/videos/giga-games-tom-mats-robin-werden-eigene-wege-gehen-eine-ankuendigung/',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.giga.de/tv/jonas-liest-spieletitel-eingedeutscht-episode-2/',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+
+ video_id = self._search_regex(
+ [r'data-video-id="(\d+)"', r'/api/video/jwplayer/#v=(\d+)'],
+ webpage, 'video id')
+
+ playlist = self._download_json(
+ 'http://www.giga.de/api/syndication/video/video_id/%s/playlist.json?content=syndication/key/368b5f151da4ae05ced7fa296bdff65a/'
+ % video_id, video_id)[0]
+
+ quality = qualities(['normal', 'hd720'])
+
+ formats = []
+ for format_id in itertools.count(0):
+ fmt = playlist.get(compat_str(format_id))
+ if not fmt:
+ break
+ formats.append({
+ 'url': fmt['src'],
+ 'format_id': '%s-%s' % (fmt['quality'], fmt['type'].split('/')[-1]),
+ 'quality': quality(fmt['quality']),
+ })
+ self._sort_formats(formats)
+
+ title = self._html_search_meta(
+ 'title', webpage, 'title', fatal=True)
+ description = self._html_search_meta(
+ 'description', webpage, 'description')
+ thumbnail = self._og_search_thumbnail(webpage)
+
+ duration = parse_duration(self._search_regex(
+ r'(?s)(?:data-video-id="{0}"|data-video="[^"]*/api/video/jwplayer/#v={0}[^"]*")[^>]*>.+?<span class="duration">([^<]+)</span>'.format(video_id),
+ webpage, 'duration', fatal=False))
+
+ timestamp = parse_iso8601(self._search_regex(
+ r'datetime="([^"]+)"', webpage, 'upload date', fatal=False))
+ uploader = self._search_regex(
+ r'class="author">([^<]+)</a>', webpage, 'uploader', fatal=False)
+
+ view_count = str_to_int(self._search_regex(
+ r'<span class="views"><strong>([\d.]+)</strong>', webpage, 'view count', fatal=False))
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'uploader': uploader,
+ 'view_count': view_count,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/glide.py b/youtube_dl/extractor/glide.py
new file mode 100644
index 000000000..9561ed5fb
--- /dev/null
+++ b/youtube_dl/extractor/glide.py
@@ -0,0 +1,40 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class GlideIE(InfoExtractor):
+ IE_DESC = 'Glide mobile video messages (glide.me)'
+ _VALID_URL = r'https?://share\.glide\.me/(?P<id>[A-Za-z0-9\-=_+]+)'
+ _TEST = {
+ 'url': 'http://share.glide.me/UZF8zlmuQbe4mr+7dCiQ0w==',
+ 'md5': '4466372687352851af2d131cfaa8a4c7',
+ 'info_dict': {
+ 'id': 'UZF8zlmuQbe4mr+7dCiQ0w==',
+ 'ext': 'mp4',
+ 'title': 'Damon Timm\'s Glide message',
+ 'thumbnail': 're:^https?://.*?\.cloudfront\.net/.*\.jpg$',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ title = self._html_search_regex(
+ r'<title>(.*?)</title>', webpage, 'title')
+ video_url = self.http_scheme() + self._search_regex(
+ r'<source src="(.*?)" type="video/mp4">', webpage, 'video URL')
+ thumbnail_url = self._search_regex(
+ r'<img id="video-thumbnail" src="(.*?)"',
+ webpage, 'thumbnail url', fatal=False)
+ thumbnail = (
+ thumbnail_url if thumbnail_url is None
+ else self.http_scheme() + thumbnail_url)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'url': video_url,
+ 'thumbnail': thumbnail,
+ }
diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py
index 77c3ad4fc..6949a57c7 100644
--- a/youtube_dl/extractor/globo.py
+++ b/youtube_dl/extractor/globo.py
@@ -5,13 +5,15 @@ import random
import math
from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
- float_or_none,
+from ..compat import (
compat_str,
compat_chr,
compat_ord,
)
+from ..utils import (
+ ExtractorError,
+ float_or_none,
+)
class GloboIE(InfoExtractor):
@@ -395,4 +397,4 @@ class GloboIE(InfoExtractor):
'uploader_id': uploader_id,
'like_count': like_count,
'formats': formats
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/goldenmoustache.py b/youtube_dl/extractor/goldenmoustache.py
new file mode 100644
index 000000000..0fb509724
--- /dev/null
+++ b/youtube_dl/extractor/goldenmoustache.py
@@ -0,0 +1,48 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class GoldenMoustacheIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?goldenmoustache\.com/(?P<display_id>[\w-]+)-(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'http://www.goldenmoustache.com/suricate-le-poker-3700/',
+ 'md5': '0f904432fa07da5054d6c8beb5efb51a',
+ 'info_dict': {
+ 'id': '3700',
+ 'ext': 'mp4',
+ 'title': 'Suricate - Le Poker',
+ 'description': 'md5:3d1f242f44f8c8cb0a106f1fd08e5dc9',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ }
+ }, {
+ 'url': 'http://www.goldenmoustache.com/le-lab-tout-effacer-mc-fly-et-carlito-55249/',
+ 'md5': '27f0c50fb4dd5f01dc9082fc67cd5700',
+ 'info_dict': {
+ 'id': '55249',
+ 'ext': 'mp4',
+ 'title': 'Le LAB - Tout Effacer (Mc Fly et Carlito)',
+ 'description': 'md5:9b7fbf11023fb2250bd4b185e3de3b2a',
+ 'thumbnail': 're:^https?://.*\.(?:png|jpg)$',
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ video_url = self._html_search_regex(
+ r'data-src-type="mp4" data-src="([^"]+)"', webpage, 'video URL')
+ title = self._html_search_regex(
+ r'<title>(.*?)(?: - Golden Moustache)?</title>', webpage, 'title')
+ thumbnail = self._og_search_thumbnail(webpage)
+ description = self._og_search_description(webpage)
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'ext': 'mp4',
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ }
diff --git a/youtube_dl/extractor/golem.py b/youtube_dl/extractor/golem.py
index 53714f47f..2bfb99040 100644
--- a/youtube_dl/extractor/golem.py
+++ b/youtube_dl/extractor/golem.py
@@ -2,8 +2,10 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urlparse,
+)
+from ..utils import (
determine_ext,
)
diff --git a/youtube_dl/extractor/googlesearch.py b/youtube_dl/extractor/googlesearch.py
index 469e1f935..498304cb2 100644
--- a/youtube_dl/extractor/googlesearch.py
+++ b/youtube_dl/extractor/googlesearch.py
@@ -4,7 +4,7 @@ import itertools
import re
from .common import SearchInfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
)
diff --git a/youtube_dl/extractor/gorillavid.py b/youtube_dl/extractor/gorillavid.py
index 45cca1d24..ae24aff84 100644
--- a/youtube_dl/extractor/gorillavid.py
+++ b/youtube_dl/extractor/gorillavid.py
@@ -4,19 +4,21 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
- determine_ext,
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
)
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+)
class GorillaVidIE(InfoExtractor):
- IE_DESC = 'GorillaVid.in, daclips.in and movpod.in'
+ IE_DESC = 'GorillaVid.in, daclips.in, movpod.in and fastvideo.in'
_VALID_URL = r'''(?x)
https?://(?P<host>(?:www\.)?
- (?:daclips\.in|gorillavid\.in|movpod\.in))/
+ (?:daclips\.in|gorillavid\.in|movpod\.in|fastvideo\.in))/
(?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:-[0-9]+x[0-9]+\.html)?
'''
@@ -46,7 +48,17 @@ class GorillaVidIE(InfoExtractor):
'info_dict': {
'id': '3rso4kdn6f9m',
'ext': 'mp4',
- 'title': 'Micro Pig piglets ready on 16th July 2009',
+ 'title': 'Micro Pig piglets ready on 16th July 2009-bG0PdrCdxUc',
+ 'thumbnail': 're:http://.*\.jpg',
+ }
+ }, {
+ # video with countdown timeout
+ 'url': 'http://fastvideo.in/1qmdn1lmsmbw',
+ 'md5': '8b87ec3f6564a3108a0e8e66594842ba',
+ 'info_dict': {
+ 'id': '1qmdn1lmsmbw',
+ 'ext': 'mp4',
+ 'title': 'Man of Steel - Trailer',
'thumbnail': 're:http://.*\.jpg',
},
}, {
@@ -69,8 +81,14 @@ class GorillaVidIE(InfoExtractor):
(?:id="[^"]+"\s+)?
value="([^"]*)"
''', webpage))
-
+
if fields['op'] == 'download1':
+ countdown = int_or_none(self._search_regex(
+ r'<span id="countdown_str">(?:[Ww]ait)?\s*<span id="cxc">(\d+)</span>\s*(?:seconds?)?</span>',
+ webpage, 'countdown', default=None))
+ if countdown:
+ self._sleep(countdown, video_id)
+
post = compat_urllib_parse.urlencode(fields)
req = compat_urllib_request.Request(url, post)
@@ -78,14 +96,17 @@ class GorillaVidIE(InfoExtractor):
webpage = self._download_webpage(req, video_id, 'Downloading video page')
- title = self._search_regex(r'style="z-index: [0-9]+;">([^<]+)</span>', webpage, 'title')
- video_url = self._search_regex(r'file\s*:\s*\'(http[^\']+)\',', webpage, 'file url')
- thumbnail = self._search_regex(r'image\s*:\s*\'(http[^\']+)\',', webpage, 'thumbnail', fatal=False)
+ title = self._search_regex(
+ r'style="z-index: [0-9]+;">([^<]+)</span>',
+ webpage, 'title', default=None) or self._og_search_title(webpage)
+ video_url = self._search_regex(
+ r'file\s*:\s*["\'](http[^"\']+)["\'],', webpage, 'file url')
+ thumbnail = self._search_regex(
+ r'image\s*:\s*["\'](http[^"\']+)["\'],', webpage, 'thumbnail', fatal=False)
formats = [{
'format_id': 'sd',
'url': video_url,
- 'ext': determine_ext(video_url),
'quality': 1,
}]
diff --git a/youtube_dl/extractor/goshgay.py b/youtube_dl/extractor/goshgay.py
index 7bca21ad0..b116d251d 100644
--- a/youtube_dl/extractor/goshgay.py
+++ b/youtube_dl/extractor/goshgay.py
@@ -1,73 +1,53 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
+from ..compat import (
+ compat_parse_qs,
+)
from ..utils import (
- compat_urlparse,
- str_to_int,
- ExtractorError,
+ parse_duration,
)
-import json
class GoshgayIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)www.goshgay.com/video(?P<id>\d+?)($|/)'
+ _VALID_URL = r'https?://www\.goshgay\.com/video(?P<id>\d+?)($|/)'
_TEST = {
- 'url': 'http://www.goshgay.com/video4116282',
- 'md5': '268b9f3c3229105c57859e166dd72b03',
+ 'url': 'http://www.goshgay.com/video299069/diesel_sfw_xxx_video',
+ 'md5': '027fcc54459dff0feb0bc06a7aeda680',
'info_dict': {
- 'id': '4116282',
+ 'id': '299069',
'ext': 'flv',
- 'title': 'md5:089833a4790b5e103285a07337f245bf',
- 'thumbnail': 're:http://.*\.jpg',
+ 'title': 'DIESEL SFW XXX Video',
+ 'thumbnail': 're:^http://.*\.jpg$',
+ 'duration': 79,
'age_limit': 18,
}
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- title = self._search_regex(r'class="video-title"><h1>(.+?)<', webpage, 'title')
- player_config = self._search_regex(
- r'(?s)jwplayer\("player"\)\.setup\(({.+?})\)', webpage, 'config settings')
- player_vars = json.loads(player_config.replace("'", '"'))
- width = str_to_int(player_vars.get('width'))
- height = str_to_int(player_vars.get('height'))
- config_uri = player_vars.get('config')
+ title = self._html_search_regex(
+ r'<h2>(.*?)<', webpage, 'title')
+ duration = parse_duration(self._html_search_regex(
+ r'<span class="duration">\s*-?\s*(.*?)</span>',
+ webpage, 'duration', fatal=False))
+ family_friendly = self._html_search_meta(
+ 'isFamilyFriendly', webpage, default='false')
- if config_uri is None:
- raise ExtractorError('Missing config URI')
- node = self._download_xml(config_uri, video_id, 'Downloading player config XML',
- errnote='Unable to download XML')
- if node is None:
- raise ExtractorError('Missing config XML')
- if node.tag != 'config':
- raise ExtractorError('Missing config attribute')
- fns = node.findall('file')
- imgs = node.findall('image')
- if len(fns) != 1:
- raise ExtractorError('Missing media URI')
- video_url = fns[0].text
- if len(imgs) < 1:
- thumbnail = None
- else:
- thumbnail = imgs[0].text
-
- url_comp = compat_urlparse.urlparse(url)
- ref = "%s://%s%s" % (url_comp[0], url_comp[1], url_comp[2])
+ flashvars = compat_parse_qs(self._html_search_regex(
+ r'<embed.+?id="flash-player-embed".+?flashvars="([^"]+)"',
+ webpage, 'flashvars'))
+ thumbnail = flashvars.get('url_bigthumb', [None])[0]
+ video_url = flashvars['flv_url'][0]
return {
'id': video_id,
'url': video_url,
'title': title,
- 'width': width,
- 'height': height,
'thumbnail': thumbnail,
- 'http_referer': ref,
- 'age_limit': 18,
+ 'duration': duration,
+ 'age_limit': 0 if family_friendly == 'true' else 18,
}
diff --git a/youtube_dl/extractor/grooveshark.py b/youtube_dl/extractor/grooveshark.py
index 726adff77..fff74a70a 100644
--- a/youtube_dl/extractor/grooveshark.py
+++ b/youtube_dl/extractor/grooveshark.py
@@ -8,12 +8,13 @@ import re
from .common import InfoExtractor
-from ..utils import ExtractorError, compat_urllib_request, compat_html_parser
-
-from ..utils import (
+from ..compat import (
+ compat_html_parser,
compat_urllib_parse,
+ compat_urllib_request,
compat_urlparse,
)
+from ..utils import ExtractorError
class GroovesharkHtmlParser(compat_html_parser.HTMLParser):
diff --git a/youtube_dl/extractor/groupon.py b/youtube_dl/extractor/groupon.py
new file mode 100644
index 000000000..8b9e0e2f8
--- /dev/null
+++ b/youtube_dl/extractor/groupon.py
@@ -0,0 +1,50 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class GrouponIE(InfoExtractor):
+ _VALID_URL = r'https?://www\.groupon\.com/deals/(?P<id>[^?#]+)'
+
+ _TEST = {
+ 'url': 'https://www.groupon.com/deals/bikram-yoga-huntington-beach-2#ooid=tubGNycTo_9Uxg82uESj4i61EYX8nyuf',
+ 'info_dict': {
+ 'id': 'bikram-yoga-huntington-beach-2',
+ 'title': '$49 for 10 Yoga Classes or One Month of Unlimited Classes at Bikram Yoga Huntington Beach ($180 Value)',
+ 'description': 'Studio kept at 105 degrees and 40% humidity with anti-microbial and anti-slip Flotex flooring; certified instructors',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': 'tubGNycTo_9Uxg82uESj4i61EYX8nyuf',
+ 'ext': 'mp4',
+ 'title': 'Bikram Yoga Huntington Beach | Orange County',
+ },
+ }],
+ 'params': {
+ 'skip_download': 'HLS',
+ }
+ }
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+ webpage = self._download_webpage(url, playlist_id)
+
+ payload = self._parse_json(self._search_regex(
+ r'var\s+payload\s*=\s*(.*?);\n', webpage, 'payload'), playlist_id)
+ videos = payload['carousel'].get('dealVideos', [])
+ entries = []
+ for v in videos:
+ if v.get('provider') != 'OOYALA':
+ self.report_warning(
+ '%s: Unsupported video provider %s, skipping video' %
+ (playlist_id, v.get('provider')))
+ continue
+ entries.append(self.url_result('ooyala:%s' % v['media']))
+
+ return {
+ '_type': 'playlist',
+ 'id': playlist_id,
+ 'entries': entries,
+ 'title': self._og_search_title(webpage),
+ 'description': self._og_search_description(webpage),
+ }
diff --git a/youtube_dl/extractor/hark.py b/youtube_dl/extractor/hark.py
index 5bdd08afa..b6cc15b6f 100644
--- a/youtube_dl/extractor/hark.py
+++ b/youtube_dl/extractor/hark.py
@@ -1,37 +1,33 @@
# -*- coding: utf-8 -*-
-
-import re
-import json
+from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import determine_ext
+
class HarkIE(InfoExtractor):
- _VALID_URL = r'https?://www\.hark\.com/clips/(.+?)-.+'
+ _VALID_URL = r'https?://www\.hark\.com/clips/(?P<id>.+?)-.+'
_TEST = {
- u'url': u'http://www.hark.com/clips/mmbzyhkgny-obama-beyond-the-afghan-theater-we-only-target-al-qaeda-on-may-23-2013',
- u'file': u'mmbzyhkgny.mp3',
- u'md5': u'6783a58491b47b92c7c1af5a77d4cbee',
- u'info_dict': {
- u'title': u"Obama: 'Beyond The Afghan Theater, We Only Target Al Qaeda' on May 23, 2013",
- u'description': u'President Barack Obama addressed the nation live on May 23, 2013 in a speech aimed at addressing counter-terrorism policies including the use of drone strikes, detainees at Guantanamo Bay prison facility, and American citizens who are terrorists.',
- u'duration': 11,
+ 'url': 'http://www.hark.com/clips/mmbzyhkgny-obama-beyond-the-afghan-theater-we-only-target-al-qaeda-on-may-23-2013',
+ 'md5': '6783a58491b47b92c7c1af5a77d4cbee',
+ 'info_dict': {
+ 'id': 'mmbzyhkgny',
+ 'ext': 'mp3',
+ 'title': 'Obama: \'Beyond The Afghan Theater, We Only Target Al Qaeda\' on May 23, 2013',
+ 'description': 'President Barack Obama addressed the nation live on May 23, 2013 in a speech aimed at addressing counter-terrorism policies including the use of drone strikes, detainees at Guantanamo Bay prison facility, and American citizens who are terrorists.',
+ 'duration': 11,
}
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group(1)
- json_url = "http://www.hark.com/clips/%s.json" %(video_id)
- info_json = self._download_webpage(json_url, video_id)
- info = json.loads(info_json)
- final_url = info['url']
+ video_id = self._match_id(url)
+ data = self._download_json(
+ 'http://www.hark.com/clips/%s.json' % video_id, video_id)
- return {'id': video_id,
- 'url' : final_url,
- 'title': info['name'],
- 'ext': determine_ext(final_url),
- 'description': info['description'],
- 'thumbnail': info['image_original'],
- 'duration': info['duration'],
- }
+ return {
+ 'id': video_id,
+ 'url': data['url'],
+ 'title': data['name'],
+ 'description': data.get('description'),
+ 'thumbnail': data.get('image_original'),
+ 'duration': data.get('duration'),
+ }
diff --git a/youtube_dl/extractor/hearthisat.py b/youtube_dl/extractor/hearthisat.py
new file mode 100644
index 000000000..a19b31ac0
--- /dev/null
+++ b/youtube_dl/extractor/hearthisat.py
@@ -0,0 +1,117 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_request,
+ compat_urlparse,
+)
+from ..utils import (
+ HEADRequest,
+ str_to_int,
+ urlencode_postdata,
+ urlhandle_detect_ext,
+)
+
+
+class HearThisAtIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?hearthis\.at/(?P<artist>[^/]+)/(?P<title>[A-Za-z0-9\-]+)/?$'
+ _PLAYLIST_URL = 'https://hearthis.at/playlist.php'
+ _TEST = {
+ 'url': 'https://hearthis.at/moofi/dr-kreep',
+ 'md5': 'ab6ec33c8fed6556029337c7885eb4e0',
+ 'info_dict': {
+ 'id': '150939',
+ 'ext': 'wav',
+ 'title': 'Moofi - Dr. Kreep',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'timestamp': 1421564134,
+ 'description': 'Creepy Patch. Mutable Instruments Braids Vowel + Formant Mode.',
+ 'upload_date': '20150118',
+ 'comment_count': int,
+ 'view_count': int,
+ 'like_count': int,
+ 'duration': 71,
+ 'categories': ['Experimental'],
+ }
+ }
+
+ def _real_extract(self, url):
+ m = re.match(self._VALID_URL, url)
+ display_id = '{artist:s} - {title:s}'.format(**m.groupdict())
+
+ webpage = self._download_webpage(url, display_id)
+ track_id = self._search_regex(
+ r'intTrackId\s*=\s*(\d+)', webpage, 'track ID')
+
+ payload = urlencode_postdata({'tracks[]': track_id})
+ req = compat_urllib_request.Request(self._PLAYLIST_URL, payload)
+ req.add_header('Content-type', 'application/x-www-form-urlencoded')
+
+ track = self._download_json(req, track_id, 'Downloading playlist')[0]
+ title = '{artist:s} - {title:s}'.format(**track)
+
+ categories = None
+ if track.get('category'):
+ categories = [track['category']]
+
+ description = self._og_search_description(webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
+
+ meta_span = r'<span[^>]+class="%s".*?</i>([^<]+)</span>'
+ view_count = str_to_int(self._search_regex(
+ meta_span % 'plays_count', webpage, 'view count', fatal=False))
+ like_count = str_to_int(self._search_regex(
+ meta_span % 'likes_count', webpage, 'like count', fatal=False))
+ comment_count = str_to_int(self._search_regex(
+ meta_span % 'comment_count', webpage, 'comment count', fatal=False))
+ duration = str_to_int(self._search_regex(
+ r'data-length="(\d+)', webpage, 'duration', fatal=False))
+ timestamp = str_to_int(self._search_regex(
+ r'<span[^>]+class="calctime"[^>]+data-time="(\d+)', webpage, 'timestamp', fatal=False))
+
+ formats = []
+ mp3_url = self._search_regex(
+ r'(?s)<a class="player-link"\s+(?:[a-zA-Z0-9_:-]+="[^"]+"\s+)*?data-mp3="([^"]+)"',
+ webpage, 'mp3 URL', fatal=False)
+ if mp3_url:
+ formats.append({
+ 'format_id': 'mp3',
+ 'vcodec': 'none',
+ 'acodec': 'mp3',
+ 'url': mp3_url,
+ })
+ download_path = self._search_regex(
+ r'<a class="[^"]*download_fct[^"]*"\s+href="([^"]+)"',
+ webpage, 'download URL', default=None)
+ if download_path:
+ download_url = compat_urlparse.urljoin(url, download_path)
+ ext_req = HEADRequest(download_url)
+ ext_handle = self._request_webpage(
+ ext_req, display_id, note='Determining extension')
+ ext = urlhandle_detect_ext(ext_handle)
+ formats.append({
+ 'format_id': 'download',
+ 'vcodec': 'none',
+ 'ext': ext,
+ 'url': download_url,
+ 'preference': 2, # Usually better quality
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': track_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': thumbnail,
+ 'description': description,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'view_count': view_count,
+ 'comment_count': comment_count,
+ 'like_count': like_count,
+ 'categories': categories,
+ }
diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py
index f97b1e085..278d9f527 100644
--- a/youtube_dl/extractor/heise.py
+++ b/youtube_dl/extractor/heise.py
@@ -3,7 +3,8 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
- get_meta_content,
+ determine_ext,
+ int_or_none,
parse_iso8601,
)
@@ -24,57 +25,54 @@ class HeiseIE(InfoExtractor):
'title': (
"Podcast: c't uplink 3.3 – Owncloud / Tastaturen / Peilsender Smartphone"
),
- 'format_id': 'mp4_720',
+ 'format_id': 'mp4_720p',
'timestamp': 1411812600,
'upload_date': '20140927',
'description': 'In uplink-Episode 3.3 geht es darum, wie man sich von Cloud-Anbietern emanzipieren kann, worauf man beim Kauf einer Tastatur achten sollte und was Smartphones über uns verraten.',
+ 'thumbnail': 're:^https?://.*\.jpe?g$',
}
}
def _real_extract(self, url):
video_id = self._match_id(url)
-
webpage = self._download_webpage(url, video_id)
- json_url = self._search_regex(
- r'json_url:\s*"([^"]+)"', webpage, 'json URL')
- config = self._download_json(json_url, video_id)
+
+ container_id = self._search_regex(
+ r'<div class="videoplayerjw".*?data-container="([0-9]+)"',
+ webpage, 'container ID')
+ sequenz_id = self._search_regex(
+ r'<div class="videoplayerjw".*?data-sequenz="([0-9]+)"',
+ webpage, 'sequenz ID')
+ data_url = 'http://www.heise.de/videout/feed?container=%s&sequenz=%s' % (container_id, sequenz_id)
+ doc = self._download_xml(data_url, video_id)
info = {
'id': video_id,
- 'thumbnail': config.get('poster'),
- 'timestamp': parse_iso8601(get_meta_content('date', webpage)),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'timestamp': parse_iso8601(
+ self._html_search_meta('date', webpage)),
'description': self._og_search_description(webpage),
}
- title = get_meta_content('fulltitle', webpage)
+ title = self._html_search_meta('fulltitle', webpage)
if title:
info['title'] = title
- elif config.get('title'):
- info['title'] = config['title']
else:
info['title'] = self._og_search_title(webpage)
formats = []
- for t, rs in config['formats'].items():
- if not rs or not hasattr(rs, 'items'):
- self._downloader.report_warning(
- 'formats: {0}: no resolutions'.format(t))
- continue
-
- for height_str, obj in rs.items():
- format_id = '{0}_{1}'.format(t, height_str)
-
- if not obj or not obj.get('url'):
- self._downloader.report_warning(
- 'formats: {0}: no url'.format(format_id))
- continue
-
- formats.append({
- 'url': obj['url'],
- 'format_id': format_id,
- 'height': self._int(height_str, 'height'),
- })
-
+ for source_node in doc.findall('.//{http://rss.jwpcdn.com/}source'):
+ label = source_node.attrib['label']
+ height = int_or_none(self._search_regex(
+ r'^(.*?_)?([0-9]+)p$', label, 'height', default=None))
+ video_url = source_node.attrib['file']
+ ext = determine_ext(video_url, '')
+ formats.append({
+ 'url': video_url,
+ 'format_note': label,
+ 'format_id': '%s_%s' % (ext, label),
+ 'height': height,
+ })
self._sort_formats(formats)
info['formats'] = formats
diff --git a/youtube_dl/extractor/hellporno.py b/youtube_dl/extractor/hellporno.py
new file mode 100644
index 000000000..7a1c75b65
--- /dev/null
+++ b/youtube_dl/extractor/hellporno.py
@@ -0,0 +1,71 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ js_to_json,
+ remove_end,
+)
+
+
+class HellPornoIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?hellporno\.com/videos/(?P<id>[^/]+)'
+ _TEST = {
+ 'url': 'http://hellporno.com/videos/dixie-is-posing-with-naked-ass-very-erotic/',
+ 'md5': '1fee339c610d2049699ef2aa699439f1',
+ 'info_dict': {
+ 'id': '149116',
+ 'display_id': 'dixie-is-posing-with-naked-ass-very-erotic',
+ 'ext': 'mp4',
+ 'title': 'Dixie is posing with naked ass very erotic',
+ 'thumbnail': 're:https?://.*\.jpg$',
+ 'age_limit': 18,
+ }
+ }
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+
+ title = remove_end(self._html_search_regex(
+ r'<title>([^<]+)</title>', webpage, 'title'), ' - Hell Porno')
+
+ flashvars = self._parse_json(self._search_regex(
+ r'var\s+flashvars\s*=\s*({.+?});', webpage, 'flashvars'),
+ display_id, transform_source=js_to_json)
+
+ video_id = flashvars.get('video_id')
+ thumbnail = flashvars.get('preview_url')
+ ext = flashvars.get('postfix', '.mp4')[1:]
+
+ formats = []
+ for video_url_key in ['video_url', 'video_alt_url']:
+ video_url = flashvars.get(video_url_key)
+ if not video_url:
+ continue
+ video_text = flashvars.get('%s_text' % video_url_key)
+ fmt = {
+ 'url': video_url,
+ 'ext': ext,
+ 'format_id': video_text,
+ }
+ m = re.search(r'^(?P<height>\d+)[pP]', video_text)
+ if m:
+ fmt['height'] = int(m.group('height'))
+ formats.append(fmt)
+ self._sort_formats(formats)
+
+ categories = self._html_search_meta(
+ 'keywords', webpage, 'categories', default='').split(',')
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'categories': categories,
+ 'age_limit': 18,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/helsinki.py b/youtube_dl/extractor/helsinki.py
index 5268efa49..93107b306 100644
--- a/youtube_dl/extractor/helsinki.py
+++ b/youtube_dl/extractor/helsinki.py
@@ -2,9 +2,8 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
+from ..utils import js_to_json
class HelsinkiIE(InfoExtractor):
@@ -24,39 +23,21 @@ class HelsinkiIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- formats = []
-
- mobj = re.search(r'file=((\w+):[^&]+)', webpage)
- if mobj:
- formats.append({
- 'ext': mobj.group(2),
- 'play_path': mobj.group(1),
- 'url': 'rtmp://flashvideo.it.helsinki.fi/vod/',
- 'player_url': 'http://video.helsinki.fi/player.swf',
- 'format_note': 'sd',
- 'quality': 0,
- })
-
- mobj = re.search(r'hd\.file=((\w+):[^&]+)', webpage)
- if mobj:
- formats.append({
- 'ext': mobj.group(2),
- 'play_path': mobj.group(1),
- 'url': 'rtmp://flashvideo.it.helsinki.fi/vod/',
- 'player_url': 'http://video.helsinki.fi/player.swf',
- 'format_note': 'hd',
- 'quality': 1,
- })
+ params = self._parse_json(self._html_search_regex(
+ r'(?s)jwplayer\("player"\).setup\((\{.*?\})\);',
+ webpage, 'player code'), video_id, transform_source=js_to_json)
+ formats = [{
+ 'url': s['file'],
+ 'ext': 'mp4',
+ } for s in params['sources']]
self._sort_formats(formats)
return {
'id': video_id,
'title': self._og_search_title(webpage).replace('Video: ', ''),
'description': self._og_search_description(webpage),
- 'thumbnail': self._og_search_thumbnail(webpage),
'formats': formats,
}
diff --git a/youtube_dl/extractor/hitbox.py b/youtube_dl/extractor/hitbox.py
new file mode 100644
index 000000000..84bd7c080
--- /dev/null
+++ b/youtube_dl/extractor/hitbox.py
@@ -0,0 +1,166 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ parse_iso8601,
+ float_or_none,
+ int_or_none,
+ compat_str,
+)
+
+
+class HitboxIE(InfoExtractor):
+ IE_NAME = 'hitbox'
+ _VALID_URL = r'https?://(?:www\.)?hitbox\.tv/video/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://www.hitbox.tv/video/203213',
+ 'info_dict': {
+ 'id': '203213',
+ 'title': 'hitbox @ gamescom, Sub Button Hype extended, Giveaway - hitbox News Update with Oxy',
+ 'alt_title': 'hitboxlive - Aug 9th #6',
+ 'description': '',
+ 'ext': 'mp4',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 215.1666,
+ 'resolution': 'HD 720p',
+ 'uploader': 'hitboxlive',
+ 'view_count': int,
+ 'timestamp': 1407576133,
+ 'upload_date': '20140809',
+ 'categories': ['Live Show'],
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }
+
+ def _extract_metadata(self, url, video_id):
+ thumb_base = 'https://edge.sf.hitbox.tv'
+ metadata = self._download_json(
+ '%s/%s' % (url, video_id), video_id)
+
+ date = 'media_live_since'
+ media_type = 'livestream'
+ if metadata.get('media_type') == 'video':
+ media_type = 'video'
+ date = 'media_date_added'
+
+ video_meta = metadata.get(media_type, [])[0]
+ title = video_meta.get('media_status')
+ alt_title = video_meta.get('media_title')
+ description = clean_html(
+ video_meta.get('media_description') or
+ video_meta.get('media_description_md'))
+ duration = float_or_none(video_meta.get('media_duration'))
+ uploader = video_meta.get('media_user_name')
+ views = int_or_none(video_meta.get('media_views'))
+ timestamp = parse_iso8601(video_meta.get(date), ' ')
+ categories = [video_meta.get('category_name')]
+ thumbs = [
+ {'url': thumb_base + video_meta.get('media_thumbnail'),
+ 'width': 320,
+ 'height': 180},
+ {'url': thumb_base + video_meta.get('media_thumbnail_large'),
+ 'width': 768,
+ 'height': 432},
+ ]
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'alt_title': alt_title,
+ 'description': description,
+ 'ext': 'mp4',
+ 'thumbnails': thumbs,
+ 'duration': duration,
+ 'uploader': uploader,
+ 'view_count': views,
+ 'timestamp': timestamp,
+ 'categories': categories,
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ metadata = self._extract_metadata(
+ 'https://www.hitbox.tv/api/media/video',
+ video_id)
+
+ player_config = self._download_json(
+ 'https://www.hitbox.tv/api/player/config/video/%s' % video_id,
+ video_id)
+
+ clip = player_config.get('clip')
+ video_url = clip.get('url')
+ res = clip.get('bitrates', [])[0].get('label')
+
+ metadata['resolution'] = res
+ metadata['url'] = video_url
+ metadata['protocol'] = 'm3u8'
+
+ return metadata
+
+
+class HitboxLiveIE(HitboxIE):
+ IE_NAME = 'hitbox:live'
+ _VALID_URL = r'https?://(?:www\.)?hitbox\.tv/(?!video)(?P<id>.+)'
+ _TEST = {
+ 'url': 'http://www.hitbox.tv/dimak',
+ 'info_dict': {
+ 'id': 'dimak',
+ 'ext': 'mp4',
+ 'description': 'md5:c9f80fa4410bc588d7faa40003fc7d0e',
+ 'timestamp': int,
+ 'upload_date': compat_str,
+ 'title': compat_str,
+ 'uploader': 'Dimak',
+ },
+ 'params': {
+ # live
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ metadata = self._extract_metadata(
+ 'https://www.hitbox.tv/api/media/live',
+ video_id)
+
+ player_config = self._download_json(
+ 'https://www.hitbox.tv/api/player/config/live/%s' % video_id,
+ video_id)
+
+ formats = []
+ cdns = player_config.get('cdns')
+ servers = []
+ for cdn in cdns:
+ base_url = cdn.get('netConnectionUrl')
+ host = re.search('.+\.([^\.]+\.[^\./]+)/.+', base_url).group(1)
+ if base_url not in servers:
+ servers.append(base_url)
+ for stream in cdn.get('bitrates'):
+ label = stream.get('label')
+ if label != 'Auto':
+ formats.append({
+ 'url': '%s/%s' % (base_url, stream.get('url')),
+ 'ext': 'mp4',
+ 'vbr': stream.get('bitrate'),
+ 'resolution': label,
+ 'rtmp_live': True,
+ 'format_note': host,
+ 'page_url': url,
+ 'player_url': 'http://www.hitbox.tv/static/player/flowplayer/flowplayer.commercial-3.2.16.swf',
+ })
+
+ self._sort_formats(formats)
+ metadata['formats'] = formats
+ metadata['is_live'] = True
+ metadata['title'] = self._live_title(metadata.get('title'))
+ return metadata
diff --git a/youtube_dl/extractor/hornbunny.py b/youtube_dl/extractor/hornbunny.py
index 7e7714438..5b6efb27e 100644
--- a/youtube_dl/extractor/hornbunny.py
+++ b/youtube_dl/extractor/hornbunny.py
@@ -37,7 +37,7 @@ class HornBunnyIE(InfoExtractor):
webpage2 = self._download_webpage(redirect_url, video_id)
video_url = self._html_search_regex(
r'flvMask:(.*?);', webpage2, 'video_url')
-
+
duration = parse_duration(self._search_regex(
r'<strong>Runtime:</strong>\s*([0-9:]+)</div>',
webpage, 'duration', fatal=False))
diff --git a/youtube_dl/extractor/hostingbulk.py b/youtube_dl/extractor/hostingbulk.py
index 8e812b669..704d0285d 100644
--- a/youtube_dl/extractor/hostingbulk.py
+++ b/youtube_dl/extractor/hostingbulk.py
@@ -4,9 +4,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_request,
+)
from ..utils import (
ExtractorError,
- compat_urllib_request,
int_or_none,
urlencode_postdata,
)
@@ -30,9 +32,7 @@ class HostingBulkIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
url = 'http://hostingbulk.com/{0:}.html'.format(video_id)
# Custom request with cookie to set language to English, so our file
diff --git a/youtube_dl/extractor/hotnewhiphop.py b/youtube_dl/extractor/hotnewhiphop.py
index 80b48b1b3..651784b73 100644
--- a/youtube_dl/extractor/hotnewhiphop.py
+++ b/youtube_dl/extractor/hotnewhiphop.py
@@ -1,12 +1,13 @@
from __future__ import unicode_literals
-import re
import base64
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
+)
+from ..utils import (
ExtractorError,
HEADRequest,
)
@@ -16,25 +17,24 @@ class HotNewHipHopIE(InfoExtractor):
_VALID_URL = r'http://www\.hotnewhiphop\.com/.*\.(?P<id>.*)\.html'
_TEST = {
'url': 'http://www.hotnewhiphop.com/freddie-gibbs-lay-it-down-song.1435540.html',
- 'file': '1435540.mp3',
'md5': '2c2cd2f76ef11a9b3b581e8b232f3d96',
'info_dict': {
+ 'id': '1435540',
+ 'ext': 'mp3',
'title': 'Freddie Gibbs - Lay It Down'
}
}
def _real_extract(self, url):
- m = re.match(self._VALID_URL, url)
- video_id = m.group('id')
-
- webpage_src = self._download_webpage(url, video_id)
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
video_url_base64 = self._search_regex(
- r'data-path="(.*?)"', webpage_src, u'video URL', fatal=False)
+ r'data-path="(.*?)"', webpage, 'video URL', default=None)
if video_url_base64 is None:
video_url = self._search_regex(
- r'"contentUrl" content="(.*?)"', webpage_src, u'video URL')
+ r'"contentUrl" content="(.*?)"', webpage, 'content URL')
return self.url_result(video_url, ie='Youtube')
reqdata = compat_urllib_parse.urlencode([
@@ -59,11 +59,11 @@ class HotNewHipHopIE(InfoExtractor):
if video_url.endswith('.html'):
raise ExtractorError('Redirect failed')
- video_title = self._og_search_title(webpage_src).strip()
+ video_title = self._og_search_title(webpage).strip()
return {
'id': video_id,
'url': video_url,
'title': video_title,
- 'thumbnail': self._og_search_thumbnail(webpage_src),
+ 'thumbnail': self._og_search_thumbnail(webpage),
}
diff --git a/youtube_dl/extractor/howcast.py b/youtube_dl/extractor/howcast.py
index 6ae04782c..3f7d6666c 100644
--- a/youtube_dl/extractor/howcast.py
+++ b/youtube_dl/extractor/howcast.py
@@ -13,7 +13,7 @@ class HowcastIE(InfoExtractor):
'info_dict': {
'id': '390161',
'ext': 'mp4',
- 'description': 'The square knot, also known as the reef knot, is one of the oldest, most basic knots to tie, and can be used in many different ways. Here\'s the proper way to tie a square knot.',
+ 'description': 'The square knot, also known as the reef knot, is one of the oldest, most basic knots to tie, and can be used in many different ways. Here\'s the proper way to tie a square knot.',
'title': 'How to Tie a Square Knot Properly',
}
}
@@ -27,10 +27,10 @@ class HowcastIE(InfoExtractor):
self.report_extraction(video_id)
video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
- webpage, 'video URL')
+ webpage, 'video URL')
video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
- webpage, 'description', fatal=False)
+ webpage, 'description', fatal=False)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/howstuffworks.py b/youtube_dl/extractor/howstuffworks.py
index fccc23884..e97339121 100644
--- a/youtube_dl/extractor/howstuffworks.py
+++ b/youtube_dl/extractor/howstuffworks.py
@@ -1,12 +1,12 @@
from __future__ import unicode_literals
-import re
-import json
-import random
-import string
-
from .common import InfoExtractor
-from ..utils import find_xpath_attr
+from ..utils import (
+ find_xpath_attr,
+ int_or_none,
+ js_to_json,
+ unescapeHTML,
+)
class HowStuffWorksIE(InfoExtractor):
@@ -16,98 +16,74 @@ class HowStuffWorksIE(InfoExtractor):
'url': 'http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm',
'info_dict': {
'id': '450221',
- 'display_id': 'cool-jobs-iditarod-musher',
'ext': 'flv',
'title': 'Cool Jobs - Iditarod Musher',
- 'description': 'md5:82bb58438a88027b8186a1fccb365f90',
+ 'description': 'Cold sleds, freezing temps and warm dog breath... an Iditarod musher\'s dream. Kasey-Dee Gardner jumps on a sled to find out what the big deal is.',
+ 'display_id': 'cool-jobs-iditarod-musher',
'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 161,
},
- 'params': {
- # md5 is not consistent
- 'skip_download': True
- }
},
{
'url': 'http://adventure.howstuffworks.com/7199-survival-zone-food-and-water-in-the-savanna-video.htm',
'info_dict': {
'id': '453464',
- 'display_id': 'survival-zone-food-and-water-in-the-savanna',
'ext': 'mp4',
'title': 'Survival Zone: Food and Water In the Savanna',
- 'description': 'md5:7e1c89f6411434970c15fa094170c371',
+ 'description': 'Learn how to find both food and water while trekking in the African savannah. In this video from the Discovery Channel.',
+ 'display_id': 'survival-zone-food-and-water-in-the-savanna',
'thumbnail': 're:^https?://.*\.jpg$',
},
- 'params': {
- # md5 is not consistent
- 'skip_download': True
- }
},
{
'url': 'http://entertainment.howstuffworks.com/arts/2706-sword-swallowing-1-by-dan-meyer-video.htm',
'info_dict': {
'id': '440011',
- 'display_id': 'sword-swallowing-1-by-dan-meyer',
'ext': 'flv',
'title': 'Sword Swallowing #1 by Dan Meyer',
- 'description': 'md5:b2409e88172913e2e7d3d1159b0ef735',
+ 'description': 'Video footage (1 of 3) used by permission of the owner Dan Meyer through Sword Swallowers Association International <www.swordswallow.org>',
+ 'display_id': 'sword-swallowing-1-by-dan-meyer',
'thumbnail': 're:^https?://.*\.jpg$',
},
- 'params': {
- # md5 is not consistent
- 'skip_download': True
- }
},
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- display_id = mobj.group('id')
+ display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
+ clip_js = self._search_regex(
+ r'(?s)var clip = ({.*?});', webpage, 'clip info')
+ clip_info = self._parse_json(
+ clip_js, display_id, transform_source=js_to_json)
- content_id = self._search_regex(r'var siteSectionId="(\d+)";', webpage, 'content id')
-
- mp4 = self._search_regex(
- r'''(?xs)var\s+clip\s*=\s*{\s*
- .+?\s*
- content_id\s*:\s*%s\s*,\s*
- .+?\s*
- mp4\s*:\s*\[(.*?),?\]\s*
- };\s*
- videoData\.push\(clip\);''' % content_id,
- webpage, 'mp4', fatal=False, default=None)
-
- smil = self._download_xml(
- 'http://services.media.howstuffworks.com/videos/%s/smil-service.smil' % content_id,
- content_id, 'Downloading video SMIL')
-
- http_base = find_xpath_attr(
- smil,
- './{0}head/{0}meta'.format('{http://www.w3.org/2001/SMIL20/Language}'),
- 'name',
- 'httpBase').get('content')
-
- def random_string(str_len=0):
- return ''.join([random.choice(string.ascii_uppercase) for _ in range(str_len)])
-
- URL_SUFFIX = '?v=2.11.3&fp=LNX 11,2,202,356&r=%s&g=%s' % (random_string(5), random_string(12))
-
+ video_id = clip_info['content_id']
formats = []
+ m3u8_url = clip_info.get('m3u8')
+ if m3u8_url:
+ formats += self._extract_m3u8_formats(m3u8_url, video_id, 'mp4')
+ for video in clip_info.get('mp4', []):
+ formats.append({
+ 'url': video['src'],
+ 'format_id': video['bitrate'],
+ 'vbr': int(video['bitrate'].rstrip('k')),
+ })
+
+ if not formats:
+ smil = self._download_xml(
+ 'http://services.media.howstuffworks.com/videos/%s/smil-service.smil' % video_id,
+ video_id, 'Downloading video SMIL')
+
+ http_base = find_xpath_attr(
+ smil,
+ './{0}head/{0}meta'.format('{http://www.w3.org/2001/SMIL20/Language}'),
+ 'name',
+ 'httpBase').get('content')
+
+ URL_SUFFIX = '?v=2.11.3&fp=LNX 11,2,202,356&r=A&g=A'
- if mp4:
- for video in json.loads('[%s]' % mp4):
- bitrate = video['bitrate']
- fmt = {
- 'url': video['src'].replace('http://pmd.video.howstuffworks.com', http_base) + URL_SUFFIX,
- 'format_id': bitrate,
- }
- m = re.search(r'(?P<vbr>\d+)[Kk]', bitrate)
- if m:
- fmt['vbr'] = int(m.group('vbr'))
- formats.append(fmt)
- else:
for video in smil.findall(
- './/{0}body/{0}switch/{0}video'.format('{http://www.w3.org/2001/SMIL20/Language}')):
- vbr = int(video.attrib['system-bitrate']) / 1000
+ './{0}body/{0}switch/{0}video'.format('{http://www.w3.org/2001/SMIL20/Language}')):
+ vbr = int_or_none(video.attrib['system-bitrate'], scale=1000)
formats.append({
'url': '%s/%s%s' % (http_base, video.attrib['src'], URL_SUFFIX),
'format_id': '%dk' % vbr,
@@ -116,19 +92,12 @@ class HowStuffWorksIE(InfoExtractor):
self._sort_formats(formats)
- title = self._og_search_title(webpage)
- TITLE_SUFFIX = ' : HowStuffWorks'
- if title.endswith(TITLE_SUFFIX):
- title = title[:-len(TITLE_SUFFIX)]
-
- description = self._og_search_description(webpage)
- thumbnail = self._og_search_thumbnail(webpage)
-
return {
- 'id': content_id,
+ 'id': '%s' % video_id,
'display_id': display_id,
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
+ 'title': unescapeHTML(clip_info['clip_title']),
+ 'description': unescapeHTML(clip_info.get('caption')),
+ 'thumbnail': clip_info.get('video_still_url'),
+ 'duration': clip_info.get('duration'),
'formats': formats,
}
diff --git a/youtube_dl/extractor/huffpost.py b/youtube_dl/extractor/huffpost.py
index 4ccf6b9b8..a38eae421 100644
--- a/youtube_dl/extractor/huffpost.py
+++ b/youtube_dl/extractor/huffpost.py
@@ -39,8 +39,9 @@ class HuffPostIE(InfoExtractor):
data = self._download_json(api_url, video_id)['data']
video_title = data['title']
- duration = parse_duration(data['running_time'])
- upload_date = unified_strdate(data['schedule']['starts_at'])
+ duration = parse_duration(data.get('running_time'))
+ upload_date = unified_strdate(
+ data.get('schedule', {}).get('starts_at') or data.get('segment_start_date_time'))
description = data.get('description')
thumbnails = []
@@ -59,16 +60,11 @@ class HuffPostIE(InfoExtractor):
'ext': 'mp4',
'url': url,
'vcodec': 'none' if key.startswith('audio/') else None,
- } for key, url in data['sources']['live'].items()]
- if data.get('fivemin_id'):
- fid = data['fivemin_id']
- fcat = str(int(fid) // 100 + 1)
- furl = 'http://avideos.5min.com/2/' + fcat[-3:] + '/' + fcat + '/' + fid + '.mp4'
- formats.append({
- 'format': 'fivemin',
- 'url': furl,
- 'preference': 1,
- })
+ } for key, url in data.get('sources', {}).get('live', {}).items()]
+
+ if not formats and data.get('fivemin_id'):
+ return self.url_result('5min:%s' % data['fivemin_id'])
+
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/hypem.py b/youtube_dl/extractor/hypem.py
index 6d0d847c6..aa0724a02 100644
--- a/youtube_dl/extractor/hypem.py
+++ b/youtube_dl/extractor/hypem.py
@@ -1,20 +1,20 @@
from __future__ import unicode_literals
import json
-import re
import time
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
-
+)
+from ..utils import (
ExtractorError,
)
class HypemIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
+ _VALID_URL = r'http://(?:www\.)?hypem\.com/track/(?P<id>[^/]+)/'
_TEST = {
'url': 'http://hypem.com/track/1v6ga/BODYWORK+-+TAME',
'md5': 'b9cc91b5af8995e9f0c1cee04c575828',
@@ -27,8 +27,7 @@ class HypemIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- track_id = mobj.group(1)
+ track_id = self._match_id(url)
data = {'ax': 1, 'ts': time.time()}
data_encoded = compat_urllib_parse.urlencode(data)
diff --git a/youtube_dl/extractor/iconosquare.py b/youtube_dl/extractor/iconosquare.py
index 1d5a10a3b..370e86e5a 100644
--- a/youtube_dl/extractor/iconosquare.py
+++ b/youtube_dl/extractor/iconosquare.py
@@ -1,7 +1,5 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
@@ -20,13 +18,11 @@ class IconosquareIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- html_title = self._html_search_regex(
- r'<title>(.+?)</title>',
+ title = self._html_search_regex(
+ r'<title>(.+?)(?: *\(Videos?\))? \| (?:Iconosquare|Statigram)</title>',
webpage, 'title')
- title = re.sub(r'(?: *\(Videos?\))? \| (?:Iconosquare|Statigram)$', '', html_title)
uploader_id = self._html_search_regex(
r'@([^ ]+)', title, 'uploader name', fatal=False)
diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py
index c80185b53..3db668cd0 100644
--- a/youtube_dl/extractor/ign.py
+++ b/youtube_dl/extractor/ign.py
@@ -63,8 +63,10 @@ class IGNIE(InfoExtractor):
'id': '078fdd005f6d3c02f63d795faa1b984f',
'ext': 'mp4',
'title': 'Rewind Theater - Wild Trailer Gamescom 2014',
- 'description': 'Giant skeletons, bloody hunts, and captivating'
- ' natural beauty take our breath away.',
+ 'description': (
+ 'Giant skeletons, bloody hunts, and captivating'
+ ' natural beauty take our breath away.'
+ ),
},
},
]
@@ -99,7 +101,7 @@ class IGNIE(InfoExtractor):
video_id = self._find_video_id(webpage)
result = self._get_video_info(video_id)
description = self._html_search_regex(self._DESCRIPTION_RE,
- webpage, 'video description', flags=re.DOTALL)
+ webpage, 'video description', flags=re.DOTALL)
result['description'] = description
return result
diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py
index 4536db3bf..f29df36b5 100644
--- a/youtube_dl/extractor/imdb.py
+++ b/youtube_dl/extractor/imdb.py
@@ -4,9 +4,8 @@ import re
import json
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urlparse,
- get_element_by_attribute,
)
@@ -17,7 +16,6 @@ class ImdbIE(InfoExtractor):
_TEST = {
'url': 'http://www.imdb.com/video/imdb/vi2524815897',
- 'md5': '9f34fa777ade3a6e57a054fdbcb3a068',
'info_dict': {
'id': '2524815897',
'ext': 'mp4',
@@ -27,10 +25,11 @@ class ImdbIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage('http://www.imdb.com/video/imdb/vi%s' % video_id, video_id)
- descr = get_element_by_attribute('itemprop', 'description', webpage)
+ descr = self._html_search_regex(
+ r'(?s)<span itemprop="description">(.*?)</span>',
+ webpage, 'description', fatal=False)
available_formats = re.findall(
r'case \'(?P<f_id>.*?)\' :$\s+url = \'(?P<path>.*?)\'', webpage,
flags=re.MULTILINE)
@@ -71,11 +70,9 @@ class ImdbListIE(InfoExtractor):
},
'playlist_count': 7,
}
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- list_id = mobj.group('id')
+ def _real_extract(self, url):
+ list_id = self._match_id(url)
webpage = self._download_webpage(url, list_id)
entries = [
self.url_result('http://www.imdb.com' + m, 'Imdb')
diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py
index e76dd222d..f25f43664 100644
--- a/youtube_dl/extractor/infoq.py
+++ b/youtube_dl/extractor/infoq.py
@@ -1,10 +1,9 @@
from __future__ import unicode_literals
import base64
-import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
)
@@ -24,9 +23,7 @@ class InfoQIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
video_title = self._html_search_regex(r'<title>(.*?)</title>', webpage, 'title')
diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py
index 5109f26ce..b020e2621 100644
--- a/youtube_dl/extractor/instagram.py
+++ b/youtube_dl/extractor/instagram.py
@@ -27,9 +27,9 @@ class InstagramIE(InfoExtractor):
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
uploader_id = self._search_regex(r'"owner":{"username":"(.+?)"',
- webpage, 'uploader id', fatal=False)
+ webpage, 'uploader id', fatal=False)
desc = self._search_regex(r'"caption":"(.*?)"', webpage, 'description',
- fatal=False)
+ fatal=False)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py
index 53f9a5f75..483cc6f9e 100644
--- a/youtube_dl/extractor/internetvideoarchive.py
+++ b/youtube_dl/extractor/internetvideoarchive.py
@@ -3,9 +3,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urlparse,
compat_urllib_parse,
+)
+from ..utils import (
xpath_with_ns,
)
@@ -20,7 +22,7 @@ class InternetVideoArchiveIE(InfoExtractor):
'ext': 'mp4',
'title': 'SKYFALL',
'description': 'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.',
- 'duration': 149,
+ 'duration': 152,
},
}
@@ -32,7 +34,7 @@ class InternetVideoArchiveIE(InfoExtractor):
def _clean_query(query):
NEEDED_ARGS = ['publishedid', 'customerid']
query_dic = compat_urlparse.parse_qs(query)
- cleaned_dic = dict((k,v[0]) for (k,v) in query_dic.items() if k in NEEDED_ARGS)
+ cleaned_dic = dict((k, v[0]) for (k, v) in query_dic.items() if k in NEEDED_ARGS)
# Other player ids return m3u8 urls
cleaned_dic['playerid'] = '247'
cleaned_dic['videokbrate'] = '100000'
@@ -45,22 +47,26 @@ class InternetVideoArchiveIE(InfoExtractor):
url = self._build_url(query)
flashconfiguration = self._download_xml(url, video_id,
- 'Downloading flash configuration')
+ 'Downloading flash configuration')
file_url = flashconfiguration.find('file').text
file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx')
# Replace some of the parameters in the query to get the best quality
# and http links (no m3u8 manifests)
file_url = re.sub(r'(?<=\?)(.+)$',
- lambda m: self._clean_query(m.group()),
- file_url)
+ lambda m: self._clean_query(m.group()),
+ file_url)
info = self._download_xml(file_url, video_id,
- 'Downloading video info')
+ 'Downloading video info')
item = info.find('channel/item')
def _bp(p):
- return xpath_with_ns(p,
- {'media': 'http://search.yahoo.com/mrss/',
- 'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats'})
+ return xpath_with_ns(
+ p,
+ {
+ 'media': 'http://search.yahoo.com/mrss/',
+ 'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats',
+ }
+ )
formats = []
for content in item.findall(_bp('media:group/media:content')):
attr = content.attrib
diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py
index d1defd363..8529bedfc 100644
--- a/youtube_dl/extractor/iprima.py
+++ b/youtube_dl/extractor/iprima.py
@@ -6,8 +6,10 @@ from random import random
from math import floor
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_request,
+)
+from ..utils import (
ExtractorError,
)
@@ -54,7 +56,7 @@ class IPrimaIE(InfoExtractor):
player_url = (
'http://embed.livebox.cz/iprimaplay/player-embed-v2.js?__tok%s__=%s' %
- (floor(random()*1073741824), floor(random()*1073741824))
+ (floor(random() * 1073741824), floor(random() * 1073741824))
)
req = compat_urllib_request.Request(player_url)
diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py
index 75b543b7c..7a400323d 100644
--- a/youtube_dl/extractor/ivi.py
+++ b/youtube_dl/extractor/ivi.py
@@ -5,8 +5,10 @@ import re
import json
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_request,
+)
+from ..utils import (
ExtractorError,
)
@@ -43,7 +45,7 @@ class IviIE(InfoExtractor):
'thumbnail': 'http://thumbs.ivi.ru/f15.vcp.digitalaccess.ru/contents/8/4/0068dc0677041f3336b7c2baad8fc0.jpg',
},
'skip': 'Only works from Russia',
- }
+ }
]
# Sorted by quality
@@ -102,7 +104,7 @@ class IviIE(InfoExtractor):
compilation = result['compilation']
title = result['title']
- title = '%s - %s' % (compilation, title) if compilation is not None else title
+ title = '%s - %s' % (compilation, title) if compilation is not None else title
previews = result['preview']
previews.sort(key=lambda fmt: self._known_thumbnails.index(fmt['content_format']))
@@ -152,17 +154,17 @@ class IviCompilationIE(InfoExtractor):
compilation_id = mobj.group('compilationid')
season_id = mobj.group('seasonid')
- if season_id is not None: # Season link
+ if season_id is not None: # Season link
season_page = self._download_webpage(url, compilation_id, 'Downloading season %s web page' % season_id)
playlist_id = '%s/season%s' % (compilation_id, season_id)
playlist_title = self._html_search_meta('title', season_page, 'title')
entries = self._extract_entries(season_page, compilation_id)
- else: # Compilation link
+ else: # Compilation link
compilation_page = self._download_webpage(url, compilation_id, 'Downloading compilation web page')
playlist_id = compilation_id
playlist_title = self._html_search_meta('title', compilation_page, 'title')
seasons = re.findall(r'<a href="/watch/%s/season(\d+)">[^<]+</a>' % compilation_id, compilation_page)
- if len(seasons) == 0: # No seasons in this compilation
+ if len(seasons) == 0: # No seasons in this compilation
entries = self._extract_entries(compilation_page, compilation_id)
else:
entries = []
@@ -172,4 +174,4 @@ class IviCompilationIE(InfoExtractor):
compilation_id, 'Downloading season %s web page' % season_id)
entries.extend(self._extract_entries(season_page, compilation_id))
- return self.playlist_result(entries, playlist_id, playlist_title) \ No newline at end of file
+ return self.playlist_result(entries, playlist_id, playlist_title)
diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py
index 07ef682ee..d16d483ee 100644
--- a/youtube_dl/extractor/izlesene.py
+++ b/youtube_dl/extractor/izlesene.py
@@ -5,11 +5,11 @@ import re
from .common import InfoExtractor
from ..utils import (
- get_element_by_id,
- parse_iso8601,
determine_ext,
- int_or_none,
float_or_none,
+ get_element_by_id,
+ int_or_none,
+ parse_iso8601,
str_to_int,
)
@@ -30,7 +30,7 @@ class IzleseneIE(InfoExtractor):
'description': 'md5:253753e2655dde93f59f74b572454f6d',
'thumbnail': 're:^http://.*\.jpg',
'uploader_id': 'pelikzzle',
- 'timestamp': 1404298698,
+ 'timestamp': 1404302298,
'upload_date': '20140702',
'duration': 95.395,
'age_limit': 0,
@@ -46,7 +46,7 @@ class IzleseneIE(InfoExtractor):
'description': 'Tarkan Dortmund 2006 Konseri',
'thumbnail': 're:^http://.*\.jpg',
'uploader_id': 'parlayankiz',
- 'timestamp': 1163318593,
+ 'timestamp': 1163322193,
'upload_date': '20061112',
'duration': 253.666,
'age_limit': 0,
@@ -55,10 +55,9 @@ class IzleseneIE(InfoExtractor):
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- url = 'http://www.izlesene.com/video/%s' % video_id
+ video_id = self._match_id(url)
+ url = 'http://www.izlesene.com/video/%s' % video_id
webpage = self._download_webpage(url, video_id)
title = self._og_search_title(webpage)
diff --git a/youtube_dl/extractor/jadorecettepub.py b/youtube_dl/extractor/jadorecettepub.py
index ace08769b..063e86de4 100644
--- a/youtube_dl/extractor/jadorecettepub.py
+++ b/youtube_dl/extractor/jadorecettepub.py
@@ -45,4 +45,3 @@ class JadoreCettePubIE(InfoExtractor):
'title': title,
'description': description,
}
-
diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py
index 188165966..8094cc2e4 100644
--- a/youtube_dl/extractor/jeuxvideo.py
+++ b/youtube_dl/extractor/jeuxvideo.py
@@ -29,7 +29,7 @@ class JeuxVideoIE(InfoExtractor):
xml_link = self._html_search_regex(
r'<param name="flashvars" value="config=(.*?)" />',
webpage, 'config URL')
-
+
video_id = self._search_regex(
r'http://www\.jeuxvideo\.com/config/\w+/\d+/(.*?)/\d+_player\.xml',
xml_link, 'video ID')
@@ -38,7 +38,7 @@ class JeuxVideoIE(InfoExtractor):
xml_link, title, 'Downloading XML config')
info_json = config.find('format.json').text
info = json.loads(info_json)['versions'][0]
-
+
video_url = 'http://video720.jeuxvideo.com/' + info['file']
return {
diff --git a/youtube_dl/extractor/jukebox.py b/youtube_dl/extractor/jukebox.py
index 5aa32bf09..da8068efc 100644
--- a/youtube_dl/extractor/jukebox.py
+++ b/youtube_dl/extractor/jukebox.py
@@ -36,7 +36,7 @@ class JukeboxIE(InfoExtractor):
try:
video_url = self._search_regex(r'"config":{"file":"(?P<video_url>http:[^"]+\?mdtk=[0-9]+)"',
- iframe_html, 'video url')
+ iframe_html, 'video url')
video_url = unescapeHTML(video_url).replace('\/', '/')
except RegexNotFoundError:
youtube_url = self._search_regex(
@@ -47,9 +47,9 @@ class JukeboxIE(InfoExtractor):
return self.url_result(youtube_url, ie='Youtube')
title = self._html_search_regex(r'<h1 class="inline">([^<]+)</h1>',
- html, 'title')
+ html, 'title')
artist = self._html_search_regex(r'<span id="infos_article_artist">([^<]+)</span>',
- html, 'artist')
+ html, 'artist')
return {
'id': video_id,
diff --git a/youtube_dl/extractor/kankan.py b/youtube_dl/extractor/kankan.py
index 23103b163..dbfe4cc03 100644
--- a/youtube_dl/extractor/kankan.py
+++ b/youtube_dl/extractor/kankan.py
@@ -10,7 +10,7 @@ _md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest()
class KankanIE(InfoExtractor):
_VALID_URL = r'https?://(?:.*?\.)?kankan\.com/.+?/(?P<id>\d+)\.shtml'
-
+
_TEST = {
'url': 'http://yinyue.kankan.com/vod/48/48863.shtml',
'file': '48863.flv',
diff --git a/youtube_dl/extractor/karaoketv.py b/youtube_dl/extractor/karaoketv.py
new file mode 100644
index 000000000..e3b43ff8d
--- /dev/null
+++ b/youtube_dl/extractor/karaoketv.py
@@ -0,0 +1,40 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse
+from ..utils import (
+ js_to_json,
+)
+
+
+class KaraoketvIE(InfoExtractor):
+ _VALID_URL = r'http://karaoketv\.co\.il/\?container=songs&id=(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://karaoketv.co.il/?container=songs&id=171568',
+ 'info_dict': {
+ 'id': '171568',
+ 'ext': 'mp4',
+ 'title': 'אל העולם שלך - רותם כהן - שרים קריוקי',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ page_video_url = self._og_search_video_url(webpage, video_id)
+ config_json = compat_urllib_parse.unquote_plus(self._search_regex(
+ r'config=(.*)', page_video_url, 'configuration'))
+
+ urls_info_json = self._download_json(
+ config_json, video_id, 'Downloading configuration',
+ transform_source=js_to_json)
+
+ url = urls_info_json['playlist'][0]['url']
+
+ return {
+ 'id': video_id,
+ 'title': self._og_search_title(webpage),
+ 'url': url,
+ }
diff --git a/youtube_dl/extractor/keek.py b/youtube_dl/extractor/keek.py
index 5d679e88d..c0956ba09 100644
--- a/youtube_dl/extractor/keek.py
+++ b/youtube_dl/extractor/keek.py
@@ -1,34 +1,39 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
class KeekIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
+ _VALID_URL = r'https?://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<id>\w+)'
IE_NAME = 'keek'
_TEST = {
'url': 'https://www.keek.com/ytdl/keeks/NODfbab',
- 'file': 'NODfbab.mp4',
- 'md5': '9b0636f8c0f7614afa4ea5e4c6e57e83',
+ 'md5': '09c5c109067536c1cec8bac8c21fea05',
'info_dict': {
- 'uploader': 'ytdl',
+ 'id': 'NODfbab',
+ 'ext': 'mp4',
+ 'uploader': 'youtube-dl project',
+ 'uploader_id': 'ytdl',
'title': 'test chars: "\'/\\\u00e4<>This is a test video for youtube-dl.For more information, contact phihag@phihag.de .',
},
}
def _real_extract(self, url):
- m = re.match(self._VALID_URL, url)
- video_id = m.group('videoID')
+ video_id = self._match_id(url)
video_url = 'http://cdn.keek.com/keek/video/%s' % video_id
thumbnail = 'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
webpage = self._download_webpage(url, video_id)
- uploader = self._html_search_regex(
- r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
- webpage, 'uploader', fatal=False)
+ raw_desc = self._html_search_meta('description', webpage)
+ if raw_desc:
+ uploader = self._html_search_regex(
+ r'Watch (.*?)\s+\(', raw_desc, 'uploader', fatal=False)
+ uploader_id = self._html_search_regex(
+ r'Watch .*?\(@(.+?)\)', raw_desc, 'uploader_id', fatal=False)
+ else:
+ uploader = None
+ uploader_id = None
return {
'id': video_id,
@@ -36,5 +41,6 @@ class KeekIE(InfoExtractor):
'ext': 'mp4',
'title': self._og_search_title(webpage),
'thumbnail': thumbnail,
- 'uploader': uploader
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
}
diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py
index 75b63cffb..97dcb518a 100644
--- a/youtube_dl/extractor/keezmovies.py
+++ b/youtube_dl/extractor/keezmovies.py
@@ -4,7 +4,7 @@ import os
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse_urlparse,
compat_urllib_request,
compat_urllib_parse,
@@ -15,7 +15,7 @@ from ..aes import (
class KeezMoviesIE(InfoExtractor):
- _VALID_URL = r'^https?://(?:www\.)?keezmovies\.com/video/.+?(?P<videoid>[0-9]+)(?:[/?&]|$)'
+ _VALID_URL = r'https?://(?:www\.)?keezmovies\.com/video/.+?(?P<id>[0-9]+)(?:[/?&]|$)'
_TEST = {
'url': 'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711',
'file': '1214711.mp4',
@@ -27,8 +27,7 @@ class KeezMoviesIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('videoid')
+ video_id = self._match_id(url)
req = compat_urllib_request.Request(url)
req.add_header('Cookie', 'age_verified=1')
diff --git a/youtube_dl/extractor/khanacademy.py b/youtube_dl/extractor/khanacademy.py
index 408d00944..08a671fa8 100644
--- a/youtube_dl/extractor/khanacademy.py
+++ b/youtube_dl/extractor/khanacademy.py
@@ -22,8 +22,10 @@ class KhanAcademyIE(InfoExtractor):
'description': 'The perfect cipher',
'duration': 176,
'uploader': 'Brit Cruise',
+ 'uploader_id': 'khanacademy',
'upload_date': '20120411',
- }
+ },
+ 'add_ie': ['Youtube'],
}, {
'url': 'https://www.khanacademy.org/math/applied-math/cryptography',
'info_dict': {
diff --git a/youtube_dl/extractor/kickstarter.py b/youtube_dl/extractor/kickstarter.py
index 56a76380c..7d4b57056 100644
--- a/youtube_dl/extractor/kickstarter.py
+++ b/youtube_dl/extractor/kickstarter.py
@@ -1,8 +1,6 @@
# encoding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
@@ -15,28 +13,25 @@ class KickStarterIE(InfoExtractor):
'id': '1404461844',
'ext': 'mp4',
'title': 'Intersection: The Story of Josh Grant by Kyle Cowling',
- 'description': 'A unique motocross documentary that examines the '
- 'life and mind of one of sports most elite athletes: Josh Grant.',
+ 'description': (
+ 'A unique motocross documentary that examines the '
+ 'life and mind of one of sports most elite athletes: Josh Grant.'
+ ),
},
}, {
'note': 'Embedded video (not using the native kickstarter video service)',
'url': 'https://www.kickstarter.com/projects/597507018/pebble-e-paper-watch-for-iphone-and-android/posts/659178',
- 'playlist': [
- {
- 'info_dict': {
- 'id': '78704821',
- 'ext': 'mp4',
- 'uploader_id': 'pebble',
- 'uploader': 'Pebble Technology',
- 'title': 'Pebble iOS Notifications',
- }
- }
- ],
+ 'info_dict': {
+ 'id': '78704821',
+ 'ext': 'mp4',
+ 'uploader_id': 'pebble',
+ 'uploader': 'Pebble Technology',
+ 'title': 'Pebble iOS Notifications',
+ }
}]
def _real_extract(self, url):
- m = re.match(self._VALID_URL, url)
- video_id = m.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(
diff --git a/youtube_dl/extractor/kontrtube.py b/youtube_dl/extractor/kontrtube.py
index 8a73ecfa0..720bc939b 100644
--- a/youtube_dl/extractor/kontrtube.py
+++ b/youtube_dl/extractor/kontrtube.py
@@ -10,13 +10,14 @@ from ..utils import int_or_none
class KontrTubeIE(InfoExtractor):
IE_NAME = 'kontrtube'
IE_DESC = 'KontrTube.ru - Труба зовёт'
- _VALID_URL = r'http://(?:www\.)?kontrtube\.ru/videos/(?P<id>\d+)/.+'
+ _VALID_URL = r'http://(?:www\.)?kontrtube\.ru/videos/(?P<id>\d+)/(?P<display_id>[^/]+)/'
_TEST = {
'url': 'http://www.kontrtube.ru/videos/2678/nad-olimpiyskoy-derevney-v-sochi-podnyat-rossiyskiy-flag/',
'md5': '975a991a4926c9a85f383a736a2e6b80',
'info_dict': {
'id': '2678',
+ 'display_id': 'nad-olimpiyskoy-derevney-v-sochi-podnyat-rossiyskiy-flag',
'ext': 'mp4',
'title': 'Над олимпийской деревней в Сочи поднят российский флаг',
'description': 'md5:80edc4c613d5887ae8ccf1d59432be41',
@@ -28,21 +29,28 @@ class KontrTubeIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
+ display_id = mobj.group('display_id')
- webpage = self._download_webpage(url, video_id, 'Downloading page')
+ webpage = self._download_webpage(
+ url, display_id, 'Downloading page')
- video_url = self._html_search_regex(r"video_url: '(.+?)/?',", webpage, 'video URL')
- thumbnail = self._html_search_regex(r"preview_url: '(.+?)/?',", webpage, 'video thumbnail', fatal=False)
+ video_url = self._html_search_regex(
+ r"video_url\s*:\s*'(.+?)/?',", webpage, 'video URL')
+ thumbnail = self._html_search_regex(
+ r"preview_url\s*:\s*'(.+?)/?',", webpage, 'video thumbnail', fatal=False)
title = self._html_search_regex(
r'<title>(.+?)</title>', webpage, 'video title')
- description = self._html_search_meta('description', webpage, 'video description')
+ description = self._html_search_meta(
+ 'description', webpage, 'video description')
mobj = re.search(
- r'<div class="col_2">Длительность: <span>(?P<minutes>\d+)м:(?P<seconds>\d+)с</span></div>', webpage)
+ r'<div class="col_2">Длительность: <span>(?P<minutes>\d+)м:(?P<seconds>\d+)с</span></div>',
+ webpage)
duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None
view_count = self._html_search_regex(
- r'<div class="col_2">Просмотров: <span>(\d+)</span></div>', webpage, 'view count', fatal=False)
+ r'<div class="col_2">Просмотров: <span>(\d+)</span></div>',
+ webpage, 'view count', fatal=False)
comment_count = None
comment_str = self._html_search_regex(
@@ -56,6 +64,7 @@ class KontrTubeIE(InfoExtractor):
return {
'id': video_id,
+ 'display_id': display_id,
'url': video_url,
'thumbnail': thumbnail,
'title': title,
@@ -63,4 +72,4 @@ class KontrTubeIE(InfoExtractor):
'duration': duration,
'view_count': int_or_none(view_count),
'comment_count': int_or_none(comment_count),
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/krasview.py b/youtube_dl/extractor/krasview.py
index 6f3d2345b..e46954b47 100644
--- a/youtube_dl/extractor/krasview.py
+++ b/youtube_dl/extractor/krasview.py
@@ -2,18 +2,17 @@
from __future__ import unicode_literals
import json
-import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
- unescapeHTML,
+ js_to_json,
)
class KrasViewIE(InfoExtractor):
IE_DESC = 'Красвью'
- _VALID_URL = r'https?://krasview\.ru/video/(?P<id>\d+)'
+ _VALID_URL = r'https?://krasview\.ru/(?:video|embed)/(?P<id>\d+)'
_TEST = {
'url': 'http://krasview.ru/video/512228',
@@ -29,20 +28,18 @@ class KrasViewIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- flashvars = json.loads(self._search_regex(
- r'flashvars\s*:\s*({.+?})\s*}\);', webpage, 'flashvars'))
+ flashvars = json.loads(js_to_json(self._search_regex(
+ r'video_Init\(({.+?})', webpage, 'flashvars')))
video_url = flashvars['url']
- title = unescapeHTML(flashvars['title'])
- description = unescapeHTML(flashvars.get('subtitle') or self._og_search_description(webpage, default=None))
- thumbnail = flashvars['image']
- duration = int(flashvars['duration'])
- filesize = int(flashvars['size'])
+ title = self._og_search_title(webpage)
+ description = self._og_search_description(webpage, default=None)
+ thumbnail = flashvars.get('image') or self._og_search_thumbnail(webpage)
+ duration = int_or_none(flashvars.get('duration'))
width = int_or_none(self._og_search_property('video:width', webpage, 'video width'))
height = int_or_none(self._og_search_property('video:height', webpage, 'video height'))
@@ -53,7 +50,6 @@ class KrasViewIE(InfoExtractor):
'description': description,
'thumbnail': thumbnail,
'duration': duration,
- 'filesize': filesize,
'width': width,
'height': height,
}
diff --git a/youtube_dl/extractor/ku6.py b/youtube_dl/extractor/ku6.py
index 484239b19..a602980a1 100644
--- a/youtube_dl/extractor/ku6.py
+++ b/youtube_dl/extractor/ku6.py
@@ -1,7 +1,5 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
@@ -18,11 +16,11 @@ class Ku6IE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- title = self._search_regex(r'<h1 title=.*>(.*?)</h1>', webpage, 'title')
+
+ title = self._html_search_regex(
+ r'<h1 title=.*>(.*?)</h1>', webpage, 'title')
dataUrl = 'http://v.ku6.com/fetchVideo4Player/%s.html' % video_id
jsonData = self._download_json(dataUrl, video_id)
downloadUrl = jsonData['data']['f']
@@ -32,4 +30,3 @@ class Ku6IE(InfoExtractor):
'title': title,
'url': downloadUrl
}
-
diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py
new file mode 100644
index 000000000..2fd3b4699
--- /dev/null
+++ b/youtube_dl/extractor/laola1tv.py
@@ -0,0 +1,87 @@
+from __future__ import unicode_literals
+
+import random
+import re
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class Laola1TvIE(InfoExtractor):
+    """Extractor for laola1.tv video pages; results are always marked live."""
+
+    _VALID_URL = r'https?://(?:www\.)?laola1\.tv/(?P<lang>[a-z]+)-(?P<portal>[a-z]+)/.*?/(?P<id>[0-9]+)\.html'
+    _TEST = {
+        'url': 'http://www.laola1.tv/de-de/live/bwf-bitburger-open-grand-prix-gold-court-1/250019.html',
+        'info_dict': {
+            'id': '250019',
+            'ext': 'mp4',
+            'title': 'Bitburger Open Grand Prix Gold - Court 1',
+            'categories': ['Badminton'],
+            'uploader': 'BWF - Badminton World Federation',
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }
+
+    _BROKEN = True  # Not really - extractor works fine, but f4m downloader does not support live streams yet.
+
+    def _real_extract(self, url):
+        """Return the info dict for the laola1.tv page at *url*.
+
+        Raises ExtractorError when the token service answers with
+        auth="blocked".
+        """
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        lang = mobj.group('lang')
+        portal = mobj.group('portal')
+
+        webpage = self._download_webpage(url, video_id)
+        iframe_url = self._search_regex(
+            r'<iframe[^>]*?class="main_tv_player"[^>]*?src="([^"]+)"',
+            webpage, 'iframe URL')
+
+        # The player iframe sets flashvars.<name> = "<value>"; collect them all.
+        iframe = self._download_webpage(
+            iframe_url, video_id, note='Downloading iframe')
+        flashvars = dict(re.findall(
+            r'flashvars\.([_a-zA-Z0-9]+)\s*=\s*"([^"]*)";', iframe))
+
+        xml_url = ('http://www.laola1.tv/server/hd_video.php?' +
+                   'play=%s&partner=1&portal=%s&v5ident=&lang=%s' % (
+                       video_id, portal, lang))
+        hd_doc = self._download_xml(xml_url, video_id)
+
+        title = hd_doc.find('.//video/title').text
+        flash_url = hd_doc.find('.//video/url').text
+        categories = hd_doc.find('.//video/meta_sports').text.split(',')
+        # NOTE(review): "meta_organistation" is presumably the tag name as
+        # spelled by the server - confirm before "correcting" it.
+        uploader = hd_doc.find('.//video/meta_organistation').text
+
+        ident = random.randint(10000000, 99999999)
+        token_url = '%s&ident=%s&klub=0&unikey=0&timestamp=%s&auth=%s' % (
+            flash_url, ident, flashvars['timestamp'], flashvars['auth'])
+
+        token_doc = self._download_xml(
+            token_url, video_id, note='Downloading token')
+        token_attrib = token_doc.find('.//token').attrib
+        if token_attrib.get('auth') == 'blocked':
+            # Surface the reason given in the token's "comment" attribute.
+            raise ExtractorError('Token error: %s' % token_attrib.get('comment'))
+
+        video_url = '%s?hdnea=%s&hdcore=3.2.0' % (
+            token_attrib['url'], token_attrib['auth'])
+
+        return {
+            'id': video_id,
+            'is_live': True,
+            'title': title,
+            'url': video_url,
+            'uploader': uploader,
+            'categories': categories,
+            'ext': 'mp4',
+        }
diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py
index 8d9491f23..1dfe7f77f 100644
--- a/youtube_dl/extractor/lifenews.py
+++ b/youtube_dl/extractor/lifenews.py
@@ -52,7 +52,7 @@ class LifeNewsIE(InfoExtractor):
r'<div class=\'comments\'>\s*<span class=\'counter\'>(\d+)</span>', webpage, 'comment count', fatal=False)
upload_date = self._html_search_regex(
- r'<time datetime=\'([^\']+)\'>', webpage, 'upload date',fatal=False)
+ r'<time datetime=\'([^\']+)\'>', webpage, 'upload date', fatal=False)
if upload_date is not None:
upload_date = unified_strdate(upload_date)
@@ -71,4 +71,4 @@ class LifeNewsIE(InfoExtractor):
if len(videos) == 1:
return make_entry(video_id, videos[0])
else:
- return [make_entry(video_id, media, video_number+1) for video_number, media in enumerate(videos)] \ No newline at end of file
+ return [make_entry(video_id, media, video_number + 1) for video_number, media in enumerate(videos)]
diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py
index 8e50e8f79..35822067f 100644
--- a/youtube_dl/extractor/liveleak.py
+++ b/youtube_dl/extractor/liveleak.py
@@ -8,21 +8,20 @@ from ..utils import int_or_none
class LiveLeakIE(InfoExtractor):
- _VALID_URL = r'^(?:http://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
+ _VALID_URL = r'https?://(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<id>[\w_]+)(?:.*)'
_TESTS = [{
'url': 'http://www.liveleak.com/view?i=757_1364311680',
- 'md5': '0813c2430bea7a46bf13acf3406992f4',
+ 'md5': '50f79e05ba149149c1b4ea961223d5b3',
'info_dict': {
'id': '757_1364311680',
- 'ext': 'mp4',
+ 'ext': 'flv',
'description': 'extremely bad day for this guy..!',
'uploader': 'ljfriel2',
'title': 'Most unlucky car accident'
}
- },
- {
+ }, {
'url': 'http://www.liveleak.com/view?i=f93_1390833151',
- 'md5': 'd3f1367d14cc3c15bf24fbfbe04b9abf',
+ 'md5': 'b13a29626183c9d33944e6a04f41aafc',
'info_dict': {
'id': 'f93_1390833151',
'ext': 'mp4',
@@ -30,8 +29,7 @@ class LiveLeakIE(InfoExtractor):
'uploader': 'ARD_Stinkt',
'title': 'German Television does first Edward Snowden Interview (ENGLISH)',
}
- },
- {
+ }, {
'url': 'http://www.liveleak.com/view?i=4f7_1392687779',
'md5': '42c6d97d54f1db107958760788c5f48f',
'info_dict': {
@@ -45,8 +43,7 @@ class LiveLeakIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('video_id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
video_title = self._og_search_title(webpage).replace('LiveLeak.com -', '').strip()
@@ -83,9 +80,19 @@ class LiveLeakIE(InfoExtractor):
sources = json.loads(sources_json)
formats = [{
+ 'format_id': '%s' % i,
'format_note': s.get('label'),
'url': s['file'],
- } for s in sources]
+ } for i, s in enumerate(sources)]
+ for i, s in enumerate(sources):
+ orig_url = s['file'].replace('.h264_base.mp4', '')
+ if s['file'] != orig_url:
+ formats.append({
+ 'format_id': 'original-%s' % i,
+ 'format_note': s.get('label'),
+ 'url': orig_url,
+ 'preference': 1,
+ })
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py
index 516147417..5247c6f58 100644
--- a/youtube_dl/extractor/livestream.py
+++ b/youtube_dl/extractor/livestream.py
@@ -4,10 +4,12 @@ import re
import json
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_str,
compat_urllib_parse_urlparse,
compat_urlparse,
+)
+from ..utils import (
ExtractorError,
find_xpath_attr,
int_or_none,
@@ -18,7 +20,7 @@ from ..utils import (
class LivestreamIE(InfoExtractor):
IE_NAME = 'livestream'
- _VALID_URL = r'http://new\.livestream\.com/.*?/(?P<event_name>.*?)(/videos/(?P<id>\d+))?/?$'
+ _VALID_URL = r'https?://new\.livestream\.com/.*?/(?P<event_name>.*?)(/videos/(?P<id>[0-9]+)(?:/player)?)?/?(?:$|[?#])'
_TESTS = [{
'url': 'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370',
'md5': '53274c76ba7754fb0e8d072716f2292b',
@@ -37,6 +39,9 @@ class LivestreamIE(InfoExtractor):
'title': 'TEDCity2.0 (English)',
},
'playlist_mincount': 4,
+ }, {
+ 'url': 'https://new.livestream.com/accounts/362/events/3557232/videos/67864563/player?autoPlay=false&height=360&mute=false&width=640',
+ 'only_matching': True,
}]
def _parse_smil(self, video_id, smil_url):
@@ -190,7 +195,8 @@ class LivestreamOriginalIE(InfoExtractor):
'id': video_id,
'title': item.find('title').text,
'url': 'rtmp://extondemand.livestream.com/ondemand',
- 'play_path': 'mp4:trans/dv15/mogulus-{0}.mp4'.format(path),
+ 'play_path': 'trans/dv15/mogulus-{0}'.format(path),
+ 'player_url': 'http://static.livestream.com/chromelessPlayer/v21/playerapi.swf?hash=5uetk&v=0803&classid=D27CDB6E-AE6D-11cf-96B8-444553540000&jsEnabled=false&wmode=opaque',
'ext': 'flv',
'thumbnail': thumbnail_url,
}
diff --git a/youtube_dl/extractor/lnkgo.py b/youtube_dl/extractor/lnkgo.py
new file mode 100644
index 000000000..a8e357859
--- /dev/null
+++ b/youtube_dl/extractor/lnkgo.py
@@ -0,0 +1,138 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    js_to_json,
+    unified_strdate,
+)
+
+
+class LnkGoIE(InfoExtractor):
+    """Extractor for lnkgo.alfa.lt visi-video pages."""
+
+    _VALID_URL = r'https?://(?:www\.)?lnkgo\.alfa\.lt/visi\-video/(?P<show>[^/]+)/ziurek\-(?P<display_id>[A-Za-z0-9\-]+)'
+    _TESTS = [{
+        'url': 'http://lnkgo.alfa.lt/visi-video/yra-kaip-yra/ziurek-yra-kaip-yra-162',
+        'info_dict': {
+            'id': '46712',
+            'ext': 'mp4',
+            'title': 'Yra kaip yra',
+            'upload_date': '20150107',
+            'description': 'md5:d82a5e36b775b7048617f263a0e3475e',
+            'age_limit': 7,
+            'duration': 3019,
+            'thumbnail': r're:^https?://.*\.jpg$'
+        },
+        'params': {
+            'skip_download': True,  # HLS download
+        },
+    }, {
+        'url': 'http://lnkgo.alfa.lt/visi-video/aktualai-pratesimas/ziurek-nerdas-taiso-kompiuteri-2',
+        'info_dict': {
+            'id': '47289',
+            'ext': 'mp4',
+            'title': 'Nėrdas: Kompiuterio Valymas',
+            'upload_date': '20150113',
+            'description': 'md5:7352d113a242a808676ff17e69db6a69',
+            'age_limit': 18,
+            'duration': 346,
+            'thumbnail': r're:^https?://.*\.jpg$'
+        },
+        'params': {
+            'skip_download': True,  # HLS download
+        },
+    }]
+    # Maps the page's pgrating attribute (upper-cased) to an age limit.
+    _AGE_LIMITS = {
+        'N-7': 7,
+        'N-14': 14,
+        'S': 18,
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict for the lnkgo page at *url*.
+
+        Formats are built from the page's "sources" list; entries that have
+        no "file" value, or an unrecognised one, are skipped.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('display_id')
+
+        webpage = self._download_webpage(
+            url, display_id, 'Downloading player webpage')
+
+        video_id = self._search_regex(
+            r'data-ep="([^"]+)"', webpage, 'video ID')
+        title = self._og_search_title(webpage)
+        description = self._og_search_description(webpage)
+
+        thumbnail_w = int_or_none(
+            self._og_search_property('image:width', webpage, 'thumbnail width', fatal=False))
+        thumbnail_h = int_or_none(
+            self._og_search_property('image:height', webpage, 'thumbnail height', fatal=False))
+        thumbnail = {
+            'url': self._og_search_thumbnail(webpage),
+        }
+        if thumbnail_w and thumbnail_h:
+            thumbnail.update({
+                'width': thumbnail_w,
+                'height': thumbnail_h,
+            })
+
+        # The air time is optional; only parse it when the search found one.
+        air_time = self._search_regex(
+            r'class="meta-item\sair-time">.*?<strong>([^<]+)</strong>', webpage, 'upload date', fatal=False)
+        upload_date = unified_strdate(air_time) if air_time is not None else None
+        duration = int_or_none(self._search_regex(
+            r'VideoDuration = "([^"]+)"', webpage, 'duration', fatal=False))
+
+        pg_rating = self._search_regex(
+            r'pgrating="([^"]+)"', webpage, 'PG rating', fatal=False, default='')
+        age_limit = self._AGE_LIMITS.get(pg_rating.upper(), 0)
+
+        sources_js = self._search_regex(
+            r'(?s)sources:\s(\[.*?\]),', webpage, 'sources')
+        sources = self._parse_json(
+            sources_js, video_id, transform_source=js_to_json)
+
+        formats = []
+        for source in sources:
+            # A source without a media location cannot yield a format.
+            file_url = source.get('file')
+            if not file_url:
+                continue
+            if source.get('provider') == 'rtmp':
+                m = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<play_path>.+)$', file_url)
+                if not m:
+                    continue
+                formats.append({
+                    'format_id': 'rtmp',
+                    'ext': 'flv',
+                    'url': m.group('url'),
+                    'play_path': m.group('play_path'),
+                    'page_url': url,
+                })
+            elif file_url.endswith('.m3u8'):
+                formats.append({
+                    'format_id': 'hls',
+                    'ext': source.get('type', 'mp4'),
+                    'url': file_url,
+                })
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'formats': formats,
+            'thumbnails': [thumbnail],
+            'duration': duration,
+            'description': description,
+            'age_limit': age_limit,
+            'upload_date': upload_date,
+        }
diff --git a/youtube_dl/extractor/lrt.py b/youtube_dl/extractor/lrt.py
index fca0bfef0..9c2fbdd96 100644
--- a/youtube_dl/extractor/lrt.py
+++ b/youtube_dl/extractor/lrt.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
import re
-import json
from .common import InfoExtractor
from ..utils import (
@@ -22,19 +21,16 @@ class LRTIE(InfoExtractor):
'id': '54391',
'ext': 'mp4',
'title': 'Septynios Kauno dienos',
- 'description': 'Kauno miesto ir apskrities naujienos',
+ 'description': 'md5:24d84534c7dc76581e59f5689462411a',
'duration': 1783,
},
'params': {
'skip_download': True, # HLS download
},
-
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = remove_end(self._og_search_title(webpage), ' - LRT')
@@ -46,7 +42,9 @@ class LRTIE(InfoExtractor):
formats = []
for js in re.findall(r'(?s)config:\s*(\{.*?\})', webpage):
- data = json.loads(js_to_json(js))
+ data = self._parse_json(js, video_id, transform_source=js_to_json)
+ if 'provider' not in data:
+ continue
if data['provider'] == 'rtmp':
formats.append({
'format_id': 'rtmp',
diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py
index 33f34f4e9..26e84970d 100644
--- a/youtube_dl/extractor/lynda.py
+++ b/youtube_dl/extractor/lynda.py
@@ -5,12 +5,14 @@ import json
from .subtitles import SubtitlesInfoExtractor
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
+ compat_str,
compat_urllib_parse,
compat_urllib_request,
+)
+from ..utils import (
ExtractorError,
int_or_none,
- compat_str,
)
@@ -45,7 +47,7 @@ class LyndaIE(SubtitlesInfoExtractor):
video_id = mobj.group(1)
page = self._download_webpage('http://www.lynda.com/ajax/player?videoId=%s&type=video' % video_id, video_id,
- 'Downloading video JSON')
+ 'Downloading video JSON')
video_json = json.loads(page)
if 'Status' in video_json:
@@ -109,7 +111,7 @@ class LyndaIE(SubtitlesInfoExtractor):
'password': password,
'remember': 'false',
'stayPut': 'false'
- }
+ }
request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
login_page = self._download_webpage(request, None, 'Logging in as %s' % username)
@@ -117,7 +119,7 @@ class LyndaIE(SubtitlesInfoExtractor):
m = re.search(r'loginResultJson = \'(?P<json>[^\']+)\';', login_page)
if m is not None:
response = m.group('json')
- response_json = json.loads(response)
+ response_json = json.loads(response)
state = response_json['state']
if state == 'notlogged':
@@ -187,7 +189,7 @@ class LyndaCourseIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
course_path = mobj.group('coursepath')
course_id = mobj.group('courseid')
-
+
page = self._download_webpage('http://www.lynda.com/ajax/player?courseId=%s&type=course' % course_id,
course_id, 'Downloading course JSON')
course_json = json.loads(page)
@@ -221,4 +223,4 @@ class LyndaCourseIE(InfoExtractor):
course_title = course_json['Title']
- return self.playlist_result(entries, course_id, course_title) \ No newline at end of file
+ return self.playlist_result(entries, course_id, course_title)
diff --git a/youtube_dl/extractor/m6.py b/youtube_dl/extractor/m6.py
index 1a26b5d57..7e025831b 100644
--- a/youtube_dl/extractor/m6.py
+++ b/youtube_dl/extractor/m6.py
@@ -27,7 +27,7 @@ class M6IE(InfoExtractor):
video_id = mobj.group('id')
rss = self._download_xml('http://ws.m6.fr/v1/video/info/m6/bonus/%s' % video_id, video_id,
- 'Downloading video RSS')
+ 'Downloading video RSS')
title = rss.find('./channel/item/title').text
description = rss.find('./channel/item/description').text
@@ -53,4 +53,4 @@ class M6IE(InfoExtractor):
'duration': duration,
'view_count': view_count,
'formats': formats,
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/mailru.py b/youtube_dl/extractor/mailru.py
index 7460d81cd..54a14cb94 100644
--- a/youtube_dl/extractor/mailru.py
+++ b/youtube_dl/extractor/mailru.py
@@ -16,7 +16,7 @@ class MailRuIE(InfoExtractor):
'url': 'http://my.mail.ru/video/top#video=/mail/sonypicturesrus/75/76',
'md5': 'dea205f03120046894db4ebb6159879a',
'info_dict': {
- 'id': '46301138',
+ 'id': '46301138_76',
'ext': 'mp4',
'title': 'Новый Человек-Паук. Высокое напряжение. Восстание Электро',
'timestamp': 1393232740,
@@ -30,7 +30,7 @@ class MailRuIE(InfoExtractor):
'url': 'http://my.mail.ru/corp/hitech/video/news_hi-tech_mail_ru/1263.html',
'md5': '00a91a58c3402204dcced523777b475f',
'info_dict': {
- 'id': '46843144',
+ 'id': '46843144_1263',
'ext': 'mp4',
'title': 'Samsung Galaxy S5 Hammer Smash Fail Battery Explosion',
'timestamp': 1397217632,
@@ -54,33 +54,36 @@ class MailRuIE(InfoExtractor):
author = video_data['author']
uploader = author['name']
- uploader_id = author['id']
+ uploader_id = author.get('id') or author.get('email')
+ view_count = video_data.get('views_count')
- movie = video_data['movie']
- content_id = str(movie['contentId'])
- title = movie['title']
+ meta_data = video_data['meta']
+ content_id = '%s_%s' % (
+ meta_data.get('accId', ''), meta_data['itemId'])
+ title = meta_data['title']
if title.endswith('.mp4'):
title = title[:-4]
- thumbnail = movie['poster']
- duration = movie['duration']
-
- view_count = video_data['views_count']
+ thumbnail = meta_data['poster']
+ duration = meta_data['duration']
+ timestamp = meta_data['timestamp']
formats = [
{
'url': video['url'],
- 'format_id': video['name'],
+ 'format_id': video['key'],
+ 'height': int(video['key'].rstrip('p'))
} for video in video_data['videos']
]
+ self._sort_formats(formats)
return {
'id': content_id,
'title': title,
'thumbnail': thumbnail,
- 'timestamp': video_data['timestamp'],
+ 'timestamp': timestamp,
'uploader': uploader,
'uploader_id': uploader_id,
'duration': duration,
'view_count': view_count,
'formats': formats,
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/malemotion.py b/youtube_dl/extractor/malemotion.py
index 8c1966ab2..0b85a59d1 100644
--- a/youtube_dl/extractor/malemotion.py
+++ b/youtube_dl/extractor/malemotion.py
@@ -1,42 +1,33 @@
+# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
)
+
class MalemotionIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?malemotion\.com/video/(.+?)\.(?P<id>.+?)(#|$)'
+ _VALID_URL = r'https?://malemotion\.com/video/(.+?)\.(?P<id>.+?)(#|$)'
_TEST = {
- 'url': 'http://malemotion.com/video/bien-dur.10ew',
- 'file': '10ew.mp4',
- 'md5': 'b3cc49f953b107e4a363cdff07d100ce',
+ 'url': 'http://malemotion.com/video/bete-de-concours.ltc',
+ 'md5': '3013e53a0afbde2878bc39998c33e8a5',
'info_dict': {
- "title": "Bien dur",
- "age_limit": 18,
+ 'id': 'ltc',
+ 'ext': 'mp4',
+ 'title': 'Bête de Concours',
+ 'age_limit': 18,
},
- 'skip': 'This video has been deleted.'
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group("id")
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- self.report_extraction(video_id)
-
- # Extract video URL
- video_url = compat_urllib_parse.unquote(
- self._search_regex(r'<source type="video/mp4" src="(.+?)"', webpage, 'video URL'))
-
- # Extract title
+ video_url = compat_urllib_parse.unquote(self._search_regex(
+ r'<source type="video/mp4" src="(.+?)"', webpage, 'video URL'))
video_title = self._html_search_regex(
r'<title>(.*?)</title', webpage, 'title')
-
- # Extract video thumbnail
video_thumbnail = self._search_regex(
r'<video .+?poster="(.+?)"', webpage, 'thumbnail', fatal=False)
@@ -46,14 +37,12 @@ class MalemotionIE(InfoExtractor):
'format_id': 'mp4',
'preference': 1,
}]
+ self._sort_formats(formats)
return {
'id': video_id,
'formats': formats,
- 'uploader': None,
- 'upload_date': None,
'title': video_title,
'thumbnail': video_thumbnail,
- 'description': None,
'age_limit': 18,
}
diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py
index 1b8c4a32e..5fdd19027 100644
--- a/youtube_dl/extractor/mdr.py
+++ b/youtube_dl/extractor/mdr.py
@@ -7,7 +7,7 @@ from .common import InfoExtractor
class MDRIE(InfoExtractor):
_VALID_URL = r'^(?P<domain>https?://(?:www\.)?mdr\.de)/(?:.*)/(?P<type>video|audio)(?P<video_id>[^/_]+)(?:_|\.html)'
-
+
# No tests, MDR regularily deletes its videos
_TEST = {
'url': 'http://www.mdr.de/fakt/video189002.html',
diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py
index 1a896b536..8bc333b02 100644
--- a/youtube_dl/extractor/metacafe.py
+++ b/youtube_dl/extractor/metacafe.py
@@ -3,10 +3,12 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_parse_qs,
compat_urllib_parse,
compat_urllib_request,
+)
+from ..utils import (
determine_ext,
ExtractorError,
int_or_none,
@@ -22,7 +24,7 @@ class MetacafeIE(InfoExtractor):
# Youtube video
{
'add_ie': ['Youtube'],
- 'url': 'http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/',
+ 'url': 'http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/',
'info_dict': {
'id': '_aUehQsCQtM',
'ext': 'mp4',
@@ -219,8 +221,8 @@ class MetacafeIE(InfoExtractor):
description = self._og_search_description(webpage)
thumbnail = self._og_search_thumbnail(webpage)
video_uploader = self._html_search_regex(
- r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);',
- webpage, 'uploader nickname', fatal=False)
+ r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);',
+ webpage, 'uploader nickname', fatal=False)
duration = int_or_none(
self._html_search_meta('video:duration', webpage))
diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py
index 07f072924..e30320569 100644
--- a/youtube_dl/extractor/metacritic.py
+++ b/youtube_dl/extractor/metacritic.py
@@ -28,7 +28,7 @@ class MetacriticIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
# The xml is not well formatted, there are raw '&'
info = self._download_xml('http://www.metacritic.com/video_data?video=' + video_id,
- video_id, 'Downloading info xml', transform_source=fix_xml_ampersands)
+ video_id, 'Downloading info xml', transform_source=fix_xml_ampersands)
clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id)
formats = []
@@ -44,7 +44,7 @@ class MetacriticIE(InfoExtractor):
self._sort_formats(formats)
description = self._html_search_regex(r'<b>Description:</b>(.*?)</p>',
- webpage, 'description', flags=re.DOTALL)
+ webpage, 'description', flags=re.DOTALL)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/minhateca.py b/youtube_dl/extractor/minhateca.py
new file mode 100644
index 000000000..14934b7ec
--- /dev/null
+++ b/youtube_dl/extractor/minhateca.py
@@ -0,0 +1,72 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse,
+ compat_urllib_request,
+)
+from ..utils import (
+ int_or_none,
+ parse_duration,
+ parse_filesize,
+)
+
+
+class MinhatecaIE(InfoExtractor):
+ _VALID_URL = r'https?://minhateca\.com\.br/[^?#]+,(?P<id>[0-9]+)\.'
+ _TEST = {
+ 'url': 'http://minhateca.com.br/pereba/misc/youtube-dl+test+video,125848331.mp4(video)',
+ 'info_dict': {
+ 'id': '125848331',
+ 'ext': 'mp4',
+ 'title': 'youtube-dl test video',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'filesize_approx': 1530000,
+ 'duration': 9,
+ 'view_count': int,
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ token = self._html_search_regex(
+ r'<input name="__RequestVerificationToken".*?value="([^"]+)"',
+ webpage, 'request token')
+ token_data = [
+ ('fileId', video_id),
+ ('__RequestVerificationToken', token),
+ ]
+ req = compat_urllib_request.Request(
+ 'http://minhateca.com.br/action/License/Download',
+ data=compat_urllib_parse.urlencode(token_data))
+ req.add_header('Content-Type', 'application/x-www-form-urlencoded')
+ data = self._download_json(
+ req, video_id, note='Downloading metadata')
+
+ video_url = data['redirectUrl']
+ title_str = self._html_search_regex(
+ r'<h1.*?>(.*?)</h1>', webpage, 'title')
+ title, _, ext = title_str.rpartition('.')
+ filesize_approx = parse_filesize(self._html_search_regex(
+ r'<p class="fileSize">(.*?)</p>',
+ webpage, 'file size approximation', fatal=False))
+ duration = parse_duration(self._html_search_regex(
+ r'(?s)<p class="fileLeng[ht][th]">.*?class="bold">(.*?)<',
+ webpage, 'duration', fatal=False))
+ view_count = int_or_none(self._html_search_regex(
+ r'<p class="downloadsCounter">([0-9]+)</p>',
+ webpage, 'view count', fatal=False))
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': title,
+ 'ext': ext,
+ 'filesize_approx': filesize_approx,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ }
diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py
index 807b1dc89..3c61a850f 100644
--- a/youtube_dl/extractor/mit.py
+++ b/youtube_dl/extractor/mit.py
@@ -5,8 +5,10 @@ import json
from .common import InfoExtractor
from .youtube import YoutubeIE
-from ..utils import (
+from ..compat import (
compat_urlparse,
+)
+from ..utils import (
clean_html,
ExtractorError,
get_element_by_id,
@@ -15,7 +17,7 @@ from ..utils import (
class TechTVMITIE(InfoExtractor):
IE_NAME = 'techtv.mit.edu'
- _VALID_URL = r'https?://techtv\.mit\.edu/(videos|embeds)/(?P<id>\d+)'
+ _VALID_URL = r'https?://techtv\.mit\.edu/(?:videos|embeds)/(?P<id>\d+)'
_TEST = {
'url': 'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set',
@@ -29,8 +31,7 @@ class TechTVMITIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
raw_page = self._download_webpage(
'http://techtv.mit.edu/videos/%s' % video_id, video_id)
clean_page = re.compile(r'<!--.*?-->', re.S).sub('', raw_page)
@@ -104,7 +105,10 @@ class OCWMITIE(InfoExtractor):
'ext': 'mp4',
'title': 'Lecture 7: Multiple Discrete Random Variables: Expectations, Conditioning, Independence',
'description': 'In this lecture, the professor discussed multiple random variables, expectations, and binomial distribution.',
- #'subtitles': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/MIT6_041F11_lec07_300k.mp4.srt'
+ 'upload_date': '20121109',
+ 'uploader_id': 'MIT',
+ 'uploader': 'MIT OpenCourseWare',
+ # 'subtitles': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/MIT6_041F11_lec07_300k.mp4.srt'
}
},
{
@@ -113,8 +117,11 @@ class OCWMITIE(InfoExtractor):
'id': '7K1sB05pE0A',
'ext': 'mp4',
'title': 'Session 1: Introduction to Derivatives',
+ 'upload_date': '20090818',
+ 'uploader_id': 'MIT',
+ 'uploader': 'MIT OpenCourseWare',
'description': 'This section contains lecture video excerpts, lecture notes, an interactive mathlet with supporting documents, and problem solving videos.',
- #'subtitles': 'http://ocw.mit.edu//courses/mathematics/18-01sc-single-variable-calculus-fall-2010/ocw-18.01-f07-lec01_300k.SRT'
+ # 'subtitles': 'http://ocw.mit.edu//courses/mathematics/18-01sc-single-variable-calculus-fall-2010/ocw-18.01-f07-lec01_300k.SRT'
}
}
]
diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py
index 979f3d692..256758323 100644
--- a/youtube_dl/extractor/mitele.py
+++ b/youtube_dl/extractor/mitele.py
@@ -1,11 +1,13 @@
from __future__ import unicode_literals
-import re
import json
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
+ compat_urlparse,
+)
+from ..utils import (
get_element_by_attribute,
parse_duration,
strip_jsonp,
@@ -14,7 +16,7 @@ from ..utils import (
class MiTeleIE(InfoExtractor):
IE_NAME = 'mitele.es'
- _VALID_URL = r'http://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<episode>[^/]+)/'
+ _VALID_URL = r'http://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<id>[^/]+)/'
_TEST = {
'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/',
@@ -30,22 +32,28 @@ class MiTeleIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- episode = mobj.group('episode')
+ episode = self._match_id(url)
webpage = self._download_webpage(url, episode)
embed_data_json = self._search_regex(
- r'MSV\.embedData\[.*?\]\s*=\s*({.*?});', webpage, 'embed data',
- flags=re.DOTALL
+ r'(?s)MSV\.embedData\[.*?\]\s*=\s*({.*?});', webpage, 'embed data',
).replace('\'', '"')
embed_data = json.loads(embed_data_json)
- info_url = embed_data['flashvars']['host']
+ domain = embed_data['mediaUrl']
+ if not domain.startswith('http'):
+ # only happens in telecinco.es videos
+ domain = 'http://' + domain
+ info_url = compat_urlparse.urljoin(
+ domain,
+ compat_urllib_parse.unquote(embed_data['flashvars']['host'])
+ )
info_el = self._download_xml(info_url, episode).find('./video/info')
video_link = info_el.find('videoUrl/link').text
token_query = compat_urllib_parse.urlencode({'id': video_link})
token_info = self._download_json(
- 'http://token.mitele.es/?' + token_query, episode,
+ embed_data['flashvars']['ov_tk'] + '?' + token_query,
+ episode,
transform_source=strip_jsonp
)
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index bb8937c4d..07d194562 100644
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -3,8 +3,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
+)
+from ..utils import (
ExtractorError,
HEADRequest,
int_or_none,
@@ -70,7 +72,7 @@ class MixcloudIE(InfoExtractor):
raise ExtractorError('Unable to extract track url')
PREFIX = (
- r'<div class="cloudcast-play-button-container[^"]*?"'
+ r'<span class="play-button[^"]*?"'
r'(?:\s+[a-zA-Z0-9-]+(?:="[^"]+")?)*?\s+')
title = self._html_search_regex(
PREFIX + r'm-title="([^"]+)"', webpage, 'title')
diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py
index 42aa2e227..1a241aca7 100644
--- a/youtube_dl/extractor/mlb.py
+++ b/youtube_dl/extractor/mlb.py
@@ -10,7 +10,7 @@ from ..utils import (
class MLBIE(InfoExtractor):
- _VALID_URL = r'https?://m\.mlb\.com/(?:(?:.*?/)?video/(?:topic/[\da-z_-]+/)?v|shared/video/embed/embed\.html\?.*?\bcontent_id=)(?P<id>n?\d+)'
+ _VALID_URL = r'https?://m(?:lb)?\.mlb\.com/(?:(?:.*?/)?video/(?:topic/[\da-z_-]+/)?v|(?:shared/video/embed/embed\.html|[^/]+/video/play\.jsp)\?.*?\bcontent_id=)(?P<id>n?\d+)'
_TESTS = [
{
'url': 'http://m.mlb.com/sea/video/topic/51231442/v34698933/nymsea-ackley-robs-a-home-run-with-an-amazing-catch/?c_id=sea',
@@ -72,6 +72,14 @@ class MLBIE(InfoExtractor):
'url': 'http://m.mlb.com/shared/video/embed/embed.html?content_id=35692085&topic_id=6479266&width=400&height=224&property=mlb',
'only_matching': True,
},
+ {
+ 'url': 'http://mlb.mlb.com/shared/video/embed/embed.html?content_id=36599553',
+ 'only_matching': True,
+ },
+ {
+ 'url': 'http://mlb.mlb.com/es/video/play.jsp?content_id=36599553',
+ 'only_matching': True,
+ },
]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/moevideo.py b/youtube_dl/extractor/moevideo.py
index 2ff79b9b8..5a66302f6 100644
--- a/youtube_dl/extractor/moevideo.py
+++ b/youtube_dl/extractor/moevideo.py
@@ -5,10 +5,12 @@ import json
import re
from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
+)
+from ..utils import (
+ ExtractorError,
int_or_none,
)
@@ -50,7 +52,8 @@ class MoeVideoIE(InfoExtractor):
'height': 296,
'duration': 6027,
'filesize': 588257923,
- }
+ },
+ 'skip': 'Video has been removed',
},
]
diff --git a/youtube_dl/extractor/mofosex.py b/youtube_dl/extractor/mofosex.py
index d658647e6..2cec12d35 100644
--- a/youtube_dl/extractor/mofosex.py
+++ b/youtube_dl/extractor/mofosex.py
@@ -4,7 +4,7 @@ import os
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse_urlparse,
compat_urllib_request,
compat_urllib_parse,
@@ -12,7 +12,7 @@ from ..utils import (
class MofosexIE(InfoExtractor):
- _VALID_URL = r'^https?://(?:www\.)?(?P<url>mofosex\.com/videos/(?P<videoid>[0-9]+)/.*?\.html)'
+ _VALID_URL = r'https?://(?:www\.)?(?P<url>mofosex\.com/videos/(?P<id>[0-9]+)/.*?\.html)'
_TEST = {
'url': 'http://www.mofosex.com/videos/5018/japanese-teen-music-video.html',
'md5': '1b2eb47ac33cc75d4a80e3026b613c5a',
@@ -26,7 +26,7 @@ class MofosexIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('videoid')
+ video_id = mobj.group('id')
url = 'http://www.' + mobj.group('url')
req = compat_urllib_request.Request(url)
diff --git a/youtube_dl/extractor/mojvideo.py b/youtube_dl/extractor/mojvideo.py
index 90b460d65..0ba435dc5 100644
--- a/youtube_dl/extractor/mojvideo.py
+++ b/youtube_dl/extractor/mojvideo.py
@@ -55,4 +55,4 @@ class MojvideoIE(InfoExtractor):
'title': title,
'thumbnail': thumbnail,
'duration': duration,
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/moniker.py b/youtube_dl/extractor/moniker.py
index 79bb2ca59..5de719bdc 100644
--- a/youtube_dl/extractor/moniker.py
+++ b/youtube_dl/extractor/moniker.py
@@ -5,7 +5,7 @@ import os.path
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
)
@@ -37,10 +37,9 @@ class MonikerIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
orig_webpage = self._download_webpage(url, video_id)
+
fields = re.findall(r'type="hidden" name="(.+?)"\s* value="?(.+?)">', orig_webpage)
data = dict(fields)
@@ -54,7 +53,7 @@ class MonikerIE(InfoExtractor):
title = os.path.splitext(data['fname'])[0]
- #Could be several links with different quality
+ # Could be several links with different quality
links = re.findall(r'"file" : "?(.+?)",', webpage)
# Assume the links are ordered in quality
formats = [{
diff --git a/youtube_dl/extractor/mooshare.py b/youtube_dl/extractor/mooshare.py
index 7d21ea18f..7603af5e2 100644
--- a/youtube_dl/extractor/mooshare.py
+++ b/youtube_dl/extractor/mooshare.py
@@ -1,14 +1,15 @@
from __future__ import unicode_literals
import re
-import time
from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
+from ..compat import (
compat_urllib_request,
compat_urllib_parse,
)
+from ..utils import (
+ ExtractorError,
+)
class MooshareIE(InfoExtractor):
@@ -43,13 +44,11 @@ class MooshareIE(InfoExtractor):
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
page = self._download_webpage(url, video_id, 'Downloading page')
if re.search(r'>Video Not Found or Deleted<', page) is not None:
- raise ExtractorError(u'Video %s does not exist' % video_id, expected=True)
+ raise ExtractorError('Video %s does not exist' % video_id, expected=True)
hash_key = self._html_search_regex(r'<input type="hidden" name="hash" value="([^"]+)">', page, 'hash')
title = self._html_search_regex(r'(?m)<div class="blockTitle">\s*<h2>Watch ([^<]+)</h2>', page, 'title')
@@ -64,8 +63,7 @@ class MooshareIE(InfoExtractor):
'http://mooshare.biz/%s' % video_id, compat_urllib_parse.urlencode(download_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
- self.to_screen('%s: Waiting for timeout' % video_id)
- time.sleep(5)
+ self._sleep(5, video_id)
video_page = self._download_webpage(request, video_id, 'Downloading video page')
@@ -111,4 +109,4 @@ class MooshareIE(InfoExtractor):
'thumbnail': thumbnail,
'duration': duration,
'formats': formats,
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py
index 6229b2173..97d5da626 100644
--- a/youtube_dl/extractor/motherless.py
+++ b/youtube_dl/extractor/motherless.py
@@ -5,20 +5,20 @@ import re
from .common import InfoExtractor
from ..utils import (
- int_or_none,
+ str_to_int,
unified_strdate,
)
class MotherlessIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?motherless\.com/(?P<id>[A-Z0-9]+)'
+ _VALID_URL = r'http://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)'
_TESTS = [
{
'url': 'http://motherless.com/AC3FFE1',
- 'md5': '5527fef81d2e529215dad3c2d744a7d9',
+ 'md5': '310f62e325a9fafe64f68c0bccb6e75f',
'info_dict': {
'id': 'AC3FFE1',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Fucked in the ass while playing PS3',
'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'],
'upload_date': '20100913',
@@ -40,33 +40,51 @@ class MotherlessIE(InfoExtractor):
'thumbnail': 're:http://.*\.jpg',
'age_limit': 18,
}
+ },
+ {
+ 'url': 'http://motherless.com/g/cosplay/633979F',
+ 'md5': '0b2a43f447a49c3e649c93ad1fafa4a0',
+ 'info_dict': {
+ 'id': '633979F',
+ 'ext': 'mp4',
+ 'title': 'Turtlette',
+ 'categories': ['superheroine heroine superher'],
+ 'upload_date': '20140827',
+ 'uploader_id': 'shade0230',
+ 'thumbnail': 're:http://.*\.jpg',
+ 'age_limit': 18,
+ }
}
]
- def _real_extract(self,url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- title = self._html_search_regex(r'id="view-upload-title">\s+([^<]+)<', webpage, 'title')
-
- video_url = self._html_search_regex(r'setup\(\{\s+"file".+: "([^"]+)",', webpage, 'video_url')
+ title = self._html_search_regex(
+ r'id="view-upload-title">\s+([^<]+)<', webpage, 'title')
+ video_url = self._html_search_regex(
+ r'setup\(\{\s+"file".+: "([^"]+)",', webpage, 'video URL')
age_limit = self._rta_search(webpage)
+ view_count = str_to_int(self._html_search_regex(
+ r'<strong>Views</strong>\s+([^<]+)<',
+ webpage, 'view count', fatal=False))
+ like_count = str_to_int(self._html_search_regex(
+ r'<strong>Favorited</strong>\s+([^<]+)<',
+ webpage, 'like count', fatal=False))
- view_count = self._html_search_regex(r'<strong>Views</strong>\s+([^<]+)<', webpage, 'view_count')
-
- upload_date = self._html_search_regex(r'<strong>Uploaded</strong>\s+([^<]+)<', webpage, 'upload_date')
+ upload_date = self._html_search_regex(
+ r'<strong>Uploaded</strong>\s+([^<]+)<', webpage, 'upload date')
if 'Ago' in upload_date:
days = int(re.search(r'([0-9]+)', upload_date).group(1))
upload_date = (datetime.datetime.now() - datetime.timedelta(days=days)).strftime('%Y%m%d')
else:
upload_date = unified_strdate(upload_date)
- like_count = self._html_search_regex(r'<strong>Favorited</strong>\s+([^<]+)<', webpage, 'like_count')
-
comment_count = webpage.count('class="media-comment-contents"')
- uploader_id = self._html_search_regex(r'"thumb-member-username">\s+<a href="/m/([^"]+)"', webpage, 'uploader_id')
+ uploader_id = self._html_search_regex(
+ r'"thumb-member-username">\s+<a href="/m/([^"]+)"',
+ webpage, 'uploader_id')
categories = self._html_search_meta('keywords', webpage)
if categories:
@@ -79,8 +97,8 @@ class MotherlessIE(InfoExtractor):
'uploader_id': uploader_id,
'thumbnail': self._og_search_thumbnail(webpage),
'categories': categories,
- 'view_count': int_or_none(view_count.replace(',', '')),
- 'like_count': int_or_none(like_count.replace(',', '')),
+ 'view_count': view_count,
+ 'like_count': like_count,
'comment_count': comment_count,
'age_limit': age_limit,
'url': video_url,
diff --git a/youtube_dl/extractor/motorsport.py b/youtube_dl/extractor/motorsport.py
index 7c0ec6a12..c1a482dba 100644
--- a/youtube_dl/extractor/motorsport.py
+++ b/youtube_dl/extractor/motorsport.py
@@ -1,63 +1,49 @@
# coding: utf-8
from __future__ import unicode_literals
-import hashlib
-import json
-import re
-import time
-
from .common import InfoExtractor
-from ..utils import (
- compat_parse_qs,
- compat_str,
- int_or_none,
+from ..compat import (
+ compat_urlparse,
)
class MotorsportIE(InfoExtractor):
IE_DESC = 'motorsport.com'
- _VALID_URL = r'http://www\.motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P<id>[^/]+)/(?:$|[?#])'
+ _VALID_URL = r'http://www\.motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P<id>[^/]+)/?(?:$|[?#])'
_TEST = {
'url': 'http://www.motorsport.com/f1/video/main-gallery/red-bull-racing-2014-rules-explained/',
- 'md5': '5592cb7c5005d9b2c163df5ac3dc04e4',
'info_dict': {
- 'id': '7063',
+ 'id': '2-T3WuR-KMM',
'ext': 'mp4',
'title': 'Red Bull Racing: 2014 Rules Explained',
- 'duration': 207,
+ 'duration': 208,
'description': 'A new clip from Red Bull sees Daniel Ricciardo and Sebastian Vettel explain the 2014 Formula One regulations – which are arguably the most complex the sport has ever seen.',
- 'uploader': 'rainiere',
- 'thumbnail': r're:^http://.*motorsport\.com/.+\.jpg$'
- }
+ 'uploader': 'mcomstaff',
+ 'uploader_id': 'UC334JIYKkVnyFoNCclfZtHQ',
+ 'upload_date': '20140903',
+ 'thumbnail': r're:^https?://.+\.jpg$'
+ },
+ 'add_ie': ['Youtube'],
+ 'params': {
+ 'skip_download': True,
+ },
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- display_id = mobj.group('id')
-
+ display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
- flashvars_code = self._html_search_regex(
- r'<embed id="player".*?flashvars="([^"]+)"', webpage, 'flashvars')
- flashvars = compat_parse_qs(flashvars_code)
- params = json.loads(flashvars['parameters'][0])
-
- e = compat_str(int(time.time()) + 24 * 60 * 60)
- base_video_url = params['location'] + '?e=' + e
- s = 'h3hg713fh32'
- h = hashlib.md5((s + base_video_url).encode('utf-8')).hexdigest()
- video_url = base_video_url + '&h=' + h
- uploader = self._html_search_regex(
- r'(?s)<span class="label">Video by: </span>(.*?)</a>', webpage,
- 'uploader', fatal=False)
+ iframe_path = self._html_search_regex(
+ r'<iframe id="player_iframe"[^>]+src="([^"]+)"', webpage,
+ 'iframe path')
+ iframe = self._download_webpage(
+ compat_urlparse.urljoin(url, iframe_path), display_id,
+ 'Downloading iframe')
+ youtube_id = self._search_regex(
+ r'www.youtube.com/embed/(.{11})', iframe, 'youtube id')
return {
- 'id': params['video_id'],
+ '_type': 'url_transparent',
'display_id': display_id,
- 'title': params['title'],
- 'url': video_url,
- 'description': params.get('description'),
- 'thumbnail': params.get('main_thumb'),
- 'duration': int_or_none(params.get('duration')),
- 'uploader': uploader,
+ 'url': 'https://youtube.com/watch?v=%s' % youtube_id,
}
diff --git a/youtube_dl/extractor/movieclips.py b/youtube_dl/extractor/movieclips.py
index 456807dd1..04e17d055 100644
--- a/youtube_dl/extractor/movieclips.py
+++ b/youtube_dl/extractor/movieclips.py
@@ -3,9 +3,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import (
+ compat_str,
+)
from ..utils import (
ExtractorError,
- compat_str,
clean_html,
)
diff --git a/youtube_dl/extractor/moviezine.py b/youtube_dl/extractor/moviezine.py
index 43146180a..f130b75c4 100644
--- a/youtube_dl/extractor/moviezine.py
+++ b/youtube_dl/extractor/moviezine.py
@@ -27,7 +27,7 @@ class MoviezineIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
jsplayer = self._download_webpage('http://www.moviezine.se/api/player.js?video=%s' % video_id, video_id, 'Downloading js api player')
- formats =[{
+ formats = [{
'format_id': 'sd',
'url': self._html_search_regex(r'file: "(.+?)",', jsplayer, 'file'),
'quality': 0,
diff --git a/youtube_dl/extractor/movshare.py b/youtube_dl/extractor/movshare.py
index 4191cf7a0..6101063f2 100644
--- a/youtube_dl/extractor/movshare.py
+++ b/youtube_dl/extractor/movshare.py
@@ -24,4 +24,4 @@ class MovShareIE(NovaMovIE):
'title': 'dissapeared image',
'description': 'optical illusion dissapeared image magic illusion',
}
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/mpora.py b/youtube_dl/extractor/mpora.py
index 387935d4d..88c9501cd 100644
--- a/youtube_dl/extractor/mpora.py
+++ b/youtube_dl/extractor/mpora.py
@@ -44,7 +44,7 @@ class MporaIE(InfoExtractor):
r'_([0-9]+)\.[a-zA-Z0-9]+$', src['src'],
False, default=None)
vcodec = src['type'].partition('/')[2]
-
+
formats.append({
'format_id': encoding_id + '-' + vcodec,
'url': src['src'],
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
index 228b42d2b..22a726327 100644
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -3,9 +3,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
+)
+from ..utils import (
ExtractorError,
find_xpath_attr,
fix_xml_ampersands,
@@ -33,7 +35,7 @@ class MTVServicesInfoExtractor(InfoExtractor):
m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\..+?/.*)$', rtmp_video_url)
if not m:
return rtmp_video_url
- base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
+ base = 'http://viacommtvstrmfs.fplive.net/'
return base + m.group('finalid')
def _get_feed_url(self, uri):
@@ -51,25 +53,25 @@ class MTVServicesInfoExtractor(InfoExtractor):
webpage_url = self._MOBILE_TEMPLATE % mtvn_id
req = compat_urllib_request.Request(webpage_url)
# Otherwise we get a webpage that would execute some javascript
- req.add_header('Youtubedl-user-agent', 'curl/7')
+ req.add_header('User-Agent', 'curl/7')
webpage = self._download_webpage(req, mtvn_id,
- 'Downloading mobile page')
+ 'Downloading mobile page')
metrics_url = unescapeHTML(self._search_regex(r'<a href="(http://metrics.+?)"', webpage, 'url'))
req = HEADRequest(metrics_url)
response = self._request_webpage(req, mtvn_id, 'Resolving url')
url = response.geturl()
# Transform the url to get the best quality:
url = re.sub(r'.+pxE=mp4', 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=0+_pxK=18639+_pxE=mp4', url, 1)
- return [{'url': url,'ext': 'mp4'}]
+ return [{'url': url, 'ext': 'mp4'}]
def _extract_video_formats(self, mdoc, mtvn_id):
if re.match(r'.*/(error_country_block\.swf|geoblock\.mp4)$', mdoc.find('.//src').text) is not None:
if mtvn_id is not None and self._MOBILE_TEMPLATE is not None:
self.to_screen('The normal version is not available from your '
- 'country, trying with the mobile version')
+ 'country, trying with the mobile version')
return self._extract_mobile_video_formats(mtvn_id)
raise ExtractorError('This video is not available from your country.',
- expected=True)
+ expected=True)
formats = []
for rendition in mdoc.findall('.//rendition'):
@@ -98,7 +100,7 @@ class MTVServicesInfoExtractor(InfoExtractor):
mediagen_url += '&acceptMethods=fms'
mediagen_doc = self._download_xml(mediagen_url, video_id,
- 'Downloading video urls')
+ 'Downloading video urls')
description_node = itemdoc.find('description')
if description_node is not None:
@@ -126,7 +128,7 @@ class MTVServicesInfoExtractor(InfoExtractor):
# This a short id that's used in the webpage urls
mtvn_id = None
mtvn_id_node = find_xpath_attr(itemdoc, './/{http://search.yahoo.com/mrss/}category',
- 'scheme', 'urn:mtvn:id')
+ 'scheme', 'urn:mtvn:id')
if mtvn_id_node is not None:
mtvn_id = mtvn_id_node.text
@@ -145,7 +147,8 @@ class MTVServicesInfoExtractor(InfoExtractor):
idoc = self._download_xml(
feed_url + '?' + data, video_id,
'Downloading info', transform_source=fix_xml_ampersands)
- return [self._get_video_info(item) for item in idoc.findall('.//item')]
+ return self.playlist_result(
+ [self._get_video_info(item) for item in idoc.findall('.//item')])
def _real_extract(self, url):
title = url_basename(url)
@@ -163,7 +166,7 @@ class MTVServicesInfoExtractor(InfoExtractor):
if mgid is None or ':' not in mgid:
mgid = self._search_regex(
[r'data-mgid="(.*?)"', r'swfobject.embedSWF\(".*?(mgid:.*?)"'],
- webpage, u'mgid')
+ webpage, 'mgid')
return self._get_videos_info(mgid)
@@ -186,7 +189,8 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor):
def _get_feed_url(self, uri):
video_id = self._id_from_uri(uri)
site_id = uri.replace(video_id, '')
- config_url = 'http://media.mtvnservices.com/pmt/e1/players/{0}/config.xml'.format(site_id)
+ config_url = ('http://media.mtvnservices.com/pmt/e1/players/{0}/'
+ 'context4/context5/config.xml'.format(site_id))
config_doc = self._download_xml(config_url, video_id)
feed_node = config_doc.find('.//feed')
feed_url = feed_node.text.strip().split('?')[0]
@@ -238,15 +242,15 @@ class MTVIE(MTVServicesInfoExtractor):
uri = mobj.groupdict().get('mgid')
if uri is None:
webpage = self._download_webpage(url, video_id)
-
+
# Some videos come from Vevo.com
m_vevo = re.search(r'isVevoVideo = true;.*?vevoVideoId = "(.*?)";',
webpage, re.DOTALL)
if m_vevo:
- vevo_id = m_vevo.group(1);
+ vevo_id = m_vevo.group(1)
self.to_screen('Vevo video detected: %s' % vevo_id)
return self.url_result('vevo:%s' % vevo_id, ie='Vevo')
-
+
uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, 'uri')
return self._get_videos_info(uri)
diff --git a/youtube_dl/extractor/muenchentv.py b/youtube_dl/extractor/muenchentv.py
index c7f6beb9c..b4e8ad17e 100644
--- a/youtube_dl/extractor/muenchentv.py
+++ b/youtube_dl/extractor/muenchentv.py
@@ -73,4 +73,3 @@ class MuenchenTVIE(InfoExtractor):
'is_live': True,
'thumbnail': thumbnail,
}
-
diff --git a/youtube_dl/extractor/musicplayon.py b/youtube_dl/extractor/musicplayon.py
index 42d7a82a5..50d92b50a 100644
--- a/youtube_dl/extractor/musicplayon.py
+++ b/youtube_dl/extractor/musicplayon.py
@@ -72,4 +72,4 @@ class MusicPlayOnIE(InfoExtractor):
'duration': int_or_none(duration),
'view_count': int_or_none(view_count),
'formats': formats,
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/muzu.py b/youtube_dl/extractor/muzu.py
index 1772b7f9a..1e9cf8de9 100644
--- a/youtube_dl/extractor/muzu.py
+++ b/youtube_dl/extractor/muzu.py
@@ -1,64 +1,65 @@
-import re
-import json
+from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
- determine_ext,
)
class MuzuTVIE(InfoExtractor):
_VALID_URL = r'https?://www\.muzu\.tv/(.+?)/(.+?)/(?P<id>\d+)'
- IE_NAME = u'muzu.tv'
+ IE_NAME = 'muzu.tv'
_TEST = {
- u'url': u'http://www.muzu.tv/defected/marcashken-featuring-sos-cat-walk-original-mix-music-video/1981454/',
- u'file': u'1981454.mp4',
- u'md5': u'98f8b2c7bc50578d6a0364fff2bfb000',
- u'info_dict': {
- u'title': u'Cat Walk (Original Mix)',
- u'description': u'md5:90e868994de201b2570e4e5854e19420',
- u'uploader': u'MarcAshken featuring SOS',
+ 'url': 'http://www.muzu.tv/defected/marcashken-featuring-sos-cat-walk-original-mix-music-video/1981454/',
+ 'md5': '98f8b2c7bc50578d6a0364fff2bfb000',
+ 'info_dict': {
+ 'id': '1981454',
+ 'ext': 'mp4',
+ 'title': 'Cat Walk (Original Mix)',
+ 'description': 'md5:90e868994de201b2570e4e5854e19420',
+ 'uploader': 'MarcAshken featuring SOS',
},
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
- info_data = compat_urllib_parse.urlencode({'format': 'json',
- 'url': url,
- })
- video_info_page = self._download_webpage('http://www.muzu.tv/api/oembed/?%s' % info_data,
- video_id, u'Downloading video info')
- info = json.loads(video_info_page)
+ info_data = compat_urllib_parse.urlencode({
+ 'format': 'json',
+ 'url': url,
+ })
+ info = self._download_json(
+ 'http://www.muzu.tv/api/oembed/?%s' % info_data,
+ video_id, 'Downloading video info')
- player_info_page = self._download_webpage('http://player.muzu.tv/player/playerInit?ai=%s' % video_id,
- video_id, u'Downloading player info')
- video_info = json.loads(player_info_page)['videos'][0]
- for quality in ['1080' , '720', '480', '360']:
+ player_info = self._download_json(
+ 'http://player.muzu.tv/player/playerInit?ai=%s' % video_id,
+ video_id, 'Downloading player info')
+ video_info = player_info['videos'][0]
+ for quality in ['1080', '720', '480', '360']:
if video_info.get('v%s' % quality):
break
- data = compat_urllib_parse.urlencode({'ai': video_id,
- # Even if each time you watch a video the hash changes,
- # it seems to work for different videos, and it will work
- # even if you use any non empty string as a hash
- 'viewhash': 'VBNff6djeV4HV5TRPW5kOHub2k',
- 'device': 'web',
- 'qv': quality,
- })
- video_url_page = self._download_webpage('http://player.muzu.tv/player/requestVideo?%s' % data,
- video_id, u'Downloading video url')
- video_url_info = json.loads(video_url_page)
+ data = compat_urllib_parse.urlencode({
+ 'ai': video_id,
+ # Even if each time you watch a video the hash changes,
+ # it seems to work for different videos, and it will work
+ # even if you use any non empty string as a hash
+ 'viewhash': 'VBNff6djeV4HV5TRPW5kOHub2k',
+ 'device': 'web',
+ 'qv': quality,
+ })
+ video_url_info = self._download_json(
+ 'http://player.muzu.tv/player/requestVideo?%s' % data,
+ video_id, 'Downloading video url')
video_url = video_url_info['url']
- return {'id': video_id,
- 'title': info['title'],
- 'url': video_url,
- 'ext': determine_ext(video_url),
- 'thumbnail': info['thumbnail_url'],
- 'description': info['description'],
- 'uploader': info['author_name'],
- }
+ return {
+ 'id': video_id,
+ 'title': info['title'],
+ 'url': video_url,
+ 'thumbnail': info['thumbnail_url'],
+ 'description': info['description'],
+ 'uploader': info['author_name'],
+ }
diff --git a/youtube_dl/extractor/myspace.py b/youtube_dl/extractor/myspace.py
index c16939f54..83414a232 100644
--- a/youtube_dl/extractor/myspace.py
+++ b/youtube_dl/extractor/myspace.py
@@ -1,12 +1,14 @@
+# encoding: utf-8
from __future__ import unicode_literals
import re
import json
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_str,
)
+from ..utils import ExtractorError
class MySpaceIE(InfoExtractor):
@@ -14,33 +16,58 @@ class MySpaceIE(InfoExtractor):
_TESTS = [
{
- 'url': 'https://myspace.com/coldplay/video/viva-la-vida/100008689',
+ 'url': 'https://myspace.com/fiveminutestothestage/video/little-big-town/109594919',
'info_dict': {
- 'id': '100008689',
+ 'id': '109594919',
'ext': 'flv',
- 'title': 'Viva La Vida',
- 'description': 'The official Viva La Vida video, directed by Hype Williams',
- 'uploader': 'Coldplay',
- 'uploader_id': 'coldplay',
+ 'title': 'Little Big Town',
+ 'description': 'This country quartet was all smiles while playing a sold out show at the Pacific Amphitheatre in Orange County, California.',
+ 'uploader': 'Five Minutes to the Stage',
+ 'uploader_id': 'fiveminutestothestage',
},
'params': {
# rtmp download
'skip_download': True,
},
},
- # song
+ # songs
{
- 'url': 'https://myspace.com/spiderbags/music/song/darkness-in-my-heart-39008454-27041242',
+ 'url': 'https://myspace.com/killsorrow/music/song/of-weakened-soul...-93388656-103880681',
'info_dict': {
- 'id': '39008454',
+ 'id': '93388656',
'ext': 'flv',
- 'title': 'Darkness In My Heart',
- 'uploader_id': 'spiderbags',
+ 'title': 'Of weakened soul...',
+ 'uploader': 'Killsorrow',
+ 'uploader_id': 'killsorrow',
},
'params': {
# rtmp download
'skip_download': True,
},
+ }, {
+ 'add_ie': ['Vevo'],
+ 'url': 'https://myspace.com/threedaysgrace/music/song/animal-i-have-become-28400208-28218041',
+ 'info_dict': {
+ 'id': 'USZM20600099',
+ 'ext': 'mp4',
+ 'title': 'Animal I Have Become',
+ 'uploader': 'Three Days Grace',
+ 'timestamp': int,
+ 'upload_date': '20060502',
+ },
+ 'skip': 'VEVO is only available in some countries',
+ }, {
+ 'add_ie': ['Youtube'],
+ 'url': 'https://myspace.com/starset2/music/song/first-light-95799905-106964426',
+ 'info_dict': {
+ 'id': 'ypWvQgnJrSU',
+ 'ext': 'mp4',
+ 'title': 'Starset - First Light',
+ 'description': 'md5:2d5db6c9d11d527683bcda818d332414',
+ 'uploader': 'Jacob Soren',
+ 'uploader_id': 'SorenPromotions',
+ 'upload_date': '20140725',
+ }
},
]
@@ -48,22 +75,47 @@ class MySpaceIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
+ player_url = self._search_regex(
+ r'playerSwf":"([^"?]*)', webpage, 'player URL')
if mobj.group('mediatype').startswith('music/song'):
# songs don't store any useful info in the 'context' variable
+ song_data = self._search_regex(
+ r'''<button.*data-song-id=(["\'])%s\1.*''' % video_id,
+ webpage, 'song_data', default=None, group=0)
+ if song_data is None:
+ # some songs in an album are not playable
+ self.report_warning(
+ '%s: No downloadable song on this page' % video_id)
+ return
+
def search_data(name):
- return self._search_regex(r'data-%s="(.*?)"' % name, webpage,
- name)
+ return self._search_regex(
+ r'''data-%s=([\'"])(?P<data>.*?)\1''' % name,
+ song_data, name, default='', group='data')
streamUrl = search_data('stream-url')
+ if not streamUrl:
+ vevo_id = search_data('vevo-id')
+ youtube_id = search_data('youtube-id')
+ if vevo_id:
+ self.to_screen('Vevo video detected: %s' % vevo_id)
+ return self.url_result('vevo:%s' % vevo_id, ie='Vevo')
+ elif youtube_id:
+ self.to_screen('Youtube video detected: %s' % youtube_id)
+ return self.url_result(youtube_id, ie='Youtube')
+ else:
+ raise ExtractorError(
+ 'Found song but don\'t know how to download it')
info = {
'id': video_id,
'title': self._og_search_title(webpage),
+ 'uploader': search_data('artist-name'),
'uploader_id': search_data('artist-username'),
'thumbnail': self._og_search_thumbnail(webpage),
}
else:
- context = json.loads(self._search_regex(r'context = ({.*?});', webpage,
- u'context'))
+ context = json.loads(self._search_regex(
+ r'context = ({.*?});', webpage, 'context'))
video = context['video']
streamUrl = video['streamUrl']
info = {
@@ -79,6 +131,50 @@ class MySpaceIE(InfoExtractor):
info.update({
'url': rtmp_url,
'play_path': play_path,
+ 'player_url': player_url,
'ext': 'flv',
})
return info
+
+
+class MySpaceAlbumIE(InfoExtractor):
+ IE_NAME = 'MySpace:album'
+ _VALID_URL = r'https?://myspace\.com/([^/]+)/music/album/(?P<title>.*-)(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'https://myspace.com/starset2/music/album/transmissions-19455773',
+ 'info_dict': {
+ 'title': 'Transmissions',
+ 'id': '19455773',
+ },
+ 'playlist_count': 14,
+ 'skip': 'this album is only available in some countries',
+ }, {
+ 'url': 'https://myspace.com/killsorrow/music/album/the-demo-18596029',
+ 'info_dict': {
+ 'title': 'The Demo',
+ 'id': '18596029',
+ },
+ 'playlist_count': 5,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ playlist_id = mobj.group('id')
+ display_id = mobj.group('title') + playlist_id
+ webpage = self._download_webpage(url, display_id)
+ tracks_paths = re.findall(r'"music:song" content="(.*?)"', webpage)
+ if not tracks_paths:
+ raise ExtractorError(
+ '%s: No songs found, try using proxy' % display_id,
+ expected=True)
+ entries = [
+ self.url_result(t_path, ie=MySpaceIE.ie_key())
+ for t_path in tracks_paths]
+ return {
+ '_type': 'playlist',
+ 'id': playlist_id,
+ 'display_id': display_id,
+ 'title': self._og_search_title(webpage),
+ 'entries': entries,
+ }
diff --git a/youtube_dl/extractor/myspass.py b/youtube_dl/extractor/myspass.py
index 4fa0575f8..5b9b9fbcd 100644
--- a/youtube_dl/extractor/myspass.py
+++ b/youtube_dl/extractor/myspass.py
@@ -2,9 +2,10 @@ from __future__ import unicode_literals
import os.path
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse_urlparse,
-
+)
+from ..utils import (
ExtractorError,
)
@@ -13,9 +14,10 @@ class MySpassIE(InfoExtractor):
_VALID_URL = r'http://www\.myspass\.de/.*'
_TEST = {
'url': 'http://www.myspass.de/myspass/shows/tvshows/absolute-mehrheit/Absolute-Mehrheit-vom-17022013-Die-Highlights-Teil-2--/11741/',
- 'file': '11741.mp4',
'md5': '0b49f4844a068f8b33f4b7c88405862b',
'info_dict': {
+ 'id': '11741',
+ 'ext': 'mp4',
"description": "Wer kann in die Fu\u00dfstapfen von Wolfgang Kubicki treten und die Mehrheit der Zuschauer hinter sich versammeln? Wird vielleicht sogar die Absolute Mehrheit geknackt und der Jackpot von 200.000 Euro mit nach Hause genommen?",
"title": "Absolute Mehrheit vom 17.02.2013 - Die Highlights, Teil 2",
},
diff --git a/youtube_dl/extractor/myvideo.py b/youtube_dl/extractor/myvideo.py
index ccb5959c4..5e754fcff 100644
--- a/youtube_dl/extractor/myvideo.py
+++ b/youtube_dl/extractor/myvideo.py
@@ -7,11 +7,12 @@ import re
import json
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_ord,
compat_urllib_parse,
compat_urllib_request,
-
+)
+from ..utils import (
ExtractorError,
)
@@ -32,7 +33,7 @@ class MyVideoIE(InfoExtractor):
# Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
# Released into the Public Domain by Tristan Fischer on 2013-05-19
# https://github.com/rg3/youtube-dl/pull/842
- def __rc4crypt(self,data, key):
+ def __rc4crypt(self, data, key):
x = 0
box = list(range(256))
for i in list(range(256)):
@@ -48,17 +49,17 @@ class MyVideoIE(InfoExtractor):
out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
return out
- def __md5(self,s):
+ def __md5(self, s):
return hashlib.md5(s).hexdigest().encode()
- def _real_extract(self,url):
+ def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
GK = (
- b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
- b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
- b'TnpsbA0KTVRkbU1tSTRNdz09'
+ b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
+ b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
+ b'TnpsbA0KTVRkbU1tSTRNdz09'
)
# Get video webpage
@@ -71,7 +72,7 @@ class MyVideoIE(InfoExtractor):
video_url = mobj.group(1) + '.flv'
video_title = self._html_search_regex('<title>([^<]+)</title>',
- webpage, 'title')
+ webpage, 'title')
return {
'id': video_id,
@@ -161,7 +162,7 @@ class MyVideoIE(InfoExtractor):
video_swfobj = compat_urllib_parse.unquote(video_swfobj)
video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
- webpage, 'title')
+ webpage, 'title')
return {
'id': video_id,
@@ -172,4 +173,3 @@ class MyVideoIE(InfoExtractor):
'play_path': video_playpath,
'player_url': video_swfobj,
}
-
diff --git a/youtube_dl/extractor/myvidster.py b/youtube_dl/extractor/myvidster.py
new file mode 100644
index 000000000..a94ab8358
--- /dev/null
+++ b/youtube_dl/extractor/myvidster.py
@@ -0,0 +1,29 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class MyVidsterIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?myvidster\.com/video/(?P<id>\d+)/'
+
+ _TEST = {
+ 'url': 'http://www.myvidster.com/video/32059805/Hot_chemistry_with_raw_love_making',
+ 'md5': '95296d0231c1363222c3441af62dc4ca',
+ 'info_dict': {
+ 'id': '3685814',
+ 'title': 'md5:7d8427d6d02c4fbcef50fe269980c749',
+ 'upload_date': '20141027',
+ 'uploader_id': 'utkualp',
+ 'ext': 'mp4',
+ 'age_limit': 18,
+ },
+ 'add_ie': ['XHamster'],
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ return self.url_result(self._html_search_regex(
+ r'rel="videolink" href="(?P<real_url>.*)">',
+ webpage, 'real video url'))
diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py
index c0231c197..c10405f04 100644
--- a/youtube_dl/extractor/naver.py
+++ b/youtube_dl/extractor/naver.py
@@ -4,9 +4,12 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
+)
+from ..utils import (
ExtractorError,
+ clean_html,
)
@@ -25,16 +28,21 @@ class NaverIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group(1)
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
+
m_id = re.search(r'var rmcPlayer = new nhn.rmcnmv.RMCVideoPlayer\("(.+?)", "(.+?)"',
- webpage)
+ webpage)
if m_id is None:
+ m_error = re.search(
+ r'(?s)<div class="nation_error">\s*(?:<!--.*?-->)?\s*<p class="[^"]+">(?P<msg>.+?)</p>\s*</div>',
+ webpage)
+ if m_error:
+ raise ExtractorError(clean_html(m_error.group('msg')), expected=True)
raise ExtractorError('couldn\'t extract vid and key')
vid = m_id.group(1)
key = m_id.group(2)
- query = compat_urllib_parse.urlencode({'vid': vid, 'inKey': key,})
+ query = compat_urllib_parse.urlencode({'vid': vid, 'inKey': key, })
query_urls = compat_urllib_parse.urlencode({
'masterVid': vid,
'protocol': 'p2p',
@@ -59,7 +67,7 @@ class NaverIE(InfoExtractor):
if domain.startswith('rtmp'):
f.update({
'ext': 'flv',
- 'rtmp_protocol': '1', # rtmpt
+ 'rtmp_protocol': '1', # rtmpt
})
formats.append(f)
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py
index 78e650b2d..862b706bf 100644
--- a/youtube_dl/extractor/nba.py
+++ b/youtube_dl/extractor/nba.py
@@ -1,7 +1,5 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..utils import (
remove_end,
@@ -10,8 +8,8 @@ from ..utils import (
class NBAIE(InfoExtractor):
- _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?:nba/)?video(?P<id>/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
- _TEST = {
+ _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?:nba/)?video(?P<id>/[^?]*?)/?(?:/index\.html)?(?:\?.*)?$'
+ _TESTS = [{
'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html',
'md5': 'c0edcfc37607344e2ff8f13c378c88a4',
'info_dict': {
@@ -21,12 +19,13 @@ class NBAIE(InfoExtractor):
'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.',
'duration': 181,
},
- }
+ }, {
+ 'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
video_url = 'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
@@ -37,8 +36,7 @@ class NBAIE(InfoExtractor):
description = self._og_search_description(webpage)
duration = parse_duration(
- self._html_search_meta('duration', webpage, 'duration', fatal=False))
-
+ self._html_search_meta('duration', webpage, 'duration'))
return {
'id': shortened_video_id,
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py
index e75ab7c39..f840f6532 100644
--- a/youtube_dl/extractor/nbc.py
+++ b/youtube_dl/extractor/nbc.py
@@ -4,32 +4,48 @@ import re
import json
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_str,
+ compat_HTTPError,
+)
+from ..utils import (
ExtractorError,
find_xpath_attr,
)
class NBCIE(InfoExtractor):
- _VALID_URL = r'http://www\.nbc\.com/[^/]+/video/[^/]+/(?P<id>n?\d+)'
-
- _TEST = {
- 'url': 'http://www.nbc.com/chicago-fire/video/i-am-a-firefighter/2734188',
- # md5 checksum is not stable
- 'info_dict': {
- 'id': 'bTmnLCvIbaaH',
- 'ext': 'flv',
- 'title': 'I Am a Firefighter',
- 'description': 'An emergency puts Dawson\'sf irefighter skills to the ultimate test in this four-part digital series.',
+ _VALID_URL = r'http://www\.nbc\.com/(?:[^/]+/)+(?P<id>n?\d+)'
+
+ _TESTS = [
+ {
+ 'url': 'http://www.nbc.com/chicago-fire/video/i-am-a-firefighter/2734188',
+ # md5 checksum is not stable
+ 'info_dict': {
+ 'id': 'bTmnLCvIbaaH',
+ 'ext': 'flv',
+ 'title': 'I Am a Firefighter',
+ 'description': 'An emergency puts Dawson\'sf irefighter skills to the ultimate test in this four-part digital series.',
+ },
+ },
+ {
+ 'url': 'http://www.nbc.com/the-tonight-show/episodes/176',
+ 'info_dict': {
+ 'id': 'XwU9KZkp98TH',
+ 'ext': 'flv',
+ 'title': 'Ricky Gervais, Steven Van Zandt, ILoveMakonnen',
+ 'description': 'A brand new episode of The Tonight Show welcomes Ricky Gervais, Steven Van Zandt and ILoveMakonnen.',
+ },
+ 'skip': 'Only works from US',
},
- }
+ ]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- theplatform_url = self._search_regex('class="video-player video-player-full" data-mpx-url="(.*?)"', webpage, 'theplatform url')
+ theplatform_url = self._search_regex(
+ '(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"',
+ webpage, 'theplatform url').replace('_no_endcard', '')
if theplatform_url.startswith('//'):
theplatform_url = 'http:' + theplatform_url
return self.url_result(theplatform_url)
@@ -57,12 +73,22 @@ class NBCNewsIE(InfoExtractor):
'md5': 'b2421750c9f260783721d898f4c42063',
'info_dict': {
'id': 'I1wpAI_zmhsQ',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'How Twitter Reacted To The Snowden Interview',
'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64',
},
'add_ie': ['ThePlatform'],
},
+ {
+ 'url': 'http://www.nbcnews.com/feature/dateline-full-episodes/full-episode-family-business-n285156',
+ 'md5': 'fdbf39ab73a72df5896b6234ff98518a',
+ 'info_dict': {
+ 'id': 'Wjf9EDR3A_60',
+ 'ext': 'mp4',
+ 'title': 'FULL EPISODE: Family Business',
+ 'description': 'md5:757988edbaae9d7be1d585eb5d55cc04',
+ },
+ },
]
def _real_extract(self, url):
@@ -97,11 +123,22 @@ class NBCNewsIE(InfoExtractor):
]
for base_url in base_urls:
+ if not base_url:
+ continue
playlist_url = base_url + '?form=MPXNBCNewsAPI'
- all_videos = self._download_json(playlist_url, title)['videos']
try:
- info = next(v for v in all_videos if v['mpxId'] == mpxid)
+ all_videos = self._download_json(playlist_url, title)
+ except ExtractorError as ee:
+ if isinstance(ee.cause, compat_HTTPError):
+ continue
+ raise
+
+ if not all_videos or 'videos' not in all_videos:
+ continue
+
+ try:
+ info = next(v for v in all_videos['videos'] if v['mpxId'] == mpxid)
break
except StopIteration:
continue
diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py
index add4b3e5d..f49c66690 100644
--- a/youtube_dl/extractor/ndr.py
+++ b/youtube_dl/extractor/ndr.py
@@ -67,7 +67,7 @@ class NDRIE(InfoExtractor):
thumbnail = None
- video_url = re.search(r'''3: \{src:'(?P<video>.+?)\.hi\.mp4', type:"video/mp4"},''', page)
+ video_url = re.search(r'''3: \{src:'(?P<video>.+?)\.(lo|hi|hq)\.mp4', type:"video/mp4"},''', page)
if video_url:
thumbnails = re.findall(r'''\d+: \{src: "([^"]+)"(?: \|\| '[^']+')?, quality: '([^']+)'}''', page)
if thumbnails:
@@ -91,4 +91,4 @@ class NDRIE(InfoExtractor):
'thumbnail': thumbnail,
'duration': duration,
'formats': formats,
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/ndtv.py b/youtube_dl/extractor/ndtv.py
index 95e7d63aa..2a1ca80df 100644
--- a/youtube_dl/extractor/ndtv.py
+++ b/youtube_dl/extractor/ndtv.py
@@ -27,9 +27,7 @@ class NDTVIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
filename = self._search_regex(
diff --git a/youtube_dl/extractor/nerdcubed.py b/youtube_dl/extractor/nerdcubed.py
new file mode 100644
index 000000000..efc903afa
--- /dev/null
+++ b/youtube_dl/extractor/nerdcubed.py
@@ -0,0 +1,35 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import datetime
+
+from .common import InfoExtractor
+
+
+class NerdCubedFeedIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?nerdcubed\.co\.uk/feed\.json'
+ _TEST = {
+ 'url': 'http://www.nerdcubed.co.uk/feed.json',
+ 'info_dict': {
+ 'title': 'nerdcubed.co.uk feed',
+ },
+ 'playlist_mincount': 1300,
+ }
+
+ def _real_extract(self, url):
+ feed = self._download_json(url, url, "Downloading NerdCubed JSON feed")
+
+ entries = [{
+ '_type': 'url',
+ 'title': feed_entry['title'],
+ 'uploader': feed_entry['source']['name'] if feed_entry['source'] else None,
+ 'upload_date': datetime.datetime.strptime(feed_entry['date'], '%Y-%m-%d').strftime('%Y%m%d'),
+ 'url': "http://www.youtube.com/watch?v=" + feed_entry['youtube_id'],
+ } for feed_entry in feed]
+
+ return {
+ '_type': 'playlist',
+ 'title': 'nerdcubed.co.uk feed',
+ 'id': 'nerdcubed-feed',
+ 'entries': entries,
+ }
diff --git a/youtube_dl/extractor/netzkino.py b/youtube_dl/extractor/netzkino.py
new file mode 100644
index 000000000..93567d1e3
--- /dev/null
+++ b/youtube_dl/extractor/netzkino.py
@@ -0,0 +1,86 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ int_or_none,
+ js_to_json,
+ parse_iso8601,
+)
+
+
+class NetzkinoIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?netzkino\.de/\#!/(?P<category>[^/]+)/(?P<id>[^/]+)'
+
+ _TEST = {
+ 'url': 'http://www.netzkino.de/#!/scifikino/rakete-zum-mond',
+ 'md5': '92a3f8b76f8d7220acce5377ea5d4873',
+ 'info_dict': {
+ 'id': 'rakete-zum-mond',
+ 'ext': 'mp4',
+ 'title': 'Rakete zum Mond (Endstation Mond, Destination Moon)',
+ 'comments': 'mincount:3',
+ 'description': 'md5:1eddeacc7e62d5a25a2d1a7290c64a28',
+ 'upload_date': '20120813',
+ 'thumbnail': 're:https?://.*\.jpg$',
+ 'timestamp': 1344858571,
+ 'age_limit': 12,
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ category_id = mobj.group('category')
+ video_id = mobj.group('id')
+
+ api_url = 'http://api.netzkino.de.simplecache.net/capi-2.0a/categories/%s.json?d=www' % category_id
+ api_info = self._download_json(api_url, video_id)
+ info = next(
+ p for p in api_info['posts'] if p['slug'] == video_id)
+ custom_fields = info['custom_fields']
+
+ production_js = self._download_webpage(
+ 'http://www.netzkino.de/beta/dist/production.min.js', video_id,
+ note='Downloading player code')
+ avo_js = self._search_regex(
+ r'window\.avoCore\s*=.*?urlTemplate:\s*(\{.*?"\})',
+ production_js, 'URL templates')
+ templates = self._parse_json(
+ avo_js, video_id, transform_source=js_to_json)
+
+ suffix = {
+ 'hds': '.mp4/manifest.f4m',
+ 'hls': '.mp4/master.m3u8',
+ 'pmd': '.mp4',
+ }
+ film_fn = custom_fields['Streaming'][0]
+ formats = [{
+ 'format_id': key,
+ 'ext': 'mp4',
+ 'url': tpl.replace('{}', film_fn) + suffix[key],
+ } for key, tpl in templates.items()]
+ self._sort_formats(formats)
+
+ comments = [{
+ 'timestamp': parse_iso8601(c.get('date'), delimiter=' '),
+ 'id': c['id'],
+ 'author': c['name'],
+ 'html': c['content'],
+ 'parent': 'root' if c.get('parent', 0) == 0 else c['parent'],
+ } for c in info.get('comments', [])]
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'comments': comments,
+ 'title': info['title'],
+ 'age_limit': int_or_none(custom_fields.get('FSK')[0]),
+ 'timestamp': parse_iso8601(info.get('date'), delimiter=' '),
+ 'description': clean_html(info.get('content')),
+ 'thumbnail': info.get('thumbnail'),
+ 'playlist_title': api_info.get('title'),
+ 'playlist_id': category_id,
+ }
diff --git a/youtube_dl/extractor/newgrounds.py b/youtube_dl/extractor/newgrounds.py
index 2e72e8915..cd117b04e 100644
--- a/youtube_dl/extractor/newgrounds.py
+++ b/youtube_dl/extractor/newgrounds.py
@@ -23,12 +23,12 @@ class NewgroundsIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
music_id = mobj.group('id')
webpage = self._download_webpage(url, music_id)
-
+
title = self._html_search_regex(
r',"name":"([^"]+)",', webpage, 'music title')
uploader = self._html_search_regex(
r',"artist":"([^"]+)",', webpage, 'music uploader')
-
+
music_url_json_string = self._html_search_regex(
r'({"url":"[^"]+"),', webpage, 'music url') + '}'
music_url_json = json.loads(music_url_json_string)
diff --git a/youtube_dl/extractor/newstube.py b/youtube_dl/extractor/newstube.py
index 551bd4d7a..85fcad06b 100644
--- a/youtube_dl/extractor/newstube.py
+++ b/youtube_dl/extractor/newstube.py
@@ -89,4 +89,4 @@ class NewstubeIE(InfoExtractor):
'thumbnail': thumbnail,
'duration': duration,
'formats': formats,
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/nfb.py b/youtube_dl/extractor/nfb.py
index ba7b77a46..ea077254b 100644
--- a/youtube_dl/extractor/nfb.py
+++ b/youtube_dl/extractor/nfb.py
@@ -1,9 +1,7 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_request,
compat_urllib_parse,
)
@@ -12,7 +10,7 @@ from ..utils import (
class NFBIE(InfoExtractor):
IE_NAME = 'nfb'
IE_DESC = 'National Film Board of Canada'
- _VALID_URL = r'https?://(?:www\.)?(nfb|onf)\.ca/film/(?P<id>[\da-z_-]+)'
+ _VALID_URL = r'https?://(?:www\.)?(?:nfb|onf)\.ca/film/(?P<id>[\da-z_-]+)'
_TEST = {
'url': 'https://www.nfb.ca/film/qallunaat_why_white_people_are_funny',
@@ -32,18 +30,18 @@ class NFBIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
- page = self._download_webpage('https://www.nfb.ca/film/%s' % video_id, video_id, 'Downloading film page')
+ video_id = self._match_id(url)
+ page = self._download_webpage(
+ 'https://www.nfb.ca/film/%s' % video_id, video_id,
+ 'Downloading film page')
uploader_id = self._html_search_regex(r'<a class="director-link" href="/explore-all-directors/([^/]+)/"',
- page, 'director id', fatal=False)
+ page, 'director id', fatal=False)
uploader = self._html_search_regex(r'<em class="director-name" itemprop="name">([^<]+)</em>',
- page, 'director name', fatal=False)
+ page, 'director name', fatal=False)
request = compat_urllib_request.Request('https://www.nfb.ca/film/%s/player_config' % video_id,
- compat_urllib_parse.urlencode({'getConfig': 'true'}).encode('ascii'))
+ compat_urllib_parse.urlencode({'getConfig': 'true'}).encode('ascii'))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
request.add_header('X-NFB-Referer', 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf')
@@ -93,4 +91,4 @@ class NFBIE(InfoExtractor):
'uploader': uploader,
'uploader_id': uploader_id,
'formats': formats,
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/nfl.py b/youtube_dl/extractor/nfl.py
index cc7c921c3..606e2294e 100644
--- a/youtube_dl/extractor/nfl.py
+++ b/youtube_dl/extractor/nfl.py
@@ -4,9 +4,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse_urlparse,
+)
from ..utils import (
ExtractorError,
- compat_urllib_parse_urlparse,
int_or_none,
remove_end,
)
diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py
index 072d9cf8e..d3a4fc513 100644
--- a/youtube_dl/extractor/nhl.py
+++ b/youtube_dl/extractor/nhl.py
@@ -2,12 +2,15 @@ from __future__ import unicode_literals
import re
import json
+import os
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urlparse,
compat_urllib_parse,
- determine_ext,
+ compat_urllib_parse_urlparse
+)
+from ..utils import (
unified_strdate,
)
@@ -22,21 +25,26 @@ class NHLBaseInfoExtractor(InfoExtractor):
self.report_extraction(video_id)
initial_video_url = info['publishPoint']
- data = compat_urllib_parse.urlencode({
- 'type': 'fvod',
- 'path': initial_video_url.replace('.mp4', '_sd.mp4'),
- })
- path_url = 'http://video.nhl.com/videocenter/servlets/encryptvideopath?' + data
- path_doc = self._download_xml(
- path_url, video_id, 'Downloading final video url')
- video_url = path_doc.find('path').text
+ if info['formats'] == '1':
+ parsed_url = compat_urllib_parse_urlparse(initial_video_url)
+ filename, ext = os.path.splitext(parsed_url.path)
+ path = '%s_sd%s' % (filename, ext)
+ data = compat_urllib_parse.urlencode({
+ 'type': 'fvod',
+ 'path': compat_urlparse.urlunparse(parsed_url[:2] + (path,) + parsed_url[3:])
+ })
+ path_url = 'http://video.nhl.com/videocenter/servlets/encryptvideopath?' + data
+ path_doc = self._download_xml(
+ path_url, video_id, 'Downloading final video url')
+ video_url = path_doc.find('path').text
+ else:
+ video_url = initial_video_url
join = compat_urlparse.urljoin
return {
'id': video_id,
'title': info['name'],
'url': video_url,
- 'ext': determine_ext(video_url),
'description': info['description'],
'duration': int(info['duration']),
'thumbnail': join(join(video_url, '/u/'), info['bigImage']),
@@ -46,10 +54,11 @@ class NHLBaseInfoExtractor(InfoExtractor):
class NHLIE(NHLBaseInfoExtractor):
IE_NAME = 'nhl.com'
- _VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/console(?:\?(?:.*?[?&])?)id=(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/console(?:\?(?:.*?[?&])?)id=(?P<id>[-0-9a-zA-Z]+)'
_TESTS = [{
'url': 'http://video.canucks.nhl.com/videocenter/console?catid=6?id=453614',
+ 'md5': 'db704a4ea09e8d3988c85e36cc892d09',
'info_dict': {
'id': '453614',
'ext': 'mp4',
@@ -59,6 +68,28 @@ class NHLIE(NHLBaseInfoExtractor):
'upload_date': '20131006',
},
}, {
+ 'url': 'http://video.nhl.com/videocenter/console?id=2014020024-628-h',
+ 'md5': 'd22e82bc592f52d37d24b03531ee9696',
+ 'info_dict': {
+ 'id': '2014020024-628-h',
+ 'ext': 'mp4',
+ 'title': 'Alex Galchenyuk Goal on Ray Emery (14:40/3rd)',
+ 'description': 'Home broadcast - Montreal Canadiens at Philadelphia Flyers - October 11, 2014',
+ 'duration': 0,
+ 'upload_date': '20141011',
+ },
+ }, {
+ 'url': 'http://video.mapleleafs.nhl.com/videocenter/console?id=58665&catid=802',
+ 'md5': 'c78fc64ea01777e426cfc202b746c825',
+ 'info_dict': {
+ 'id': '58665',
+ 'ext': 'flv',
+ 'title': 'Classic Game In Six - April 22, 1979',
+ 'description': 'It was the last playoff game for the Leafs in the decade, and the last time the Leafs and Habs played in the playoffs. Great game, not a great ending.',
+ 'duration': 400,
+ 'upload_date': '20100129'
+ },
+ }, {
'url': 'http://video.flames.nhl.com/videocenter/console?id=630616',
'only_matching': True,
}]
@@ -75,7 +106,7 @@ class NHLIE(NHLBaseInfoExtractor):
class NHLVideocenterIE(NHLBaseInfoExtractor):
IE_NAME = 'nhl.com:videocenter'
IE_DESC = 'NHL videocenter category'
- _VALID_URL = r'https?://video\.(?P<team>[^.]*)\.nhl\.com/videocenter/(console\?.*?catid=(?P<catid>[0-9]+)(?![&?]id=).*?)?$'
+ _VALID_URL = r'https?://video\.(?P<team>[^.]*)\.nhl\.com/videocenter/(console\?[^(id=)]*catid=(?P<catid>[0-9]+)(?![&?]id=).*?)?$'
_TEST = {
'url': 'http://video.canucks.nhl.com/videocenter/console?catid=999',
'info_dict': {
@@ -109,10 +140,10 @@ class NHLVideocenterIE(NHLBaseInfoExtractor):
response = self._download_webpage(request_url, playlist_title)
response = self._fix_json(response)
if not response.strip():
- self._downloader.report_warning(u'Got an empty reponse, trying '
+ self._downloader.report_warning('Got an empty reponse, trying '
'adding the "newvideos" parameter')
response = self._download_webpage(request_url + '&newvideos=true',
- playlist_title)
+ playlist_title)
response = self._fix_json(response)
videos = json.loads(response)
diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py
index 7b85589b7..4c1890416 100644
--- a/youtube_dl/extractor/niconico.py
+++ b/youtube_dl/extractor/niconico.py
@@ -2,15 +2,19 @@
from __future__ import unicode_literals
import re
+import json
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
compat_urlparse,
- unified_strdate,
- parse_duration,
+)
+from ..utils import (
+ ExtractorError,
int_or_none,
+ parse_duration,
+ unified_strdate,
)
@@ -107,6 +111,9 @@ class NiconicoIE(InfoExtractor):
flv_info_request, video_id,
note='Downloading flv info', errnote='Unable to download flv info')
+ if 'deleted=' in flv_info_webpage:
+ raise ExtractorError('The video has been deleted.',
+ expected=True)
video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0]
# Start extracting information
@@ -146,3 +153,37 @@ class NiconicoIE(InfoExtractor):
'duration': duration,
'webpage_url': webpage_url,
}
+
+
+class NiconicoPlaylistIE(InfoExtractor):
+ _VALID_URL = r'https?://www\.nicovideo\.jp/mylist/(?P<id>\d+)'
+
+ _TEST = {
+ 'url': 'http://www.nicovideo.jp/mylist/27411728',
+ 'info_dict': {
+ 'id': '27411728',
+ 'title': 'AKB48のオールナイトニッポン',
+ },
+ 'playlist_mincount': 225,
+ }
+
+ def _real_extract(self, url):
+ list_id = self._match_id(url)
+ webpage = self._download_webpage(url, list_id)
+
+ entries_json = self._search_regex(r'Mylist\.preload\(\d+, (\[.*\])\);',
+ webpage, 'entries')
+ entries = json.loads(entries_json)
+ entries = [{
+ '_type': 'url',
+ 'ie_key': NiconicoIE.ie_key(),
+ 'url': ('http://www.nicovideo.jp/watch/%s' %
+ entry['item_data']['video_id']),
+ } for entry in entries]
+
+ return {
+ '_type': 'playlist',
+ 'title': self._search_regex(r'\s+name: "(.*?)"', webpage, 'title'),
+ 'id': list_id,
+ 'entries': entries,
+ }
diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py
index 33daa0dec..7f842b5c2 100644
--- a/youtube_dl/extractor/ninegag.py
+++ b/youtube_dl/extractor/ninegag.py
@@ -23,12 +23,14 @@ class NineGagIE(InfoExtractor):
"ext": "mp4",
"description": "This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)",
"title": "\"People Are Awesome 2013\" Is Absolutely Awesome",
+ 'uploader_id': 'UCdEH6EjDKwtTe-sO2f0_1XA',
+ 'uploader': 'CompilationChannel',
+ 'upload_date': '20131110',
"view_count": int,
"thumbnail": "re:^https?://",
},
'add_ie': ['Youtube']
- },
- {
+ }, {
'url': 'http://9gag.tv/p/KklwM/alternate-banned-opening-scene-of-gravity?ref=fsidebar',
'info_dict': {
'id': 'KklwM',
@@ -36,6 +38,9 @@ class NineGagIE(InfoExtractor):
'display_id': 'alternate-banned-opening-scene-of-gravity',
"description": "While Gravity was a pretty awesome movie already, YouTuber Krishna Shenoi came up with a way to improve upon it, introducing a much better solution to Sandra Bullock's seemingly endless tumble in space. The ending is priceless.",
'title': "Banned Opening Scene Of \"Gravity\" That Changes The Whole Movie",
+ 'uploader': 'Krishna Shenoi',
+ 'upload_date': '20140401',
+ 'uploader_id': 'krishnashenoi93',
},
}]
diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py
index 7f1bc6377..251e6da07 100644
--- a/youtube_dl/extractor/noco.py
+++ b/youtube_dl/extractor/noco.py
@@ -6,13 +6,15 @@ import time
import hashlib
from .common import InfoExtractor
-from ..utils import (
- compat_urllib_request,
+from ..compat import (
+ compat_str,
compat_urllib_parse,
- ExtractorError,
+ compat_urllib_request,
+)
+from ..utils import (
clean_html,
+ ExtractorError,
unified_strdate,
- compat_str,
)
@@ -20,6 +22,7 @@ class NocoIE(InfoExtractor):
_VALID_URL = r'http://(?:(?:www\.)?noco\.tv/emission/|player\.noco\.tv/\?idvideo=)(?P<id>\d+)'
_LOGIN_URL = 'http://noco.tv/do.php'
_API_URL_TEMPLATE = 'https://api.noco.tv/1.1/%s?ts=%s&tk=%s'
+ _SUB_LANG_TEMPLATE = '&sub_lang=%s'
_NETRC_MACHINE = 'noco'
_TEST = {
@@ -60,10 +63,12 @@ class NocoIE(InfoExtractor):
if 'erreur' in login:
raise ExtractorError('Unable to login: %s' % clean_html(login['erreur']), expected=True)
- def _call_api(self, path, video_id, note):
+ def _call_api(self, path, video_id, note, sub_lang=None):
ts = compat_str(int(time.time() * 1000))
tk = hashlib.md5((hashlib.md5(ts.encode('ascii')).hexdigest() + '#8S?uCraTedap6a').encode('ascii')).hexdigest()
url = self._API_URL_TEMPLATE % (path, ts, tk)
+ if sub_lang:
+ url += self._SUB_LANG_TEMPLATE % sub_lang
resp = self._download_json(url, video_id, note)
@@ -91,31 +96,34 @@ class NocoIE(InfoExtractor):
formats = []
- for format_id, fmt in medias['fr']['video_list']['none']['quality_list'].items():
-
- video = self._call_api(
- 'shows/%s/video/%s/fr' % (video_id, format_id.lower()),
- video_id, 'Downloading %s video JSON' % format_id)
-
- file_url = video['file']
- if not file_url:
- continue
-
- if file_url in ['forbidden', 'not found']:
- popmessage = video['popmessage']
- self._raise_error(popmessage['title'], popmessage['message'])
-
- formats.append({
- 'url': file_url,
- 'format_id': format_id,
- 'width': fmt['res_width'],
- 'height': fmt['res_lines'],
- 'abr': fmt['audiobitrate'],
- 'vbr': fmt['videobitrate'],
- 'filesize': fmt['filesize'],
- 'format_note': qualities[format_id]['quality_name'],
- 'preference': qualities[format_id]['priority'],
- })
+ for lang, lang_dict in medias['fr']['video_list'].items():
+ for format_id, fmt in lang_dict['quality_list'].items():
+ format_id_extended = '%s-%s' % (lang, format_id) if lang != 'none' else format_id
+
+ video = self._call_api(
+ 'shows/%s/video/%s/fr' % (video_id, format_id.lower()),
+ video_id, 'Downloading %s video JSON' % format_id_extended,
+ lang if lang != 'none' else None)
+
+ file_url = video['file']
+ if not file_url:
+ continue
+
+ if file_url in ['forbidden', 'not found']:
+ popmessage = video['popmessage']
+ self._raise_error(popmessage['title'], popmessage['message'])
+
+ formats.append({
+ 'url': file_url,
+ 'format_id': format_id_extended,
+ 'width': fmt['res_width'],
+ 'height': fmt['res_lines'],
+ 'abr': fmt['audiobitrate'],
+ 'vbr': fmt['videobitrate'],
+ 'filesize': fmt['filesize'],
+ 'format_note': qualities[format_id]['quality_name'],
+ 'preference': qualities[format_id]['priority'],
+ })
self._sort_formats(formats)
@@ -163,4 +171,4 @@ class NocoIE(InfoExtractor):
'uploader_id': uploader_id,
'duration': duration,
'formats': formats,
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/normalboots.py b/youtube_dl/extractor/normalboots.py
index 25e71a56e..c13ff0d65 100644
--- a/youtube_dl/extractor/normalboots.py
+++ b/youtube_dl/extractor/normalboots.py
@@ -22,7 +22,11 @@ class NormalbootsIE(InfoExtractor):
'description': 'Jon is late for Christmas. Typical. Thanks to: Paul Ritchey for Co-Writing/Filming: http://www.youtube.com/user/ContinueShow Michael Azzi for Christmas Intro Animation: http://michafrar.tumblr.com/ Jerrod Waters for Christmas Intro Music: http://www.youtube.com/user/xXJerryTerryXx Casey Ormond for ‘Tense Battle Theme’:\xa0http://www.youtube.com/Kiamet/',
'uploader': 'JonTron',
'upload_date': '20140125',
- }
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
}
def _real_extract(self, url):
@@ -31,9 +35,9 @@ class NormalbootsIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
video_uploader = self._html_search_regex(r'Posted\sby\s<a\shref="[A-Za-z0-9/]*">(?P<uploader>[A-Za-z]*)\s</a>',
- webpage, 'uploader')
+ webpage, 'uploader')
raw_upload_date = self._html_search_regex('<span style="text-transform:uppercase; font-size:inherit;">[A-Za-z]+, (?P<date>.*)</span>',
- webpage, 'date')
+ webpage, 'date')
video_upload_date = unified_strdate(raw_upload_date)
player_url = self._html_search_regex(r'<iframe\swidth="[0-9]+"\sheight="[0-9]+"\ssrc="(?P<url>[\S]+)"', webpage, 'url')
diff --git a/youtube_dl/extractor/nosvideo.py b/youtube_dl/extractor/nosvideo.py
index f3be8f552..f5ef856db 100644
--- a/youtube_dl/extractor/nosvideo.py
+++ b/youtube_dl/extractor/nosvideo.py
@@ -4,9 +4,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_request,
+)
from ..utils import (
ExtractorError,
- compat_urllib_request,
urlencode_postdata,
xpath_text,
xpath_with_ns,
@@ -32,8 +34,7 @@ class NosVideoIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
fields = {
'id': video_id,
diff --git a/youtube_dl/extractor/novamov.py b/youtube_dl/extractor/novamov.py
index 2e7ab1e4f..04d779890 100644
--- a/youtube_dl/extractor/novamov.py
+++ b/youtube_dl/extractor/novamov.py
@@ -3,9 +3,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import (
+ compat_urlparse,
+)
from ..utils import (
ExtractorError,
- compat_urlparse
)
@@ -66,4 +68,4 @@ class NovaMovIE(InfoExtractor):
'url': video_url,
'title': title,
'description': description
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/nowvideo.py b/youtube_dl/extractor/nowvideo.py
index bfba18418..dec09cdfe 100644
--- a/youtube_dl/extractor/nowvideo.py
+++ b/youtube_dl/extractor/nowvideo.py
@@ -7,7 +7,7 @@ class NowVideoIE(NovaMovIE):
IE_NAME = 'nowvideo'
IE_DESC = 'NowVideo'
- _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'nowvideo\.(?:ch|sx|eu|at|ag|co)'}
+ _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'nowvideo\.(?:ch|sx|eu|at|ag|co|li)'}
_HOST = 'www.nowvideo.ch'
@@ -25,4 +25,4 @@ class NowVideoIE(NovaMovIE):
'title': 'youtubedl test video _BaW_jenozKc.mp4',
'description': 'Description',
}
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py
index f36d446d2..175b14583 100644
--- a/youtube_dl/extractor/npo.py
+++ b/youtube_dl/extractor/npo.py
@@ -1,17 +1,26 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..utils import (
- unified_strdate,
+ fix_xml_ampersands,
parse_duration,
qualities,
+ strip_jsonp,
+ unified_strdate,
url_basename,
)
-class NPOIE(InfoExtractor):
+class NPOBaseIE(InfoExtractor):
+ def _get_token(self, video_id):
+ token_page = self._download_webpage(
+ 'http://ida.omroep.nl/npoplayer/i.js',
+ video_id, note='Downloading token')
+ return self._search_regex(
+ r'npoplayer\.token = "(.+?)"', token_page, 'token')
+
+
+class NPOIE(NPOBaseIE):
IE_NAME = 'npo.nl'
_VALID_URL = r'https?://www\.npo\.nl/[^/]+/[^/]+/(?P<id>[^/?]+)'
@@ -50,12 +59,35 @@ class NPOIE(InfoExtractor):
'upload_date': '20130225',
'duration': 3000,
},
- }
+ },
+ {
+ 'url': 'http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706',
+ 'info_dict': {
+ 'id': 'WO_VPRO_043706',
+ 'ext': 'wmv',
+ 'title': 'De nieuwe mens - Deel 1',
+ 'description': 'md5:518ae51ba1293ffb80d8d8ce90b74e4b',
+ 'duration': 4680,
+ },
+ 'params': {
+ # mplayer mms download
+ 'skip_download': True,
+ }
+ },
+ # non asf in streams
+ {
+ 'url': 'http://www.npo.nl/hoe-gaat-europa-verder-na-parijs/10-01-2015/WO_NOS_762771',
+ 'md5': 'b3da13de374cbe2d5332a7e910bef97f',
+ 'info_dict': {
+ 'id': 'WO_NOS_762771',
+ 'ext': 'mp4',
+ 'title': 'Hoe gaat Europa verder na Parijs?',
+ },
+ },
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
return self._get_info(video_id)
def _get_info(self, video_id):
@@ -63,41 +95,70 @@ class NPOIE(InfoExtractor):
'http://e.omroep.nl/metadata/aflevering/%s' % video_id,
video_id,
# We have to remove the javascript callback
- transform_source=lambda j: re.sub(r'parseMetadata\((.*?)\);\n//.*$', r'\1', j)
- )
- token_page = self._download_webpage(
- 'http://ida.omroep.nl/npoplayer/i.js',
- video_id,
- note='Downloading token'
+ transform_source=strip_jsonp,
)
- token = self._search_regex(r'npoplayer\.token = "(.+?)"', token_page, 'token')
+
+ token = self._get_token(video_id)
formats = []
- quality = qualities(['adaptive', 'wmv_sb', 'h264_sb', 'wmv_bb', 'h264_bb', 'wvc1_std', 'h264_std'])
- for format_id in metadata['pubopties']:
- format_info = self._download_json(
- 'http://ida.omroep.nl/odi/?prid=%s&puboptions=%s&adaptive=yes&token=%s' % (video_id, format_id, token),
- video_id, 'Downloading %s JSON' % format_id)
- if format_info.get('error_code', 0) or format_info.get('errorcode', 0):
- continue
- streams = format_info.get('streams')
- if streams:
- video_info = self._download_json(
- streams[0] + '&type=json',
- video_id, 'Downloading %s stream JSON' % format_id)
- else:
- video_info = format_info
- video_url = video_info.get('url')
- if not video_url:
- continue
- if format_id == 'adaptive':
- formats.extend(self._extract_m3u8_formats(video_url, video_id))
- else:
+
+ pubopties = metadata.get('pubopties')
+ if pubopties:
+ quality = qualities(['adaptive', 'wmv_sb', 'h264_sb', 'wmv_bb', 'h264_bb', 'wvc1_std', 'h264_std'])
+ for format_id in pubopties:
+ format_info = self._download_json(
+ 'http://ida.omroep.nl/odi/?prid=%s&puboptions=%s&adaptive=yes&token=%s'
+ % (video_id, format_id, token),
+ video_id, 'Downloading %s JSON' % format_id)
+ if format_info.get('error_code', 0) or format_info.get('errorcode', 0):
+ continue
+ streams = format_info.get('streams')
+ if streams:
+ video_info = self._download_json(
+ streams[0] + '&type=json',
+ video_id, 'Downloading %s stream JSON' % format_id)
+ else:
+ video_info = format_info
+ video_url = video_info.get('url')
+ if not video_url:
+ continue
+ if format_id == 'adaptive':
+ formats.extend(self._extract_m3u8_formats(video_url, video_id))
+ else:
+ formats.append({
+ 'url': video_url,
+ 'format_id': format_id,
+ 'quality': quality(format_id),
+ })
+
+ streams = metadata.get('streams')
+ if streams:
+ for i, stream in enumerate(streams):
+ stream_url = stream.get('url')
+ if not stream_url:
+ continue
+ if '.asf' not in stream_url:
+ formats.append({
+ 'url': stream_url,
+ 'quality': stream.get('kwaliteit'),
+ })
+ continue
+ asx = self._download_xml(
+ stream_url, video_id,
+ 'Downloading stream %d ASX playlist' % i,
+ transform_source=fix_xml_ampersands)
+ ref = asx.find('./ENTRY/Ref')
+ if ref is None:
+ continue
+ video_url = ref.get('href')
+ if not video_url:
+ continue
formats.append({
'url': video_url,
- 'format_id': format_id,
- 'quality': quality(format_id),
+ 'ext': stream.get('formaat', 'asf'),
+ 'quality': stream.get('kwaliteit'),
})
+
self._sort_formats(formats)
return {
@@ -111,6 +172,83 @@ class NPOIE(InfoExtractor):
}
+class NPOLiveIE(NPOBaseIE):
+ IE_NAME = 'npo.nl:live'
+ _VALID_URL = r'https?://www\.npo\.nl/live/(?P<id>.+)'
+
+ _TEST = {
+ 'url': 'http://www.npo.nl/live/npo-1',
+ 'info_dict': {
+ 'id': 'LI_NEDERLAND1_136692',
+ 'display_id': 'npo-1',
+ 'ext': 'mp4',
+ 'title': 're:^Nederland 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'description': 'Livestream',
+ 'is_live': True,
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ }
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+
+ live_id = self._search_regex(
+ r'data-prid="([^"]+)"', webpage, 'live id')
+
+ metadata = self._download_json(
+ 'http://e.omroep.nl/metadata/%s' % live_id,
+ display_id, transform_source=strip_jsonp)
+
+ token = self._get_token(display_id)
+
+ formats = []
+
+ streams = metadata.get('streams')
+ if streams:
+ for stream in streams:
+ stream_type = stream.get('type').lower()
+ if stream_type == 'ss':
+ continue
+ stream_info = self._download_json(
+ 'http://ida.omroep.nl/aapi/?stream=%s&token=%s&type=jsonp'
+ % (stream.get('url'), token),
+ display_id, 'Downloading %s JSON' % stream_type)
+ if stream_info.get('error_code', 0) or stream_info.get('errorcode', 0):
+ continue
+ stream_url = self._download_json(
+ stream_info['stream'], display_id,
+ 'Downloading %s URL' % stream_type,
+ transform_source=strip_jsonp)
+ if stream_type == 'hds':
+ f4m_formats = self._extract_f4m_formats(stream_url, display_id)
+ # f4m downloader downloads only piece of live stream
+ for f4m_format in f4m_formats:
+ f4m_format['preference'] = -1
+ formats.extend(f4m_formats)
+ elif stream_type == 'hls':
+ formats.extend(self._extract_m3u8_formats(stream_url, display_id, 'mp4'))
+ else:
+ formats.append({
+ 'url': stream_url,
+ })
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': live_id,
+ 'display_id': display_id,
+ 'title': self._live_title(metadata['titel']),
+ 'description': metadata['info'],
+ 'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'],
+ 'formats': formats,
+ 'is_live': True,
+ }
+
+
class TegenlichtVproIE(NPOIE):
IE_NAME = 'tegenlicht.vpro.nl'
_VALID_URL = r'https?://tegenlicht\.vpro\.nl/afleveringen/.*?'
diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py
index 96f0ae1eb..f6de26022 100644
--- a/youtube_dl/extractor/nrk.py
+++ b/youtube_dl/extractor/nrk.py
@@ -7,8 +7,10 @@ from .common import InfoExtractor
from ..utils import (
ExtractorError,
float_or_none,
+ parse_duration,
unified_strdate,
)
+from .subtitles import SubtitlesInfoExtractor
class NRKIE(InfoExtractor):
@@ -71,13 +73,13 @@ class NRKIE(InfoExtractor):
}
-class NRKTVIE(InfoExtractor):
- _VALID_URL = r'http://tv\.nrk(?:super)?\.no/(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})'
+class NRKTVIE(SubtitlesInfoExtractor):
+ _VALID_URL = r'(?P<baseurl>http://tv\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?'
_TESTS = [
{
'url': 'http://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',
- 'md5': '7b96112fbae1faf09a6f9ae1aff6cb84',
+ 'md5': 'adf2c5454fa2bf032f47a9f8fb351342',
'info_dict': {
'id': 'MUHH48000314',
'ext': 'flv',
@@ -85,11 +87,11 @@ class NRKTVIE(InfoExtractor):
'description': 'md5:bdea103bc35494c143c6a9acdd84887a',
'upload_date': '20140523',
'duration': 1741.52,
- }
+ },
},
{
'url': 'http://tv.nrk.no/program/mdfp15000514',
- 'md5': 'af01795a31f1cf7265c8657534d8077b',
+ 'md5': '383650ece2b25ecec996ad7b5bb2a384',
'info_dict': {
'id': 'mdfp15000514',
'ext': 'flv',
@@ -97,42 +99,155 @@ class NRKTVIE(InfoExtractor):
'description': 'md5:654c12511f035aed1e42bdf5db3b206a',
'upload_date': '20140524',
'duration': 4605.0,
- }
+ },
+ },
+ {
+ # single playlist video
+ 'url': 'http://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2',
+ 'md5': 'adbd1dbd813edaf532b0a253780719c2',
+ 'info_dict': {
+ 'id': 'MSPO40010515-part2',
+ 'ext': 'flv',
+ 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)',
+ 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
+ 'upload_date': '20150106',
+ },
+ 'skip': 'Only works from Norway',
},
+ {
+ 'url': 'http://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015',
+ 'playlist': [
+ {
+ 'md5': '9480285eff92d64f06e02a5367970a7a',
+ 'info_dict': {
+ 'id': 'MSPO40010515-part1',
+ 'ext': 'flv',
+ 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 1:2)',
+ 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
+ 'upload_date': '20150106',
+ },
+ },
+ {
+ 'md5': 'adbd1dbd813edaf532b0a253780719c2',
+ 'info_dict': {
+ 'id': 'MSPO40010515-part2',
+ 'ext': 'flv',
+ 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)',
+ 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
+ 'upload_date': '20150106',
+ },
+ },
+ ],
+ 'info_dict': {
+ 'id': 'MSPO40010515',
+ 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn',
+ 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
+ 'upload_date': '20150106',
+ 'duration': 6947.5199999999995,
+ },
+ 'skip': 'Only works from Norway',
+ }
]
+ def _seconds2str(self, s):
+ return '%02d:%02d:%02d.%03d' % (s / 3600, (s % 3600) / 60, s % 60, (s % 1) * 1000)
+
+ def _debug_print(self, txt):
+ if self._downloader.params.get('verbose', False):
+ self.to_screen('[debug] %s' % txt)
+
+ def _extract_captions(self, subtitlesurl, video_id, baseurl):
+ url = "%s%s" % (baseurl, subtitlesurl)
+ self._debug_print('%s: Subtitle url: %s' % (video_id, url))
+ captions = self._download_xml(url, video_id, 'Downloading subtitles')
+ lang = captions.get('lang', 'no')
+ ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/ns/ttml}'))
+ srt = ''
+ for pos, p in enumerate(ps):
+ begin = parse_duration(p.get('begin'))
+ duration = parse_duration(p.get('dur'))
+ starttime = self._seconds2str(begin)
+ endtime = self._seconds2str(begin + duration)
+ text = '\n'.join(p.itertext())
+ srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), starttime, endtime, text)
+ return {lang: srt}
+
+ def _extract_f4m(self, manifest_url, video_id):
+ return self._extract_f4m_formats(manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id)
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
+ part_id = mobj.group('part_id')
+ baseurl = mobj.group('baseurl')
- page = self._download_webpage(url, video_id)
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._html_search_meta(
+ 'title', webpage, 'title')
+ description = self._html_search_meta(
+ 'description', webpage, 'description')
- title = self._html_search_meta('title', page, 'title')
- description = self._html_search_meta('description', page, 'description')
- thumbnail = self._html_search_regex(r'data-posterimage="([^"]+)"', page, 'thumbnail', fatal=False)
- upload_date = unified_strdate(self._html_search_meta('rightsfrom', page, 'upload date', fatal=False))
- duration = float_or_none(
- self._html_search_regex(r'data-duration="([^"]+)"', page, 'duration', fatal=False))
+ thumbnail = self._html_search_regex(
+ r'data-posterimage="([^"]+)"',
+ webpage, 'thumbnail', fatal=False)
+ upload_date = unified_strdate(self._html_search_meta(
+ 'rightsfrom', webpage, 'upload date', fatal=False))
+ duration = float_or_none(self._html_search_regex(
+ r'data-duration="([^"]+)"',
+ webpage, 'duration', fatal=False))
+
+ # playlist
+ parts = re.findall(
+ r'<a href="#del=(\d+)"[^>]+data-argument="([^"]+)">([^<]+)</a>', webpage)
+ if parts:
+ entries = []
+ for current_part_id, stream_url, part_title in parts:
+ if part_id and current_part_id != part_id:
+ continue
+ video_part_id = '%s-part%s' % (video_id, current_part_id)
+ formats = self._extract_f4m(stream_url, video_part_id)
+ entries.append({
+ 'id': video_part_id,
+ 'title': part_title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ 'formats': formats,
+ })
+ if part_id:
+ if entries:
+ return entries[0]
+ else:
+ playlist = self.playlist_result(entries, video_id, title, description)
+ playlist.update({
+ 'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ 'duration': duration,
+ })
+ return playlist
formats = []
- f4m_url = re.search(r'data-media="([^"]+)"', page)
+ f4m_url = re.search(r'data-media="([^"]+)"', webpage)
if f4m_url:
- formats.append({
- 'url': f4m_url.group(1) + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124',
- 'format_id': 'f4m',
- 'ext': 'flv',
- })
+ formats.extend(self._extract_f4m(f4m_url.group(1), video_id))
- m3u8_url = re.search(r'data-hls-media="([^"]+)"', page)
+ m3u8_url = re.search(r'data-hls-media="([^"]+)"', webpage)
if m3u8_url:
- formats.append({
- 'url': m3u8_url.group(1),
- 'format_id': 'm3u8',
- })
-
+ formats.extend(self._extract_m3u8_formats(m3u8_url.group(1), video_id, 'mp4'))
self._sort_formats(formats)
+ subtitles_url = self._html_search_regex(
+ r'data-subtitlesurl[ ]*=[ ]*"([^"]+)"',
+ webpage, 'subtitle URL', default=None)
+ subtitles = None
+ if subtitles_url:
+ subtitles = self._extract_captions(subtitles_url, video_id, baseurl)
+ if self._downloader.params.get('listsubtitles', False):
+ self._list_available_subtitles(video_id, subtitles)
+ return
+
return {
'id': video_id,
'title': title,
@@ -141,4 +256,5 @@ class NRKTVIE(InfoExtractor):
'upload_date': upload_date,
'duration': duration,
'formats': formats,
+ 'subtitles': subtitles,
}
diff --git a/youtube_dl/extractor/ntv.py b/youtube_dl/extractor/ntv.py
index ed60314ec..ee740cd9c 100644
--- a/youtube_dl/extractor/ntv.py
+++ b/youtube_dl/extractor/ntv.py
@@ -130,7 +130,7 @@ class NTVIE(InfoExtractor):
'rtmp_conn': 'B:1',
'player_url': 'http://www.ntv.ru/swf/vps1.swf?update=20131128',
'page_url': 'http://www.ntv.ru',
- 'flash_ver': 'LNX 11,2,202,341',
+ 'flash_version': 'LNX 11,2,202,341',
'rtmp_live': True,
'ext': 'flv',
'filesize': int(size.text),
@@ -145,4 +145,4 @@ class NTVIE(InfoExtractor):
'duration': duration,
'view_count': view_count,
'formats': formats,
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/nuvid.py b/youtube_dl/extractor/nuvid.py
index 58ec81f91..57928f2ae 100644
--- a/youtube_dl/extractor/nuvid.py
+++ b/youtube_dl/extractor/nuvid.py
@@ -3,15 +3,17 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_request,
+)
from ..utils import (
parse_duration,
unified_strdate,
- compat_urllib_request,
)
class NuvidIE(InfoExtractor):
- _VALID_URL = r'^https?://(?:www|m)\.nuvid\.com/video/(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(?:www|m)\.nuvid\.com/video/(?P<id>[0-9]+)'
_TEST = {
'url': 'http://m.nuvid.com/video/1310741/',
'md5': 'eab207b7ac4fccfb4e23c86201f11277',
@@ -26,8 +28,7 @@ class NuvidIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
formats = []
@@ -71,4 +72,4 @@ class NuvidIE(InfoExtractor):
'upload_date': upload_date,
'age_limit': 18,
'formats': formats,
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py
index 7bf105d38..56e1cad3b 100644
--- a/youtube_dl/extractor/nytimes.py
+++ b/youtube_dl/extractor/nytimes.py
@@ -74,4 +74,4 @@ class NYTimesIE(InfoExtractor):
'duration': duration,
'formats': formats,
'thumbnails': thumbnails,
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py
index 2044e107e..d5b05c18f 100644
--- a/youtube_dl/extractor/ooyala.py
+++ b/youtube_dl/extractor/ooyala.py
@@ -16,7 +16,6 @@ class OoyalaIE(InfoExtractor):
{
# From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video
'url': 'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
- 'md5': '3f5cceb3a7bf461d6c29dc466cf8033c',
'info_dict': {
'id': 'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
'ext': 'mp4',
@@ -26,7 +25,6 @@ class OoyalaIE(InfoExtractor):
}, {
# Only available for ipad
'url': 'http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0',
- 'md5': '4b9754921fddb68106e48c142e2a01e6',
'info_dict': {
'id': 'x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0',
'ext': 'mp4',
@@ -43,7 +41,7 @@ class OoyalaIE(InfoExtractor):
@classmethod
def _build_url_result(cls, embed_code):
return cls.url_result(cls._url_for_embed_code(embed_code),
- ie=cls.ie_key())
+ ie=cls.ie_key())
def _extract_result(self, info, more_info):
return {
@@ -97,4 +95,3 @@ class OoyalaIE(InfoExtractor):
}
else:
return self._extract_result(videos_info[0], videos_more_info)
-
diff --git a/youtube_dl/extractor/openfilm.py b/youtube_dl/extractor/openfilm.py
new file mode 100644
index 000000000..2249657eb
--- /dev/null
+++ b/youtube_dl/extractor/openfilm.py
@@ -0,0 +1,70 @@
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_iso8601,
+ compat_urllib_parse,
+ parse_age_limit,
+ int_or_none,
+)
+
+
+class OpenFilmIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)openfilm\.com/videos/(?P<id>.+)'
+ _TEST = {
+ 'url': 'http://www.openfilm.com/videos/human-resources-remastered',
+ 'md5': '42bcd88c2f3ec13b65edf0f8ad1cac37',
+ 'info_dict': {
+ 'id': '32736',
+ 'display_id': 'human-resources-remastered',
+ 'ext': 'mp4',
+ 'title': 'Human Resources (Remastered)',
+ 'description': 'Social Engineering in the 20th Century.',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 7164,
+ 'timestamp': 1334756988,
+ 'upload_date': '20120418',
+ 'uploader_id': '41117',
+ 'view_count': int,
+ 'age_limit': 0,
+ },
+ }
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+
+ player = compat_urllib_parse.unquote_plus(
+ self._og_search_video_url(webpage))
+
+ video = json.loads(self._search_regex(
+ r'\bp=({.+?})(?:&|$)', player, 'video JSON'))
+
+ video_url = '%s1.mp4' % video['location']
+ video_id = video.get('video_id')
+ display_id = video.get('alias') or display_id
+ title = video.get('title')
+ description = video.get('description')
+ thumbnail = video.get('main_thumb')
+ duration = int_or_none(video.get('duration'))
+ timestamp = parse_iso8601(video.get('dt_published'), ' ')
+ uploader_id = video.get('user_id')
+ view_count = int_or_none(video.get('views_count'))
+ age_limit = parse_age_limit(video.get('age_limit'))
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'url': video_url,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'uploader_id': uploader_id,
+ 'view_count': view_count,
+ 'age_limit': age_limit,
+ }
diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py
index 011e6be13..4e293392b 100644
--- a/youtube_dl/extractor/orf.py
+++ b/youtube_dl/extractor/orf.py
@@ -17,24 +17,39 @@ from ..utils import (
class ORFTVthekIE(InfoExtractor):
IE_NAME = 'orf:tvthek'
IE_DESC = 'ORF TVthek'
- _VALID_URL = r'https?://tvthek\.orf\.at/(?:programs/.+?/episodes|topics/.+?|program/[^/]+)/(?P<id>\d+)'
-
- _TEST = {
- 'url': 'http://tvthek.orf.at/program/matinee-Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7317210/Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7319746/Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7319747',
- 'file': '7319747.mp4',
- 'md5': 'bd803c5d8c32d3c64a0ea4b4eeddf375',
- 'info_dict': {
- 'title': 'Was Sie schon immer über Klassik wissen wollten',
- 'description': 'md5:0ddf0d5f0060bd53f744edaa5c2e04a4',
- 'duration': 3508,
- 'upload_date': '20140105',
- },
- 'skip': 'Blocked outside of Austria',
- }
+ _VALID_URL = r'https?://tvthek\.orf\.at/(?:programs/.+?/episodes|topics?/.+?|program/[^/]+)/(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'http://tvthek.orf.at/program/Aufgetischt/2745173/Aufgetischt-Mit-der-Steirischen-Tafelrunde/8891389',
+ 'playlist': [{
+ 'md5': '2942210346ed779588f428a92db88712',
+ 'info_dict': {
+ 'id': '8896777',
+ 'ext': 'mp4',
+ 'title': 'Aufgetischt: Mit der Steirischen Tafelrunde',
+ 'description': 'md5:c1272f0245537812d4e36419c207b67d',
+ 'duration': 2668,
+ 'upload_date': '20141208',
+ },
+ }],
+ 'skip': 'Blocked outside of Austria / Germany',
+ }, {
+ 'url': 'http://tvthek.orf.at/topic/Im-Wandel-der-Zeit/8002126/Best-of-Ingrid-Thurnher/7982256',
+ 'playlist': [{
+ 'md5': '68f543909aea49d621dfc7703a11cfaf',
+ 'info_dict': {
+ 'id': '7982259',
+ 'ext': 'mp4',
+ 'title': 'Best of Ingrid Thurnher',
+ 'upload_date': '20140527',
+ 'description': 'Viele Jahre war Ingrid Thurnher das "Gesicht" der ZIB 2. Vor ihrem Wechsel zur ZIB 2 im jahr 1995 moderierte sie unter anderem "Land und Leute", "Österreich-Bild" und "Niederösterreich heute".',
+ }
+ }],
+ '_skip': 'Blocked outside of Austria / Germany',
+ }]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- playlist_id = mobj.group('id')
+ playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
data_json = self._search_regex(
@@ -43,7 +58,9 @@ class ORFTVthekIE(InfoExtractor):
def get_segments(all_data):
for data in all_data:
- if data['name'] == 'Tracker::EPISODE_DETAIL_PAGE_OVER_PROGRAM':
+ if data['name'] in (
+ 'Tracker::EPISODE_DETAIL_PAGE_OVER_PROGRAM',
+ 'Tracker::EPISODE_DETAIL_PAGE_OVER_TOPIC'):
return data['values']['segments']
sdata = get_segments(all_data)
@@ -111,18 +128,19 @@ class ORFTVthekIE(InfoExtractor):
}
-# Audios on ORF radio are only available for 7 days, so we can't add tests.
-
-
class ORFOE1IE(InfoExtractor):
IE_NAME = 'orf:oe1'
IE_DESC = 'Radio Österreich 1'
- _VALID_URL = r'http://oe1\.orf\.at/programm/(?P<id>[0-9]+)'
+ _VALID_URL = r'http://oe1\.orf\.at/(?:programm/|konsole.*?#\?track_id=)(?P<id>[0-9]+)'
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- show_id = mobj.group('id')
+ # Audios on ORF radio are only available for 7 days, so we can't add tests.
+ _TEST = {
+ 'url': 'http://oe1.orf.at/konsole?show=on_demand#?track_id=394211',
+ 'only_matching': True,
+ }
+ def _real_extract(self, url):
+ show_id = self._match_id(url)
data = self._download_json(
'http://oe1.orf.at/programm/%s/konsole' % show_id,
show_id
@@ -145,7 +163,7 @@ class ORFOE1IE(InfoExtractor):
class ORFFM4IE(InfoExtractor):
- IE_DESC = 'orf:fm4'
+ IE_NAME = 'orf:fm4'
IE_DESC = 'radio FM4'
_VALID_URL = r'http://fm4\.orf\.at/7tage/?#(?P<date>[0-9]+)/(?P<show>\w+)'
@@ -178,4 +196,4 @@ class ORFFM4IE(InfoExtractor):
'title': data['title'],
'description': data['subtitle'],
'entries': entries
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py
index 8f140d626..afce732e1 100644
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@@ -4,6 +4,7 @@ import re
from .common import InfoExtractor
from ..utils import (
+ ExtractorError,
unified_strdate,
US_RATINGS,
)
@@ -80,8 +81,14 @@ class PBSIE(InfoExtractor):
'thumbnail': 're:^https?://.*\.jpg$',
'upload_date': '20140122',
}
+ },
+ {
+ 'url': 'http://www.pbs.org/wgbh/pages/frontline/united-states-of-secrets/',
+ 'info_dict': {
+ 'id': 'united-states-of-secrets',
+ },
+ 'playlist_count': 2,
}
-
]
def _extract_webpage(self, url):
@@ -96,6 +103,12 @@ class PBSIE(InfoExtractor):
r'<input type="hidden" id="air_date_[0-9]+" value="([^"]+)"',
webpage, 'upload date', default=None))
+ # tabbed frontline videos
+ tabbed_videos = re.findall(
+ r'<div[^>]+class="videotab[^"]*"[^>]+vid="(\d+)"', webpage)
+ if tabbed_videos:
+ return tabbed_videos, presumptive_id, upload_date
+
MEDIA_ID_REGEXES = [
r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", # frontline video embed
r'class="coveplayerid">([^<]+)<', # coveplayer
@@ -130,9 +143,28 @@ class PBSIE(InfoExtractor):
def _real_extract(self, url):
video_id, display_id, upload_date = self._extract_webpage(url)
+ if isinstance(video_id, list):
+ entries = [self.url_result(
+ 'http://video.pbs.org/video/%s' % vid_id, 'PBS', vid_id)
+ for vid_id in video_id]
+ return self.playlist_result(entries, display_id)
+
info_url = 'http://video.pbs.org/videoInfo/%s?format=json' % video_id
info = self._download_json(info_url, display_id)
+ redirect_url = info['alternate_encoding']['url']
+ redirect_info = self._download_json(
+ redirect_url + '?format=json', display_id,
+ 'Downloading video url info')
+ if redirect_info['status'] == 'error':
+ if redirect_info['http_code'] == 403:
+ message = (
+ 'The video is not available in your region due to '
+ 'right restrictions')
+ else:
+ message = redirect_info['message']
+ raise ExtractorError(message, expected=True)
+
rating_str = info.get('rating')
if rating_str is not None:
rating_str = rating_str.rpartition('-')[2]
@@ -142,7 +174,7 @@ class PBSIE(InfoExtractor):
'id': video_id,
'display_id': display_id,
'title': info['title'],
- 'url': info['alternate_encoding']['url'],
+ 'url': redirect_info['url'],
'ext': 'mp4',
'description': info['program'].get('description'),
'thumbnail': info.get('image_url'),
diff --git a/youtube_dl/extractor/phoenix.py b/youtube_dl/extractor/phoenix.py
new file mode 100644
index 000000000..a20672c0c
--- /dev/null
+++ b/youtube_dl/extractor/phoenix.py
@@ -0,0 +1,31 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .zdf import extract_from_xml_url
+
+
+class PhoenixIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?phoenix\.de/content/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://www.phoenix.de/content/884301',
+ 'md5': 'ed249f045256150c92e72dbb70eadec6',
+ 'info_dict': {
+ 'id': '884301',
+ 'ext': 'mp4',
+ 'title': 'Michael Krons mit Hans-Werner Sinn',
+ 'description': 'Im Dialog - Sa. 25.10.14, 00.00 - 00.35 Uhr',
+ 'upload_date': '20141025',
+ 'uploader': 'Im Dialog',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ internal_id = self._search_regex(
+ r'<div class="phx_vod" id="phx_vod_([0-9]+)"',
+ webpage, 'internal video ID')
+
+ api_url = 'http://www.phoenix.de/php/zdfplayer-v1.3/data/beitragsDetails.php?ak=web&id=%s' % internal_id
+ return extract_from_xml_url(self, video_id, api_url)
diff --git a/youtube_dl/extractor/photobucket.py b/youtube_dl/extractor/photobucket.py
index 8aa69c46e..c66db3cdc 100644
--- a/youtube_dl/extractor/photobucket.py
+++ b/youtube_dl/extractor/photobucket.py
@@ -4,16 +4,17 @@ import json
import re
from .common import InfoExtractor
-from ..utils import compat_urllib_parse
+from ..compat import compat_urllib_parse
class PhotobucketIE(InfoExtractor):
_VALID_URL = r'http://(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
_TEST = {
'url': 'http://media.photobucket.com/user/rachaneronas/media/TiredofLinkBuildingTryBacklinkMyDomaincom_zpsc0c3b9fa.mp4.html?filters[term]=search&filters[primary]=videos&filters[secondary]=images&sort=1&o=0',
- 'file': 'zpsc0c3b9fa.mp4',
'md5': '7dabfb92b0a31f6c16cebc0f8e60ff99',
'info_dict': {
+ 'id': 'zpsc0c3b9fa',
+ 'ext': 'mp4',
'timestamp': 1367669341,
'upload_date': '20130504',
'uploader': 'rachaneronas',
@@ -31,7 +32,7 @@ class PhotobucketIE(InfoExtractor):
# Extract URL, uploader, and title from webpage
self.report_extraction(video_id)
info_json = self._search_regex(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (.*?)\);',
- webpage, 'info json')
+ webpage, 'info json')
info = json.loads(info_json)
url = compat_urllib_parse.unquote(self._html_search_regex(r'file=(.+\.mp4)', info['linkcodes']['html'], 'url'))
return {
diff --git a/youtube_dl/extractor/played.py b/youtube_dl/extractor/played.py
index 645a1e06d..45716c75d 100644
--- a/youtube_dl/extractor/played.py
+++ b/youtube_dl/extractor/played.py
@@ -5,10 +5,13 @@ import re
import os.path
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
)
+from ..utils import (
+ ExtractorError,
+)
class PlayedIE(InfoExtractor):
@@ -23,12 +26,18 @@ class PlayedIE(InfoExtractor):
'ext': 'flv',
'title': 'youtube-dl_test_video.mp4',
},
+ 'skip': 'Removed for copyright infringement.', # oh wow
}
def _real_extract(self, url):
video_id = self._match_id(url)
-
orig_webpage = self._download_webpage(url, video_id)
+
+ m_error = re.search(
+ r'(?s)Reason for deletion:.*?<b class="err"[^>]*>(?P<msg>[^<]+)</b>', orig_webpage)
+ if m_error:
+ raise ExtractorError(m_error.group('msg'), expected=True)
+
fields = re.findall(
r'type="hidden" name="([^"]+)"\s+value="([^"]+)">', orig_webpage)
data = dict(fields)
diff --git a/youtube_dl/extractor/playfm.py b/youtube_dl/extractor/playfm.py
index ebc046804..9576aed0e 100644
--- a/youtube_dl/extractor/playfm.py
+++ b/youtube_dl/extractor/playfm.py
@@ -4,9 +4,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
+)
+from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
diff --git a/youtube_dl/extractor/playvid.py b/youtube_dl/extractor/playvid.py
index b1322f13f..c3e667e9e 100644
--- a/youtube_dl/extractor/playvid.py
+++ b/youtube_dl/extractor/playvid.py
@@ -3,31 +3,38 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
)
+from ..utils import (
+ clean_html,
+ ExtractorError,
+)
class PlayvidIE(InfoExtractor):
- _VALID_URL = r'^https?://www\.playvid\.com/watch(\?v=|/)(?P<id>.+?)(?:#|$)'
+ _VALID_URL = r'https?://www\.playvid\.com/watch(\?v=|/)(?P<id>.+?)(?:#|$)'
_TEST = {
- 'url': 'http://www.playvid.com/watch/agbDDi7WZTV',
- 'md5': '44930f8afa616efdf9482daf4fe53e1e',
+ 'url': 'http://www.playvid.com/watch/RnmBNgtrrJu',
+ 'md5': 'ffa2f6b2119af359f544388d8c01eb6c',
'info_dict': {
- 'id': 'agbDDi7WZTV',
+ 'id': 'RnmBNgtrrJu',
'ext': 'mp4',
- 'title': 'Michelle Lewin in Miami Beach',
- 'duration': 240,
+ 'title': 'md5:9256d01c6317e3f703848b5906880dc8',
+ 'duration': 82,
'age_limit': 18,
}
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
+ m_error = re.search(
+ r'<div class="block-error">\s*<div class="heading">\s*<div>(?P<msg>.+?)</div>\s*</div>', webpage)
+ if m_error:
+ raise ExtractorError(clean_html(m_error.group('msg')), expected=True)
+
video_title = None
duration = None
video_thumbnail = None
diff --git a/youtube_dl/extractor/podomatic.py b/youtube_dl/extractor/podomatic.py
index ffafd2380..f20946a2b 100644
--- a/youtube_dl/extractor/podomatic.py
+++ b/youtube_dl/extractor/podomatic.py
@@ -6,6 +6,7 @@ import re
from .common import InfoExtractor
from ..utils import int_or_none
+
class PodomaticIE(InfoExtractor):
IE_NAME = 'podomatic'
_VALID_URL = r'^(?P<proto>https?)://(?P<channel>[^.]+)\.podomatic\.com/entry/(?P<id>[^?]+)'
diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py
index bac484c67..954dfccb7 100644
--- a/youtube_dl/extractor/pornhd.py
+++ b/youtube_dl/extractor/pornhd.py
@@ -8,7 +8,6 @@ from ..utils import (
int_or_none,
js_to_json,
qualities,
- determine_ext,
)
@@ -45,13 +44,18 @@ class PornHdIE(InfoExtractor):
thumbnail = self._search_regex(
r"'poster'\s*:\s*'([^']+)'", webpage, 'thumbnail', fatal=False)
- quality = qualities(['SD', 'HD'])
- formats = [{
- 'url': source['file'],
- 'format_id': '%s-%s' % (source['label'], determine_ext(source['file'])),
- 'quality': quality(source['label']),
- } for source in json.loads(js_to_json(self._search_regex(
- r"(?s)'sources'\s*:\s*(\[.+?\])", webpage, 'sources')))]
+ quality = qualities(['sd', 'hd'])
+ sources = json.loads(js_to_json(self._search_regex(
+ r"(?s)'sources'\s*:\s*(\{.+?\})\s*\}\);", webpage, 'sources')))
+ formats = []
+ for container, s in sources.items():
+ for qname, video_url in s.items():
+ formats.append({
+ 'url': video_url,
+ 'container': container,
+ 'format_id': '%s-%s' % (container, qname),
+ 'quality': quality(qname),
+ })
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index 4118ee956..fb2032832 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -4,10 +4,13 @@ import os
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
+ compat_urllib_parse,
compat_urllib_parse_urlparse,
compat_urllib_request,
- compat_urllib_parse,
+)
+from ..utils import (
+ ExtractorError,
str_to_int,
)
from ..aes import (
@@ -16,13 +19,14 @@ from ..aes import (
class PornHubIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>pornhub\.com/view_video\.php\?viewkey=(?P<videoid>[0-9a-f]+))'
+ _VALID_URL = r'https?://(?:www\.)?pornhub\.com/view_video\.php\?viewkey=(?P<id>[0-9a-f]+)'
_TEST = {
'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
- 'file': '648719015.mp4',
'md5': '882f488fa1f0026f023f33576004a2ed',
'info_dict': {
- "uploader": "BABES-COM",
+ 'id': '648719015',
+ 'ext': 'mp4',
+ "uploader": "Babes",
"title": "Seductive Indian beauty strips down and fingers her pink pussy",
"age_limit": 18
}
@@ -35,17 +39,24 @@ class PornHubIE(InfoExtractor):
return count
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('videoid')
- url = 'http://www.' + mobj.group('url')
+ video_id = self._match_id(url)
req = compat_urllib_request.Request(url)
req.add_header('Cookie', 'age_verified=1')
webpage = self._download_webpage(req, video_id)
+ error_msg = self._html_search_regex(
+ r'(?s)<div class="userMessageSection[^"]*".*?>(.*?)</div>',
+ webpage, 'error message', default=None)
+ if error_msg:
+ error_msg = re.sub(r'\s+', ' ', error_msg)
+ raise ExtractorError(
+ 'PornHub said: %s' % error_msg,
+ expected=True, video_id=video_id)
+
video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title')
video_uploader = self._html_search_regex(
- r'(?s)From:&nbsp;.+?<(?:a href="/users/|<span class="username)[^>]+>(.+?)<',
+ r'(?s)From:&nbsp;.+?<(?:a href="/users/|a href="/channels/|<span class="username)[^>]+>(.+?)<',
webpage, 'uploader', fatal=False)
thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, 'thumbnail', fatal=False)
if thumbnail:
@@ -57,7 +68,7 @@ class PornHubIE(InfoExtractor):
comment_count = self._extract_count(
r'All comments \(<var class="videoCommentCount">([\d,\.]+)</var>', webpage, 'comment')
- video_urls = list(map(compat_urllib_parse.unquote , re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)))
+ video_urls = list(map(compat_urllib_parse.unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)))
if webpage.find('"encrypted":true') != -1:
password = compat_urllib_parse.unquote_plus(self._html_search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls))
diff --git a/youtube_dl/extractor/pornotube.py b/youtube_dl/extractor/pornotube.py
index 04bd3d979..34735c51e 100644
--- a/youtube_dl/extractor/pornotube.py
+++ b/youtube_dl/extractor/pornotube.py
@@ -1,56 +1,94 @@
from __future__ import unicode_literals
-import re
+import json
from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_request,
+)
from ..utils import (
- compat_urllib_parse,
-
- unified_strdate,
+ int_or_none,
)
class PornotubeIE(InfoExtractor):
- _VALID_URL = r'https?://(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
+ _VALID_URL = r'https?://(?:\w+\.)?pornotube\.com/(?:[^?#]*?)/video/(?P<id>[0-9]+)'
_TEST = {
- 'url': 'http://pornotube.com/c/173/m/1689755/Marilyn-Monroe-Bathing',
- 'md5': '374dd6dcedd24234453b295209aa69b6',
+ 'url': 'http://www.pornotube.com/orientation/straight/video/4964/title/weird-hot-and-wet-science',
+ 'md5': '60fc5a4f0d93a97968fc7999d98260c9',
'info_dict': {
- 'id': '1689755',
- 'ext': 'flv',
- 'upload_date': '20090708',
- 'title': 'Marilyn-Monroe-Bathing',
- 'age_limit': 18
+ 'id': '4964',
+ 'ext': 'mp4',
+ 'upload_date': '20141203',
+ 'title': 'Weird Hot and Wet Science',
+ 'description': 'md5:a8304bef7ef06cb4ab476ca6029b01b0',
+ 'categories': ['Adult Humor', 'Blondes'],
+ 'uploader': 'Alpha Blue Archives',
+ 'thumbnail': 're:^https?://.*\\.jpg$',
+ 'timestamp': 1417582800,
+ 'age_limit': 18,
}
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ video_id = self._match_id(url)
- video_id = mobj.group('videoid')
- video_title = mobj.group('title')
+ # Fetch origin token
+ js_config = self._download_webpage(
+ 'http://www.pornotube.com/assets/src/app/config.js', video_id,
+ note='Download JS config')
+ originAuthenticationSpaceKey = self._search_regex(
+ r"constant\('originAuthenticationSpaceKey',\s*'([^']+)'",
+ js_config, 'originAuthenticationSpaceKey')
+
+ # Fetch actual token
+ token_req_data = {
+ 'authenticationSpaceKey': originAuthenticationSpaceKey,
+ 'credentials': 'Clip Application',
+ }
+ token_req = compat_urllib_request.Request(
+ 'https://api.aebn.net/auth/v1/token/primal',
+ data=json.dumps(token_req_data).encode('utf-8'))
+ token_req.add_header('Content-Type', 'application/json')
+ token_req.add_header('Origin', 'http://www.pornotube.com')
+ token_answer = self._download_json(
+ token_req, video_id, note='Requesting primal token')
+ token = token_answer['tokenKey']
- # Get webpage content
- webpage = self._download_webpage(url, video_id)
+ # Get video URL
+ delivery_req = compat_urllib_request.Request(
+ 'https://api.aebn.net/delivery/v1/clips/%s/MP4' % video_id)
+ delivery_req.add_header('Authorization', token)
+ delivery_info = self._download_json(
+ delivery_req, video_id, note='Downloading delivery information')
+ video_url = delivery_info['mediaUrl']
- # Get the video URL
- VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
- video_url = self._search_regex(VIDEO_URL_RE, webpage, 'video url')
- video_url = compat_urllib_parse.unquote(video_url)
+ # Get additional info (title etc.)
+ info_req = compat_urllib_request.Request(
+ 'https://api.aebn.net/content/v1/clips/%s?expand='
+ 'title,description,primaryImageNumber,startSecond,endSecond,'
+ 'movie.title,movie.MovieId,movie.boxCoverFront,movie.stars,'
+ 'movie.studios,stars.name,studios.name,categories.name,'
+ 'clipActive,movieActive,publishDate,orientations' % video_id)
+ info_req.add_header('Authorization', token)
+ info = self._download_json(
+ info_req, video_id, note='Downloading metadata')
- #Get the uploaded date
- VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
- upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, 'upload date', fatal=False)
- if upload_date:
- upload_date = unified_strdate(upload_date)
- age_limit = self._rta_search(webpage)
+ timestamp = int_or_none(info.get('publishDate'), scale=1000)
+ uploader = info.get('studios', [{}])[0].get('name')
+ movie_id = info['movie']['movieId']
+ thumbnail = 'http://pic.aebn.net/dis/t/%s/%s_%08d.jpg' % (
+ movie_id, movie_id, info['primaryImageNumber'])
+ categories = [c['name'] for c in info.get('categories')]
return {
'id': video_id,
'url': video_url,
- 'upload_date': upload_date,
- 'title': video_title,
- 'ext': 'flv',
- 'format': 'flv',
- 'age_limit': age_limit,
+ 'title': info['title'],
+ 'description': info.get('description'),
+ 'timestamp': timestamp,
+ 'uploader': uploader,
+ 'thumbnail': thumbnail,
+ 'categories': categories,
+ 'age_limit': 18,
}
diff --git a/youtube_dl/extractor/promptfile.py b/youtube_dl/extractor/promptfile.py
index 463e85501..f536e6e6c 100644
--- a/youtube_dl/extractor/promptfile.py
+++ b/youtube_dl/extractor/promptfile.py
@@ -4,17 +4,18 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
- determine_ext,
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
)
+from ..utils import (
+ determine_ext,
+ ExtractorError,
+)
class PromptFileIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?promptfile\.com/l/(?P<id>[0-9A-Z\-]+)'
- _FILE_NOT_FOUND_REGEX = r'<div.+id="not_found_msg".+>.+</div>[^-]'
_TEST = {
'url': 'http://www.promptfile.com/l/D21B4746E9-F01462F0FF',
'md5': 'd1451b6302da7215485837aaea882c4c',
@@ -27,11 +28,10 @@ class PromptFileIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- if re.search(self._FILE_NOT_FOUND_REGEX, webpage) is not None:
+ if re.search(r'<div.+id="not_found_msg".+>(?!We are).+</div>[^-]', webpage) is not None:
raise ExtractorError('Video %s does not exist' % video_id,
expected=True)
diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py
index 619496de7..385681d06 100644
--- a/youtube_dl/extractor/prosiebensat1.py
+++ b/youtube_dl/extractor/prosiebensat1.py
@@ -5,8 +5,10 @@ import re
from hashlib import sha1
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
+)
+from ..utils import (
unified_strdate,
)
@@ -85,7 +87,7 @@ class ProSiebenSat1IE(InfoExtractor):
'ext': 'mp4',
'title': 'Im Interview: Kai Wiesinger',
'description': 'md5:e4e5370652ec63b95023e914190b4eb9',
- 'upload_date': '20140225',
+ 'upload_date': '20140203',
'duration': 522.56,
},
'params': {
@@ -100,7 +102,7 @@ class ProSiebenSat1IE(InfoExtractor):
'ext': 'mp4',
'title': 'Jagd auf Fertigkost im Elsthal - Teil 2',
'description': 'md5:2669cde3febe9bce13904f701e774eb6',
- 'upload_date': '20140225',
+ 'upload_date': '20141014',
'duration': 2410.44,
},
'params': {
@@ -152,12 +154,22 @@ class ProSiebenSat1IE(InfoExtractor):
'skip_download': True,
},
},
+ {
+ 'url': 'http://www.prosieben.de/tv/joko-gegen-klaas/videos/playlists/episode-8-ganze-folge-playlist',
+ 'info_dict': {
+ 'id': '439664',
+ 'title': 'Episode 8 - Ganze Folge - Playlist',
+ 'description': 'md5:63b8963e71f481782aeea877658dec84',
+ },
+ 'playlist_count': 2,
+ },
]
_CLIPID_REGEXES = [
r'"clip_id"\s*:\s+"(\d+)"',
r'clipid: "(\d+)"',
r'clip[iI]d=(\d+)',
+ r"'itemImageUrl'\s*:\s*'/dynamic/thumbnails/full/\d+/(\d+)",
]
_TITLE_REGEXES = [
r'<h2 class="subtitle" itemprop="name">\s*(.+?)</h2>',
@@ -178,11 +190,19 @@ class ProSiebenSat1IE(InfoExtractor):
r'<span style="padding-left: 4px;line-height:20px; color:#404040">(\d{2}\.\d{2}\.\d{4})</span>',
r'(\d{2}\.\d{2}\.\d{4}) \| \d{2}:\d{2} Min<br/>',
]
+ _PAGE_TYPE_REGEXES = [
+ r'<meta name="page_type" content="([^"]+)">',
+ r"'itemType'\s*:\s*'([^']*)'",
+ ]
+ _PLAYLIST_ID_REGEXES = [
+ r'content[iI]d=(\d+)',
+ r"'itemId'\s*:\s*'([^']*)'",
+ ]
+ _PLAYLIST_CLIP_REGEXES = [
+ r'(?s)data-qvt=.+?<a href="([^"]+)"',
+ ]
- def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
-
+ def _extract_clip(self, url, webpage):
clip_id = self._html_search_regex(self._CLIPID_REGEXES, webpage, 'clip id')
access_token = 'testclient'
@@ -280,4 +300,32 @@ class ProSiebenSat1IE(InfoExtractor):
'upload_date': upload_date,
'duration': duration,
'formats': formats,
- } \ No newline at end of file
+ }
+
+ def _extract_playlist(self, url, webpage):
+ playlist_id = self._html_search_regex(
+ self._PLAYLIST_ID_REGEXES, webpage, 'playlist id')
+ for regex in self._PLAYLIST_CLIP_REGEXES:
+ playlist_clips = re.findall(regex, webpage)
+ if playlist_clips:
+ title = self._html_search_regex(
+ self._TITLE_REGEXES, webpage, 'title')
+ description = self._html_search_regex(
+ self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False)
+ entries = [
+ self.url_result(
+ re.match('(.+?//.+?)/', url).group(1) + clip_path,
+ 'ProSiebenSat1')
+ for clip_path in playlist_clips]
+ return self.playlist_result(entries, playlist_id, title, description)
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ page_type = self._search_regex(
+ self._PAGE_TYPE_REGEXES, webpage,
+ 'page type', default='clip').lower()
+ if page_type == 'clip':
+ return self._extract_clip(url, webpage)
+ elif page_type == 'playlist':
+ return self._extract_playlist(url, webpage)
diff --git a/youtube_dl/extractor/quickvid.py b/youtube_dl/extractor/quickvid.py
new file mode 100644
index 000000000..af7d76cf4
--- /dev/null
+++ b/youtube_dl/extractor/quickvid.py
@@ -0,0 +1,53 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_urlparse,
+)
+from ..utils import (
+ determine_ext,
+ int_or_none,
+)
+
+
+class QuickVidIE(InfoExtractor):
+ _VALID_URL = r'https?://(www\.)?quickvid\.org/watch\.php\?v=(?P<id>[a-zA-Z_0-9-]+)'
+ _TEST = {
+ 'url': 'http://quickvid.org/watch.php?v=sUQT3RCG8dx',
+ 'md5': 'c0c72dd473f260c06c808a05d19acdc5',
+ 'info_dict': {
+ 'id': 'sUQT3RCG8dx',
+ 'ext': 'mp4',
+ 'title': 'Nick Offerman\'s Summer Reading Recap',
+ 'thumbnail': 're:^https?://.*\.(?:png|jpg|gif)$',
+ 'view_count': int,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._html_search_regex(r'<h2>(.*?)</h2>', webpage, 'title')
+ view_count = int_or_none(self._html_search_regex(
+ r'(?s)<div id="views">(.*?)</div>',
+ webpage, 'view count', fatal=False))
+ video_code = self._search_regex(
+ r'(?s)<video id="video"[^>]*>(.*?)</video>', webpage, 'video code')
+ formats = [
+ {
+ 'url': compat_urlparse.urljoin(url, src),
+ 'format_id': determine_ext(src, None),
+ } for src in re.findall('<source\s+src="([^"]+)"', video_code)
+ ]
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'view_count': view_count,
+ }
diff --git a/youtube_dl/extractor/radiobremen.py b/youtube_dl/extractor/radiobremen.py
new file mode 100644
index 000000000..0d706312e
--- /dev/null
+++ b/youtube_dl/extractor/radiobremen.py
@@ -0,0 +1,63 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import parse_duration
+
+
+class RadioBremenIE(InfoExtractor):
+ _VALID_URL = r'http?://(?:www\.)?radiobremen\.de/mediathek/(?:index\.html)?\?id=(?P<id>[0-9]+)'
+ IE_NAME = 'radiobremen'
+
+ _TEST = {
+ 'url': 'http://www.radiobremen.de/mediathek/index.html?id=114720',
+ 'info_dict': {
+ 'id': '114720',
+ 'ext': 'mp4',
+ 'duration': 1685,
+ 'width': 512,
+ 'title': 'buten un binnen vom 22. Dezember',
+ 'thumbnail': 're:https?://.*\.jpg$',
+ 'description': 'Unter anderem mit diesen Themen: 45 Flüchtlinge sind in Worpswede angekommen +++ Freies Internet für alle: Bremer arbeiten an einem flächendeckenden W-Lan-Netzwerk +++ Aktivisten kämpfen für das Unibad +++ So war das Wetter 2014 +++',
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ meta_url = "http://www.radiobremen.de/apps/php/mediathek/metadaten.php?id=%s" % video_id
+ meta_doc = self._download_webpage(
+ meta_url, video_id, 'Downloading metadata')
+ title = self._html_search_regex(
+ r"<h1.*>(?P<title>.+)</h1>", meta_doc, "title")
+ description = self._html_search_regex(
+ r"<p>(?P<description>.*)</p>", meta_doc, "description", fatal=False)
+ duration = parse_duration(self._html_search_regex(
+ r"L&auml;nge:</td>\s+<td>(?P<duration>[0-9]+:[0-9]+)</td>",
+ meta_doc, "duration", fatal=False))
+
+ page_doc = self._download_webpage(
+ url, video_id, 'Downloading video information')
+ mobj = re.search(
+ r"ardformatplayerclassic\(\'playerbereich\',\'(?P<width>[0-9]+)\',\'.*\',\'(?P<video_id>[0-9]+)\',\'(?P<secret>[0-9]+)\',\'(?P<thumbnail>.+)\',\'\'\)",
+ page_doc)
+ video_url = (
+ "http://dl-ondemand.radiobremen.de/mediabase/%s/%s_%s_%s.mp4" %
+ (video_id, video_id, mobj.group("secret"), mobj.group('width')))
+
+ formats = [{
+ 'url': video_url,
+ 'ext': 'mp4',
+ 'width': int(mobj.group("width")),
+ }]
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ 'formats': formats,
+ 'thumbnail': mobj.group('thumbnail'),
+ }
diff --git a/youtube_dl/extractor/radiode.py b/youtube_dl/extractor/radiode.py
new file mode 100644
index 000000000..f95bc9454
--- /dev/null
+++ b/youtube_dl/extractor/radiode.py
@@ -0,0 +1,55 @@
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+
+
+class RadioDeIE(InfoExtractor):
+ IE_NAME = 'radio.de'
+ _VALID_URL = r'https?://(?P<id>.+?)\.(?:radio\.(?:de|at|fr|pt|es|pl|it)|rad\.io)'
+ _TEST = {
+ 'url': 'http://ndr2.radio.de/',
+ 'md5': '3b4cdd011bc59174596b6145cda474a4',
+ 'info_dict': {
+ 'id': 'ndr2',
+ 'ext': 'mp3',
+ 'title': 're:^NDR 2 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'description': 'md5:591c49c702db1a33751625ebfb67f273',
+ 'thumbnail': 're:^https?://.*\.png',
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ }
+
+ def _real_extract(self, url):
+ radio_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, radio_id)
+
+ broadcast = json.loads(self._search_regex(
+ r'_getBroadcast\s*=\s*function\(\s*\)\s*{\s*return\s+({.+?})\s*;\s*}',
+ webpage, 'broadcast'))
+
+ title = self._live_title(broadcast['name'])
+ description = broadcast.get('description') or broadcast.get('shortDescription')
+ thumbnail = broadcast.get('picture4Url') or broadcast.get('picture4TransUrl')
+
+ formats = [{
+ 'url': stream['streamUrl'],
+ 'ext': stream['streamContentFormat'].lower(),
+ 'acodec': stream['streamContentFormat'],
+ 'abr': stream['bitRate'],
+ 'asr': stream['sampleRate']
+ } for stream in broadcast['streamUrls']]
+ self._sort_formats(formats)
+
+ return {
+ 'id': radio_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'is_live': True,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py
index ba3dd707f..aa26b7e0b 100644
--- a/youtube_dl/extractor/rai.py
+++ b/youtube_dl/extractor/rai.py
@@ -3,10 +3,12 @@ from __future__ import unicode_literals
import re
from .subtitles import SubtitlesInfoExtractor
+from ..compat import (
+ compat_urllib_parse,
+)
from ..utils import (
parse_duration,
unified_strdate,
- compat_urllib_parse,
)
@@ -119,4 +121,4 @@ class RaiIE(SubtitlesInfoExtractor):
if captions.endswith(STL_EXT):
captions = captions[:-len(STL_EXT)] + SRT_EXT
subtitles['it'] = 'http://www.rai.tv%s' % compat_urllib_parse.quote(captions)
- return subtitles \ No newline at end of file
+ return subtitles
diff --git a/youtube_dl/extractor/rbmaradio.py b/youtube_dl/extractor/rbmaradio.py
index 2c53ed2e1..0f8f3ebde 100644
--- a/youtube_dl/extractor/rbmaradio.py
+++ b/youtube_dl/extractor/rbmaradio.py
@@ -33,7 +33,7 @@ class RBMARadioIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
- webpage, 'json data', flags=re.MULTILINE)
+ webpage, 'json data', flags=re.MULTILINE)
try:
data = json.loads(json_data)
diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py
index d1e12dd8d..846b76c81 100644
--- a/youtube_dl/extractor/redtube.py
+++ b/youtube_dl/extractor/redtube.py
@@ -1,7 +1,5 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
@@ -9,32 +7,23 @@ class RedTubeIE(InfoExtractor):
_VALID_URL = r'http://(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
_TEST = {
'url': 'http://www.redtube.com/66418',
- 'file': '66418.mp4',
- # md5 varies from time to time, as in
- # https://travis-ci.org/rg3/youtube-dl/jobs/14052463#L295
- #'md5': u'7b8c22b5e7098a3e1c09709df1126d2d',
'info_dict': {
+ 'id': '66418',
+ 'ext': 'mp4',
"title": "Sucked on a toilet",
"age_limit": 18,
}
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
-
- video_id = mobj.group('id')
- video_extension = 'mp4'
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- self.report_extraction(video_id)
-
video_url = self._html_search_regex(
- r'<source src="(.+?)" type="video/mp4">', webpage, u'video URL')
-
+ r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL')
video_title = self._html_search_regex(
r'<h1 class="videoTitle[^"]*">(.+?)</h1>',
- webpage, u'title')
-
+ webpage, 'title')
video_thumbnail = self._og_search_thumbnail(webpage)
# No self-labeling, but they describe themselves as
@@ -44,7 +33,7 @@ class RedTubeIE(InfoExtractor):
return {
'id': video_id,
'url': video_url,
- 'ext': video_extension,
+ 'ext': 'mp4',
'title': video_title,
'thumbnail': video_thumbnail,
'age_limit': age_limit,
diff --git a/youtube_dl/extractor/restudy.py b/youtube_dl/extractor/restudy.py
new file mode 100644
index 000000000..b17c2bfc0
--- /dev/null
+++ b/youtube_dl/extractor/restudy.py
@@ -0,0 +1,40 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class RestudyIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?restudy\.dk/video/play/id/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'https://www.restudy.dk/video/play/id/1637',
+ 'info_dict': {
+ 'id': '1637',
+ 'ext': 'flv',
+ 'title': 'Leiden-frosteffekt',
+ 'description': 'Denne video er et eksperiment med flydende kvælstof.',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._og_search_title(webpage).strip()
+ description = self._og_search_description(webpage).strip()
+
+ formats = self._extract_smil_formats(
+ 'https://www.restudy.dk/awsmedia/SmilDirectory/video_%s.xml' % video_id,
+ video_id)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/ringtv.py b/youtube_dl/extractor/ringtv.py
index 9fbdb9fcb..59dc137cc 100644
--- a/youtube_dl/extractor/ringtv.py
+++ b/youtube_dl/extractor/ringtv.py
@@ -41,4 +41,3 @@ class RingTVIE(InfoExtractor):
'thumbnail': thumbnail_url,
'description': description,
}
-
diff --git a/youtube_dl/extractor/ro220.py b/youtube_dl/extractor/ro220.py
index a6ad59465..962b524e9 100644
--- a/youtube_dl/extractor/ro220.py
+++ b/youtube_dl/extractor/ro220.py
@@ -1,43 +1,43 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import (
- clean_html,
- compat_parse_qs,
-)
+from ..compat import compat_urllib_parse_unquote
class Ro220IE(InfoExtractor):
IE_NAME = '220.ro'
- _VALID_URL = r'(?x)(?:https?://)?(?:www\.)?220\.ro/(?P<category>[^/]+)/(?P<shorttitle>[^/]+)/(?P<video_id>[^/]+)'
+ _VALID_URL = r'(?x)(?:https?://)?(?:www\.)?220\.ro/(?P<category>[^/]+)/(?P<shorttitle>[^/]+)/(?P<id>[^/]+)'
_TEST = {
- "url": "http://www.220.ro/sport/Luati-Le-Banii-Sez-4-Ep-1/LYV6doKo7f/",
- 'file': 'LYV6doKo7f.mp4',
+ 'url': 'http://www.220.ro/sport/Luati-Le-Banii-Sez-4-Ep-1/LYV6doKo7f/',
'md5': '03af18b73a07b4088753930db7a34add',
'info_dict': {
- "title": "Luati-le Banii sez 4 ep 1",
- "description": "re:^Iata-ne reveniti dupa o binemeritata vacanta\. +Va astept si pe Facebook cu pareri si comentarii.$",
+ 'id': 'LYV6doKo7f',
+ 'ext': 'mp4',
+ 'title': 'Luati-le Banii sez 4 ep 1',
+ 'description': 're:^Iata-ne reveniti dupa o binemeritata vacanta\. +Va astept si pe Facebook cu pareri si comentarii.$',
}
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('video_id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- flashVars_str = self._search_regex(
- r'<param name="flashVars" value="([^"]+)"',
- webpage, 'flashVars')
- flashVars = compat_parse_qs(flashVars_str)
+ url = compat_urllib_parse_unquote(self._search_regex(
+ r'(?s)clip\s*:\s*{.*?url\s*:\s*\'([^\']+)\'', webpage, 'url'))
+ title = self._og_search_title(webpage)
+ description = self._og_search_description(webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
+
+ formats = [{
+ 'format_id': 'sd',
+ 'url': url,
+ 'ext': 'mp4',
+ }]
return {
- '_type': 'video',
'id': video_id,
- 'ext': 'mp4',
- 'url': flashVars['videoURL'][0],
- 'title': flashVars['title'][0],
- 'description': clean_html(flashVars['desc'][0]),
- 'thumbnail': flashVars['preview'][0],
+ 'formats': formats,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
}
diff --git a/youtube_dl/extractor/rte.py b/youtube_dl/extractor/rte.py
new file mode 100644
index 000000000..04158b993
--- /dev/null
+++ b/youtube_dl/extractor/rte.py
@@ -0,0 +1,62 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+from ..utils import (
+ float_or_none,
+)
+
+
+class RteIE(InfoExtractor):
+    """Information extractor for single shows on the RTE Player.
+
+    Title, description, duration and thumbnail come from the show page's
+    meta tags.  The formats come from an f4m manifest whose location is
+    read from the JSON feed advertised by the page's "feeds-prefix" meta tag.
+    """
+
+    # 'https?' accepts both schemes.  The earlier 'http?' made the final "p"
+    # optional, so it matched "htt://" and rejected "https://".
+    _VALID_URL = r'https?://(?:www\.)?rte\.ie/player/[^/]{2,3}/show/(?P<id>[0-9]+)/'
+    _TEST = {
+        'url': 'http://www.rte.ie/player/de/show/10363114/',
+        'info_dict': {
+            'id': '10363114',
+            'ext': 'mp4',
+            'title': 'One News',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'description': 'The One O\'Clock News followed by Weather.',
+            'duration': 436.844,
+        },
+        'params': {
+            'skip_download': 'f4m fails with --test atm'
+        }
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict (id, title, formats, ...) for the show at *url*."""
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._og_search_title(webpage)
+        description = self._html_search_meta('description', webpage, 'description')
+        # NOTE(review): 1000 is presumably a scale divisor (ms -> s); the test
+        # above expects a fractional duration of 436.844 -- confirm against
+        # float_or_none's signature.
+        duration = float_or_none(self._html_search_meta(
+            'duration', webpage, 'duration', fatal=False), 1000)
+
+        thumbnail_id = self._search_regex(
+            r'<meta name="thumbnail" content="uri:irus:(.*?)" />', webpage, 'thumbnail')
+        thumbnail = 'http://img.rasset.ie/' + thumbnail_id + '.jpg'
+
+        feeds_url = self._html_search_meta("feeds-prefix", webpage, 'feeds url') + video_id
+        feed = self._download_json(feeds_url, video_id)
+
+        # f4m_url = server + relative_url, both taken from the first media
+        # group of the first show in the feed.
+        media = feed['shows'][0]['media:group'][0]
+        f4m_url = media['rte:server'] + media['url']
+
+        # Keep only the fields we need and force the container to mp4.
+        # .get() is used for the dimensions so an entry without them does
+        # not abort the whole extraction.
+        formats = [{
+            'format_id': f['format_id'],
+            'url': f['url'],
+            'ext': 'mp4',
+            'width': f.get('width'),
+            'height': f.get('height'),
+        } for f in self._extract_f4m_formats(f4m_url, video_id)]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+        }
diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py
index 0ab1eb69c..a3ca79f2c 100644
--- a/youtube_dl/extractor/rtlnl.py
+++ b/youtube_dl/extractor/rtlnl.py
@@ -8,7 +8,7 @@ from ..utils import parse_duration
class RtlXlIE(InfoExtractor):
IE_NAME = 'rtlxl.nl'
- _VALID_URL = r'https?://www\.rtlxl\.nl/#!/[^/]+/(?P<uuid>[^/?]+)'
+ _VALID_URL = r'https?://(www\.)?rtlxl\.nl/#!/[^/]+/(?P<uuid>[^/?]+)'
_TEST = {
'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/6e4203a6-0a5e-3596-8424-c599a59e0677',
@@ -38,12 +38,13 @@ class RtlXlIE(InfoExtractor):
progname = info['abstracts'][0]['name']
subtitle = material['title'] or info['episodes'][0]['name']
- videopath = material['videopath']
- f4m_url = 'http://manifest.us.rtl.nl' + videopath
+ # Use unencrypted m3u8 streams (See https://github.com/rg3/youtube-dl/issues/4118)
+ videopath = material['videopath'].replace('.f4m', '.m3u8')
+ m3u8_url = 'http://manifest.us.rtl.nl' + videopath
- formats = self._extract_f4m_formats(f4m_url, uuid)
+ formats = self._extract_m3u8_formats(m3u8_url, uuid, ext='mp4')
- video_urlpart = videopath.split('/flash/')[1][:-4]
+ video_urlpart = videopath.split('/flash/')[1][:-5]
PG_URL_TEMPLATE = 'http://pg.us.rtl.nl/rtlxl/network/%s/progressive/%s.mp4'
formats.extend([
@@ -54,9 +55,12 @@ class RtlXlIE(InfoExtractor):
{
'url': PG_URL_TEMPLATE % ('a3m', video_urlpart),
'format_id': 'pg-hd',
+ 'quality': 0,
}
])
+ self._sort_formats(formats)
+
return {
'id': uuid,
'title': '%s - %s' % (progname, subtitle),
diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py
index a45884b25..285c3c4be 100644
--- a/youtube_dl/extractor/rtlnow.py
+++ b/youtube_dl/extractor/rtlnow.py
@@ -81,7 +81,7 @@ class RTLnowIE(InfoExtractor):
'id': '99205',
'ext': 'flv',
'title': 'Medicopter 117 - Angst!',
- 'description': 'md5:895b1df01639b5f61a04fc305a5cb94d',
+ 'description': 're:^Im Therapiezentrum \'Sonnalm\' kommen durch eine Unachtsamkeit die für die B.handlung mit Phobikern gehaltenen Voglespinnen frei\. Eine Ausreißerin',
'thumbnail': 'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg',
'upload_date': '20080928',
'duration': 2691,
@@ -122,7 +122,7 @@ class RTLnowIE(InfoExtractor):
playerdata = self._download_xml(playerdata_url, video_id, 'Downloading player data XML')
videoinfo = playerdata.find('./playlist/videoinfo')
-
+
formats = []
for filename in videoinfo.findall('filename'):
mobj = re.search(r'(?P<url>rtmpe://(?:[^/]+/){2})(?P<play_path>.+)', filename.text)
@@ -153,4 +153,4 @@ class RTLnowIE(InfoExtractor):
'upload_date': upload_date,
'duration': duration,
'formats': formats,
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/rtp.py b/youtube_dl/extractor/rtp.py
new file mode 100644
index 000000000..7736cabba
--- /dev/null
+++ b/youtube_dl/extractor/rtp.py
@@ -0,0 +1,60 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+from ..utils import js_to_json
+
+
+class RTPIE(InfoExtractor):
+    """Information extractor for RTP Play (rtp.pt/play) programme pages.
+
+    Produces a single RTMP format built from the JavaScript configuration
+    object that the page passes to RTPPLAY.player.newPlayer().
+    """
+
+    _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P<program_id>[0-9]+)/(?P<id>[^/?#]+)/?'
+    _TESTS = [{
+        'url': 'http://www.rtp.pt/play/p405/e174042/paixoes-cruzadas',
+        'info_dict': {
+            'id': 'e174042',
+            'ext': 'mp3',
+            'title': 'Paixões Cruzadas',
+            'description': 'As paixões musicais de António Cartaxo e António Macedo',
+            'thumbnail': 're:^https?://.*\.jpg',
+        },
+        'params': {
+            'skip_download': True,  # RTMP download
+        },
+    }, {
+        'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        """Return the info dict for the programme page at *url*."""
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        # Page-level metadata; only the title lookup is fatal.
+        title = self._html_search_meta(
+            'twitter:title', webpage, display_name='title', fatal=True)
+        description = self._html_search_meta('description', webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        # The player settings are a JS object literal, so they are converted
+        # to JSON before being parsed.
+        raw_config = self._search_regex(
+            r'(?s)RTPPLAY\.player\.newPlayer\(\s*(\{.*?\})\s*\)', webpage, 'player config')
+        config = json.loads(js_to_json(raw_config))
+
+        # Split "file" at its last dot: the left part becomes the play path,
+        # the right part the extension.
+        path, ext = config.get('file').rsplit('.', 1)
+        is_audio = config.get('type') == 'audio'
+
+        rtmp_format = {
+            'url': 'rtmp://{streamer:s}/{application:s}'.format(**config),
+            'app': config.get('application'),
+            'play_path': '{ext:s}:{path:s}'.format(ext=ext, path=path),
+            'page_url': url,
+            'player_url': 'http://programas.rtp.pt/play/player.swf?v3',
+            'rtmp_live': config.get('live', False),
+            'ext': ext,
+            # Audio-only entries are flagged by setting vcodec to 'none'.
+            'vcodec': 'none' if is_audio else None,
+        }
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': [rtmp_format],
+            'description': description,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/rts.py b/youtube_dl/extractor/rts.py
index e8199b114..5e84c1098 100644
--- a/youtube_dl/extractor/rts.py
+++ b/youtube_dl/extractor/rts.py
@@ -4,18 +4,20 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import (
+ compat_str,
+)
from ..utils import (
int_or_none,
parse_duration,
parse_iso8601,
unescapeHTML,
- compat_str,
)
class RTSIE(InfoExtractor):
IE_DESC = 'RTS.ch'
- _VALID_URL = r'^https?://(?:www\.)?rts\.ch/(?:[^/]+/){2,}(?P<id>[0-9]+)-.*?\.html'
+ _VALID_URL = r'https?://(?:www\.)?rts\.ch/(?:(?:[^/]+/){2,}(?P<id>[0-9]+)-(?P<display_id>.+?)\.html|play/tv/[^/]+/video/(?P<display_id_new>.+?)\?id=(?P<id_new>[0-9]+))'
_TESTS = [
{
@@ -23,6 +25,7 @@ class RTSIE(InfoExtractor):
'md5': '753b877968ad8afaeddccc374d4256a5',
'info_dict': {
'id': '3449373',
+ 'display_id': 'les-enfants-terribles',
'ext': 'mp4',
'duration': 1488,
'title': 'Les Enfants Terribles',
@@ -30,7 +33,8 @@ class RTSIE(InfoExtractor):
'uploader': 'Divers',
'upload_date': '19680921',
'timestamp': -40280400,
- 'thumbnail': 're:^https?://.*\.image'
+ 'thumbnail': 're:^https?://.*\.image',
+ 'view_count': int,
},
},
{
@@ -38,6 +42,7 @@ class RTSIE(InfoExtractor):
'md5': 'c148457a27bdc9e5b1ffe081a7a8337b',
'info_dict': {
'id': '5624067',
+ 'display_id': 'entre-ciel-et-mer',
'ext': 'mp4',
'duration': 3720,
'title': 'Les yeux dans les cieux - Mon homard au Canada',
@@ -45,7 +50,8 @@ class RTSIE(InfoExtractor):
'uploader': 'Passe-moi les jumelles',
'upload_date': '20140404',
'timestamp': 1396635300,
- 'thumbnail': 're:^https?://.*\.image'
+ 'thumbnail': 're:^https?://.*\.image',
+ 'view_count': int,
},
},
{
@@ -53,6 +59,7 @@ class RTSIE(InfoExtractor):
'md5': 'b4326fecd3eb64a458ba73c73e91299d',
'info_dict': {
'id': '5745975',
+ 'display_id': '1-2-kloten-fribourg-5-2-second-but-pour-gotteron-par-kwiatowski',
'ext': 'mp4',
'duration': 48,
'title': '1/2, Kloten - Fribourg (5-2): second but pour Gottéron par Kwiatowski',
@@ -60,7 +67,8 @@ class RTSIE(InfoExtractor):
'uploader': 'Hockey',
'upload_date': '20140403',
'timestamp': 1396556882,
- 'thumbnail': 're:^https?://.*\.image'
+ 'thumbnail': 're:^https?://.*\.image',
+ 'view_count': int,
},
'skip': 'Blocked outside Switzerland',
},
@@ -69,6 +77,7 @@ class RTSIE(InfoExtractor):
'md5': '9bb06503773c07ce83d3cbd793cebb91',
'info_dict': {
'id': '5745356',
+ 'display_id': 'londres-cachee-par-un-epais-smog',
'ext': 'mp4',
'duration': 33,
'title': 'Londres cachée par un épais smog',
@@ -76,7 +85,8 @@ class RTSIE(InfoExtractor):
'uploader': 'Le Journal en continu',
'upload_date': '20140403',
'timestamp': 1396537322,
- 'thumbnail': 're:^https?://.*\.image'
+ 'thumbnail': 're:^https?://.*\.image',
+ 'view_count': int,
},
},
{
@@ -84,6 +94,7 @@ class RTSIE(InfoExtractor):
'md5': 'dd8ef6a22dff163d063e2a52bc8adcae',
'info_dict': {
'id': '5706148',
+ 'display_id': 'urban-hippie-de-damien-krisl-03-04-2014',
'ext': 'mp3',
'duration': 123,
'title': '"Urban Hippie", de Damien Krisl',
@@ -92,22 +103,44 @@ class RTSIE(InfoExtractor):
'timestamp': 1396551600,
},
},
+ {
+ 'url': 'http://www.rts.ch/play/tv/-/video/le-19h30?id=6348260',
+ 'md5': '968777c8779e5aa2434be96c54e19743',
+ 'info_dict': {
+ 'id': '6348260',
+ 'display_id': 'le-19h30',
+ 'ext': 'mp4',
+ 'duration': 1796,
+ 'title': 'Le 19h30',
+ 'description': '',
+ 'uploader': 'Le 19h30',
+ 'upload_date': '20141201',
+ 'timestamp': 1417458600,
+ 'thumbnail': 're:^https?://.*\.image',
+ 'view_count': int,
+ },
+ },
+ {
+ 'url': 'http://www.rts.ch/play/tv/le-19h30/video/le-chantier-du-nouveau-parlement-vaudois-a-permis-une-trouvaille-historique?id=6348280',
+ 'only_matching': True,
+ }
]
def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
- video_id = m.group('id')
+ video_id = m.group('id') or m.group('id_new')
+ display_id = m.group('display_id') or m.group('display_id_new')
def download_json(internal_id):
return self._download_json(
'http://www.rts.ch/a/%s.html?f=json/article' % internal_id,
- video_id)
+ display_id)
all_info = download_json(video_id)
# video_id extracted out of URL is not always a real id
if 'video' not in all_info and 'audio' not in all_info:
- page = self._download_webpage(url, video_id)
+ page = self._download_webpage(url, display_id)
internal_id = self._html_search_regex(
r'<(?:video|audio) data-id="([0-9]+)"', page,
'internal video id')
@@ -143,6 +176,7 @@ class RTSIE(InfoExtractor):
return {
'id': video_id,
+ 'display_id': display_id,
'formats': formats,
'title': info['title'],
'description': info.get('intro'),
diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py
index 4dd35a47b..0ce22d60c 100644
--- a/youtube_dl/extractor/rtve.py
+++ b/youtube_dl/extractor/rtve.py
@@ -54,7 +54,6 @@ def _decrypt_url(png):
return url
-
class RTVEALaCartaIE(InfoExtractor):
IE_NAME = 'rtve.es:alacarta'
IE_DESC = 'RTVE a la carta'
diff --git a/youtube_dl/extractor/ruhd.py b/youtube_dl/extractor/ruhd.py
index 55b58e5e6..0e470e73f 100644
--- a/youtube_dl/extractor/ruhd.py
+++ b/youtube_dl/extractor/ruhd.py
@@ -1,8 +1,6 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
@@ -21,19 +19,20 @@ class RUHDIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
video_url = self._html_search_regex(
r'<param name="src" value="([^"]+)"', webpage, 'video url')
title = self._html_search_regex(
- r'<title>([^<]+)&nbsp;&nbsp; RUHD.ru - Видео Высокого качества №1 в России!</title>', webpage, 'title')
+ r'<title>([^<]+)&nbsp;&nbsp; RUHD.ru - Видео Высокого качества №1 в России!</title>',
+ webpage, 'title')
description = self._html_search_regex(
- r'(?s)<div id="longdesc">(.+?)<span id="showlink">', webpage, 'description', fatal=False)
+ r'(?s)<div id="longdesc">(.+?)<span id="showlink">',
+ webpage, 'description', fatal=False)
thumbnail = self._html_search_regex(
- r'<param name="previewImage" value="([^"]+)"', webpage, 'thumbnail', fatal=False)
+ r'<param name="previewImage" value="([^"]+)"',
+ webpage, 'thumbnail', fatal=False)
if thumbnail:
thumbnail = 'http://www.ruhd.ru' + thumbnail
diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py
index 0c8790da2..5b1c3577a 100644
--- a/youtube_dl/extractor/rutube.py
+++ b/youtube_dl/extractor/rutube.py
@@ -5,10 +5,12 @@ import re
import itertools
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_str,
- unified_strdate,
+)
+from ..utils import (
ExtractorError,
+ unified_strdate,
)
@@ -36,9 +38,7 @@ class RutubeIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
video = self._download_json(
'http://rutube.ru/api/video/%s/?format=json' % video_id,
video_id, 'Downloading video JSON')
@@ -53,6 +53,7 @@ class RutubeIE(InfoExtractor):
m3u8_url = options['video_balancer'].get('m3u8')
if m3u8_url is None:
raise ExtractorError('Couldn\'t find m3u8 manifest url')
+ formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')
return {
'id': video['id'],
@@ -60,8 +61,7 @@ class RutubeIE(InfoExtractor):
'description': video['description'],
'duration': video['duration'],
'view_count': video['hits'],
- 'url': m3u8_url,
- 'ext': 'mp4',
+ 'formats': formats,
'thumbnail': video['thumbnail_url'],
'uploader': author.get('name'),
'uploader_id': compat_str(author['id']) if author else None,
@@ -70,6 +70,37 @@ class RutubeIE(InfoExtractor):
}
+class RutubeEmbedIE(InfoExtractor):
+    """Information extractor for Rutube embed URLs (rutube.ru/video/embed/<id>).
+
+    The embed page is only used to find the canonical video URL, which is
+    then handed over to the 'Rutube' extractor.
+    """
+
+    IE_NAME = 'rutube:embed'
+    IE_DESC = 'Rutube embedded videos'
+    # Raw string: the pattern contains '\.' which is not a valid escape
+    # sequence in a normal string literal.
+    _VALID_URL = r'https?://rutube\.ru/video/embed/(?P<id>[0-9]+)'
+
+    _TEST = {
+        'url': 'http://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=',
+        'info_dict': {
+            'id': 'a10e53b86e8f349080f718582ce4c661',
+            'ext': 'mp4',
+            'upload_date': '20131223',
+            'uploader_id': '297833',
+            'description': 'Видео группы ★http://vk.com/foxkidsreset★ музей Fox Kids и Jetix<br/><br/> восстановлено и сделано в шикоформате subziro89 http://vk.com/subziro89',
+            'uploader': 'subziro89 ILya',
+            'title': 'Мистический городок Эйри в Индиан 5 серия озвучка subziro89',
+        },
+        'params': {
+            'skip_download': 'Requires ffmpeg',
+        },
+    }
+
+    def _real_extract(self, url):
+        """Resolve the embed page at *url* to a url_result for its canonical URL."""
+        embed_id = self._match_id(url)
+        webpage = self._download_webpage(url, embed_id)
+
+        # The embed page links to the full video page via <link rel="canonical">.
+        canonical_url = self._html_search_regex(
+            r'<link\s+rel="canonical"\s+href="([^"]+?)"', webpage,
+            'Canonical URL')
+        return self.url_result(canonical_url, 'Rutube')
+
+
class RutubeChannelIE(InfoExtractor):
IE_NAME = 'rutube:channel'
IE_DESC = 'Rutube channels'
@@ -114,8 +145,7 @@ class RutubeMovieIE(RutubeChannelIE):
_PAGE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/video?page=%s&format=json'
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- movie_id = mobj.group('id')
+ movie_id = self._match_id(url)
movie = self._download_json(
self._MOVIE_TEMPLATE % movie_id, movie_id,
'Downloading movie JSON')
diff --git a/youtube_dl/extractor/rutv.py b/youtube_dl/extractor/rutv.py
index f737b4e5f..a73e6f331 100644
--- a/youtube_dl/extractor/rutv.py
+++ b/youtube_dl/extractor/rutv.py
@@ -191,4 +191,4 @@ class RUTVIE(InfoExtractor):
'view_count': view_count,
'duration': duration,
'formats': formats,
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py
index 409f8540a..b8775c2f9 100644
--- a/youtube_dl/extractor/sbs.py
+++ b/youtube_dl/extractor/sbs.py
@@ -27,8 +27,7 @@ class SBSIE(InfoExtractor):
'thumbnail': 're:http://.*\.jpg',
},
'add_ies': ['generic'],
- },
- {
+ }, {
'url': 'http://www.sbs.com.au/ondemand/video/320403011771/Dingo-Conservation-The-Feed',
'only_matching': True,
}]
diff --git a/youtube_dl/extractor/scivee.py b/youtube_dl/extractor/scivee.py
index 55a481cc0..3bf93c870 100644
--- a/youtube_dl/extractor/scivee.py
+++ b/youtube_dl/extractor/scivee.py
@@ -53,4 +53,4 @@ class SciVeeIE(InfoExtractor):
'description': description,
'thumbnail': 'http://www.scivee.tv/assets/videothumb/%s' % video_id,
'formats': formats,
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/screencast.py b/youtube_dl/extractor/screencast.py
index 306869e6a..dfd897ba3 100644
--- a/youtube_dl/extractor/screencast.py
+++ b/youtube_dl/extractor/screencast.py
@@ -1,14 +1,14 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
+from ..compat import (
compat_parse_qs,
compat_urllib_request,
)
+from ..utils import (
+ ExtractorError,
+)
class ScreencastIE(InfoExtractor):
@@ -57,8 +57,7 @@ class ScreencastIE(InfoExtractor):
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
video_url = self._html_search_regex(
@@ -96,7 +95,7 @@ class ScreencastIE(InfoExtractor):
if title is None:
title = self._html_search_regex(
[r'<b>Title:</b> ([^<]*)</div>',
- r'class="tabSeperator">></span><span class="tabText">(.*?)<'],
+ r'class="tabSeperator">></span><span class="tabText">(.*?)<'],
webpage, 'title')
thumbnail = self._og_search_thumbnail(webpage)
description = self._og_search_description(webpage, default=None)
diff --git a/youtube_dl/extractor/screencastomatic.py b/youtube_dl/extractor/screencastomatic.py
new file mode 100644
index 000000000..05337421c
--- /dev/null
+++ b/youtube_dl/extractor/screencastomatic.py
@@ -0,0 +1,49 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+ ExtractorError,
+ js_to_json,
+)
+
+
+class ScreencastOMaticIE(InfoExtractor):
+    """Information extractor for screencast-o-matic.com watch pages.
+
+    The video URL is read from the 'html5' mode of the jwplayer setup
+    object embedded in the page.
+    """
+
+    _VALID_URL = r'https?://screencast-o-matic\.com/watch/(?P<id>[0-9a-zA-Z]+)'
+    _TEST = {
+        'url': 'http://screencast-o-matic.com/watch/c2lD3BeOPl',
+        'md5': '483583cb80d92588f15ccbedd90f0c18',
+        'info_dict': {
+            'id': 'c2lD3BeOPl',
+            'ext': 'mp4',
+            'title': 'Welcome to 3-4 Philosophy @ DECV!',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'description': 'as the title says! also: some general info re 1) VCE philosophy and 2) distance learning.',
+        }
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict for the watch page at *url*.
+
+        Raises ExtractorError when the player setup has no 'html5' mode.
+        """
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        # The argument of jwplayer('mp4Player').setup(...) is a JS object
+        # literal; js_to_json makes it parseable.
+        player_setup = self._parse_json(
+            self._search_regex(
+                r"(?s)jwplayer\('mp4Player'\).setup\((\{.*?\})\);",
+                webpage, 'setup code'),
+            video_id, transform_source=js_to_json)
+
+        # Use the first mode whose type is 'html5'.
+        for mode in player_setup['modes']:
+            if mode.get('type') == 'html5':
+                html5_mode = mode
+                break
+        else:
+            raise ExtractorError('Could not find any video entries!')
+
+        # The file entry may be relative, so resolve it against the page URL.
+        video_url = compat_urlparse.urljoin(url, html5_mode['config']['file'])
+        thumbnail = player_setup.get('image')
+        title = self._og_search_title(webpage)
+        description = self._og_search_description(webpage)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'url': video_url,
+            'ext': 'mp4',
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/screenwavemedia.py b/youtube_dl/extractor/screenwavemedia.py
new file mode 100644
index 000000000..6c9fdb7c1
--- /dev/null
+++ b/youtube_dl/extractor/screenwavemedia.py
@@ -0,0 +1,178 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ unified_strdate,
+)
+
+
+class ScreenwaveMediaIE(InfoExtractor):
+    """Information extractor for player.screenwavemedia.com player pages.
+
+    The player page yields a title and a direct video URL.  When a SMIL
+    video list can also be located, one format per <video> entry is built
+    from it; otherwise the direct URL is the only format.
+    """
+
+    _VALID_URL = r'http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=(?P<id>.+)'
+
+    _TESTS = [{
+        'url': 'http://player.screenwavemedia.com/play/play.php?playerdiv=videoarea&companiondiv=squareAd&id=Cinemassacre-19911',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        """Return the info dict (id, title, formats) for the player page at *url*."""
+        video_id = self._match_id(url)
+        playerdata = self._download_webpage(url, video_id, 'Downloading player webpage')
+
+        # Values in the player data carry JSON-style escaped slashes ("\/").
+        vidtitle = self._search_regex(
+            r'\'vidtitle\'\s*:\s*"([^"]+)"', playerdata, 'vidtitle').replace('\\/', '/')
+        vidurl = self._search_regex(
+            r'\'vidurl\'\s*:\s*"([^"]+)"', playerdata, 'vidurl').replace('\\/', '/')
+
+        videolist_url = None
+
+        mobj = re.search(r"'videoserver'\s*:\s*'(?P<videoserver>[^']+)'", playerdata)
+        if mobj:
+            videoserver = mobj.group('videoserver')
+            # The value is double-quoted, so stop at the next double quote
+            # (as the vidtitle/vidurl patterns do).  The former [^\']+ class
+            # could run past the closing quote.
+            mobj = re.search(r'\'vidid\'\s*:\s*"(?P<vidid>[^"]+)"', playerdata)
+            vidid = mobj.group('vidid') if mobj else video_id
+            videolist_url = 'http://%s/vod/smil:%s.smil/jwplayer.smil' % (videoserver, vidid)
+        else:
+            # No video server announced: look for a ready-made SMIL URL.
+            mobj = re.search(r"file\s*:\s*'(?P<smil>http.+?/jwplayer\.smil)'", playerdata)
+            if mobj:
+                videolist_url = mobj.group('smil')
+
+        if videolist_url:
+            videolist = self._download_xml(videolist_url, video_id, 'Downloading videolist XML')
+            formats = []
+            # Files named in the SMIL are resolved against vidurl's directory.
+            baseurl = vidurl[:vidurl.rfind('/') + 1]
+            for video in videolist.findall('.//video'):
+                src = video.get('src')
+                if not src:
+                    continue
+                # Drop everything up to and including the first ':' of src.
+                file_ = src.partition(':')[-1]
+                width = int_or_none(video.get('width'))
+                height = int_or_none(video.get('height'))
+                bitrate = int_or_none(video.get('system-bitrate'), scale=1000)
+                # Named 'fmt' rather than 'format' to avoid shadowing the builtin.
+                fmt = {
+                    'url': baseurl + file_,
+                    'format_id': src.rpartition('.')[0].rpartition('_')[-1],
+                }
+                if width or height:
+                    fmt.update({
+                        'tbr': bitrate,
+                        'width': width,
+                        'height': height,
+                    })
+                else:
+                    # Entries without dimensions are treated as audio-only.
+                    fmt.update({
+                        'abr': bitrate,
+                        'vcodec': 'none',
+                    })
+                formats.append(fmt)
+        else:
+            formats = [{
+                'url': vidurl,
+            }]
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': vidtitle,
+            'formats': formats,
+        }
+
+
+class CinemassacreIE(InfoExtractor):
+    """Information extractor for cinemassacre.com article pages.
+
+    The page supplies title, description, thumbnail and (from the URL) the
+    upload date.  The media itself is delegated, via a 'url_transparent'
+    result, to the embedded player.screenwavemedia.com player URL.
+    """
+
+    # Raw string: the pattern contains '\.' which is not a valid escape
+    # sequence in a normal string literal.
+    _VALID_URL = r'https?://(?:www\.)?cinemassacre\.com/(?P<date_y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/(?P<display_id>[^?#/]+)'
+    _TESTS = [
+        {
+            'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/',
+            'md5': 'fde81fbafaee331785f58cd6c0d46190',
+            'info_dict': {
+                'id': 'Cinemassacre-19911',
+                'ext': 'mp4',
+                'upload_date': '20121110',
+                'title': '“Angry Video Game Nerd: The Movie” – Trailer',
+                'description': 'md5:fb87405fcb42a331742a0dce2708560b',
+            },
+        },
+        {
+            'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940',
+            'md5': 'd72f10cd39eac4215048f62ab477a511',
+            'info_dict': {
+                'id': 'Cinemassacre-521be8ef82b16',
+                'ext': 'mp4',
+                'upload_date': '20131002',
+                'title': 'The Mummy’s Hand (1940)',
+            },
+        }
+    ]
+
+    def _real_extract(self, url):
+        """Return a url_transparent result pointing at the embedded player URL."""
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('display_id')
+        # The upload date (YYYYMMDD) is taken from the /YYYY/MM/DD/ URL path.
+        video_date = mobj.group('date_y') + mobj.group('date_m') + mobj.group('date_d')
+
+        webpage = self._download_webpage(url, display_id)
+
+        playerdata_url = self._search_regex(
+            r'src="(http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"',
+            webpage, 'player data URL')
+        # The title is the part of <title> before the first '|'.
+        video_title = self._html_search_regex(
+            r'<title>(?P<title>.+?)\|', webpage, 'title')
+        video_description = self._html_search_regex(
+            r'<div class="entry-content">(?P<description>.+?)</div>',
+            webpage, 'description', flags=re.DOTALL, fatal=False)
+        video_thumbnail = self._og_search_thumbnail(webpage)
+
+        return {
+            '_type': 'url_transparent',
+            'display_id': display_id,
+            'title': video_title,
+            'description': video_description,
+            'upload_date': video_date,
+            'thumbnail': video_thumbnail,
+            'url': playerdata_url,
+        }
+
+
+class TeamFourIE(InfoExtractor):
+    """Information extractor for teamfourstar.com video pages.
+
+    Page metadata is combined, via a 'url_transparent' result, with the
+    embedded player.screenwavemedia.com player URL.
+    """
+
+    _VALID_URL = r'https?://(?:www\.)?teamfourstar\.com/video/(?P<id>[a-z0-9\-]+)/?'
+    _TEST = {
+        'url': 'http://teamfourstar.com/video/a-moment-with-tfs-episode-4/',
+        'info_dict': {
+            'id': 'TeamFourStar-5292a02f20bfa',
+            'ext': 'mp4',
+            'upload_date': '20130401',
+            'description': 'Check out this and more on our website: http://teamfourstar.com\nTFS Store: http://sharkrobot.com/team-four-star\nFollow on Twitter: http://twitter.com/teamfourstar\nLike on FB: http://facebook.com/teamfourstar',
+            'title': 'A Moment With TFS Episode 4',
+        }
+    }
+
+    def _real_extract(self, url):
+        """Return a url_transparent result pointing at the embedded player URL."""
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        # URL of the embedded Screenwave player; this lookup is fatal.
+        player_url = self._search_regex(
+            r'src="(http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"',
+            webpage, 'player data URL')
+
+        # Metadata from the page's heading and post body.
+        title = self._html_search_regex(
+            r'<div class="heroheadingtitle">(?P<title>.+?)</div>',
+            webpage, 'title')
+        raw_date = self._html_search_regex(
+            r'<div class="heroheadingdate">(?P<date>.+?)</div>',
+            webpage, 'date', fatal=False)
+        upload_date = unified_strdate(raw_date)
+        description = self._html_search_regex(
+            r'(?s)<div class="postcontent">(?P<description>.+?)</div>',
+            webpage, 'description', fatal=False)
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        result = {
+            '_type': 'url_transparent',
+            'url': player_url,
+            'display_id': display_id,
+        }
+        result.update({
+            'title': title,
+            'description': description,
+            'upload_date': upload_date,
+            'thumbnail': thumbnail,
+        })
+        return result
diff --git a/youtube_dl/extractor/servingsys.py b/youtube_dl/extractor/servingsys.py
index 1dc551d5c..16dc3736b 100644
--- a/youtube_dl/extractor/servingsys.py
+++ b/youtube_dl/extractor/servingsys.py
@@ -67,5 +67,3 @@ class ServingSysIE(InfoExtractor):
'title': title,
'entries': entries,
}
-
- \ No newline at end of file
diff --git a/youtube_dl/extractor/sexu.py b/youtube_dl/extractor/sexu.py
new file mode 100644
index 000000000..6365a8779
--- /dev/null
+++ b/youtube_dl/extractor/sexu.py
@@ -0,0 +1,61 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class SexuIE(InfoExtractor):
+    """Information extractor for sexu.com video pages.
+
+    Formats are read from the "sources: [...]" list embedded in the page;
+    each entry pairs a file URL with a quality label.
+    """
+
+    _VALID_URL = r'https?://(?:www\.)?sexu\.com/(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://sexu.com/961791/',
+        'md5': 'ff615aca9691053c94f8f10d96cd7884',
+        'info_dict': {
+            'id': '961791',
+            'ext': 'mp4',
+            'title': 'md5:4d05a19a5fc049a63dbbaf05fb71d91b',
+            'description': 'md5:c5ed8625eb386855d5a7967bd7b77a54',
+            'categories': list,  # NSFW
+            'thumbnail': 're:https?://.*\.jpg$',
+            'age_limit': 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict for the video page at *url*."""
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        quality_arr = self._search_regex(
+            r'sources:\s*\[([^\]]+)\]', webpage, 'format string')
+
+        formats = []
+        for file_url, label in re.findall(
+                r'"file":"([^"]+)","label":"([^"]+)"', quality_arr):
+            # Take the height from the label's leading digits.  Slicing a
+            # fixed three characters turned "1080p" into 108 and raised on
+            # labels that do not start with a number.
+            height_match = re.match(r'\d+', label)
+            formats.append({
+                # Strip the backslashes that escape slashes in the page source.
+                'url': file_url.replace('\\', ''),
+                'format_id': label,
+                'height': int(height_match.group()) if height_match else None,
+            })
+        self._sort_formats(formats)
+
+        title = self._html_search_regex(
+            r'<title>([^<]+)\s*-\s*Sexu\.Com</title>', webpage, 'title')
+
+        description = self._html_search_meta(
+            'description', webpage, 'description')
+
+        thumbnail = self._html_search_regex(
+            r'image:\s*"([^"]+)"',
+            webpage, 'thumbnail', fatal=False)
+
+        # Categories come from the comma-separated "keywords" meta tag.
+        categories_str = self._html_search_meta(
+            'keywords', webpage, 'categories')
+        categories = (
+            None if categories_str is None
+            else categories_str.split(','))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'categories': categories,
+            'formats': formats,
+            'age_limit': 18,
+        }
diff --git a/youtube_dl/extractor/sexykarma.py b/youtube_dl/extractor/sexykarma.py
index c833fc8ee..6446d26dc 100644
--- a/youtube_dl/extractor/sexykarma.py
+++ b/youtube_dl/extractor/sexykarma.py
@@ -24,7 +24,7 @@ class SexyKarmaIE(InfoExtractor):
'title': 'Taking a quick pee.',
'thumbnail': 're:^https?://.*\.jpg$',
'uploader': 'wildginger7',
- 'upload_date': '20141007',
+ 'upload_date': '20141008',
'duration': 22,
'view_count': int,
'comment_count': int,
@@ -45,6 +45,7 @@ class SexyKarmaIE(InfoExtractor):
'view_count': int,
'comment_count': int,
'categories': list,
+ 'age_limit': 18,
}
}, {
'url': 'http://www.watchindianporn.net/video/desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number-dW2mtctxJfs.html',
@@ -61,6 +62,7 @@ class SexyKarmaIE(InfoExtractor):
'view_count': int,
'comment_count': int,
'categories': list,
+ 'age_limit': 18,
}
}]
@@ -114,4 +116,5 @@ class SexyKarmaIE(InfoExtractor):
'view_count': view_count,
'comment_count': comment_count,
'categories': categories,
+ 'age_limit': 18,
}
diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py
index badba2ac6..26ced716e 100644
--- a/youtube_dl/extractor/shared.py
+++ b/youtube_dl/extractor/shared.py
@@ -4,10 +4,12 @@ import re
import base64
from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse,
+ compat_urllib_request,
+)
from ..utils import (
ExtractorError,
- compat_urllib_request,
- compat_urllib_parse,
int_or_none,
)
@@ -26,26 +28,30 @@ class SharedIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
- page = self._download_webpage(url, video_id)
-
- if re.search(r'>File does not exist<', page) is not None:
- raise ExtractorError('Video %s does not exist' % video_id, expected=True)
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
- download_form = dict(re.findall(r'<input type="hidden" name="([^"]+)" value="([^"]*)"', page))
+ if '>File does not exist<' in webpage:
+ raise ExtractorError(
+ 'Video %s does not exist' % video_id, expected=True)
- request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(download_form))
+ download_form = dict(re.findall(
+ r'<input type="hidden" name="([^"]+)" value="([^"]*)"', webpage))
+ request = compat_urllib_request.Request(
+ url, compat_urllib_parse.urlencode(download_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
- video_page = self._download_webpage(request, video_id, 'Downloading video page')
+ video_page = self._download_webpage(
+ request, video_id, 'Downloading video page')
- video_url = self._html_search_regex(r'data-url="([^"]+)"', video_page, 'video URL')
- title = base64.b64decode(self._html_search_meta('full:title', page, 'title')).decode('utf-8')
- filesize = int_or_none(self._html_search_meta('full:size', page, 'file size', fatal=False))
+ video_url = self._html_search_regex(
+ r'data-url="([^"]+)"', video_page, 'video URL')
+ title = base64.b64decode(self._html_search_meta(
+ 'full:title', webpage, 'title')).decode('utf-8')
+ filesize = int_or_none(self._html_search_meta(
+ 'full:size', webpage, 'file size', fatal=False))
thumbnail = self._html_search_regex(
- r'data-poster="([^"]+)"', video_page, 'thumbnail', fatal=False, default=None)
+ r'data-poster="([^"]+)"', video_page, 'thumbnail', default=None)
return {
'id': video_id,
@@ -54,4 +60,4 @@ class SharedIE(InfoExtractor):
'filesize': filesize,
'title': title,
'thumbnail': thumbnail,
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/sharesix.py b/youtube_dl/extractor/sharesix.py
index 7531e8325..ac3e3adf2 100644
--- a/youtube_dl/extractor/sharesix.py
+++ b/youtube_dl/extractor/sharesix.py
@@ -4,9 +4,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
+)
+from ..utils import (
parse_duration,
)
diff --git a/youtube_dl/extractor/sina.py b/youtube_dl/extractor/sina.py
index 2909ef18b..a63d126d4 100644
--- a/youtube_dl/extractor/sina.py
+++ b/youtube_dl/extractor/sina.py
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_request,
compat_urllib_parse,
)
@@ -46,7 +46,7 @@ class SinaIE(InfoExtractor):
def _extract_video(self, video_id):
data = compat_urllib_parse.urlencode({'vid': video_id})
url_doc = self._download_xml('http://v.iask.com/v_play.php?%s' % data,
- video_id, 'Downloading video url')
+ video_id, 'Downloading video url')
image_page = self._download_webpage(
'http://interface.video.sina.com.cn/interface/common/getVideoImage.php?%s' % data,
video_id, 'Downloading thumbnail info')
diff --git a/youtube_dl/extractor/slideshare.py b/youtube_dl/extractor/slideshare.py
index 53c3c9220..9f79ff5c1 100644
--- a/youtube_dl/extractor/slideshare.py
+++ b/youtube_dl/extractor/slideshare.py
@@ -4,8 +4,10 @@ import re
import json
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urlparse,
+)
+from ..utils import (
ExtractorError,
)
@@ -28,7 +30,7 @@ class SlideshareIE(InfoExtractor):
page_title = mobj.group('title')
webpage = self._download_webpage(url, page_title)
slideshare_obj = self._search_regex(
- r'var slideshare_object = ({.*?}); var user_info =',
+ r'var\s+slideshare_object\s*=\s*({.*?});\s*var\s+user_info\s*=',
webpage, 'slideshare object')
info = json.loads(slideshare_obj)
if info['slideshow']['type'] != 'video':
@@ -39,7 +41,7 @@ class SlideshareIE(InfoExtractor):
ext = info['jsplayer']['video_extension']
video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' + ext)
description = self._html_search_regex(
- r'<p\s+(?:style="[^"]*"\s+)?class="description.*?"[^>]*>(.*?)</p>', webpage,
+ r'(?s)<p[^>]+itemprop="description"[^>]*>(.+?)</p>', webpage,
'description', fatal=False)
return {
diff --git a/youtube_dl/extractor/slutload.py b/youtube_dl/extractor/slutload.py
index e6e7d0865..3df71304d 100644
--- a/youtube_dl/extractor/slutload.py
+++ b/youtube_dl/extractor/slutload.py
@@ -26,7 +26,7 @@ class SlutloadIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
video_title = self._html_search_regex(r'<h1><strong>([^<]+)</strong>',
- webpage, 'title').strip()
+ webpage, 'title').strip()
video_url = self._html_search_regex(
r'(?s)<div id="vidPlayer"\s+data-url="([^"]+)"',
diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py
index 9bd5defa7..26f361c93 100644
--- a/youtube_dl/extractor/smotri.py
+++ b/youtube_dl/extractor/smotri.py
@@ -1,26 +1,27 @@
# encoding: utf-8
from __future__ import unicode_literals
-import os.path
import re
import json
import hashlib
import uuid
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
+)
+from ..utils import (
ExtractorError,
- url_basename,
int_or_none,
+ unified_strdate,
)
class SmotriIE(InfoExtractor):
IE_DESC = 'Smotri.com'
IE_NAME = 'smotri'
- _VALID_URL = r'^https?://(?:www\.)?(?:smotri\.com/video/view/\?id=|pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=)(?P<videoid>v(?P<realvideoid>[0-9]+)[a-z0-9]{4})'
+ _VALID_URL = r'^https?://(?:www\.)?(?:smotri\.com/video/view/\?id=|pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=)(?P<id>v(?P<realvideoid>[0-9]+)[a-z0-9]{4})'
_NETRC_MACHINE = 'smotri'
_TESTS = [
@@ -35,7 +36,6 @@ class SmotriIE(InfoExtractor):
'uploader': 'rbc2008',
'uploader_id': 'rbc08',
'upload_date': '20131118',
- 'description': 'катастрофа с камер видеонаблюдения, видео катастрофа с камер видеонаблюдения',
'thumbnail': 'http://frame6.loadup.ru/8b/a9/2610366.3.3.jpg',
},
},
@@ -50,7 +50,6 @@ class SmotriIE(InfoExtractor):
'uploader': 'Support Photofile@photofile',
'uploader_id': 'support-photofile',
'upload_date': '20070704',
- 'description': 'test, видео test',
'thumbnail': 'http://frame4.loadup.ru/03/ed/57591.2.3.jpg',
},
},
@@ -66,11 +65,11 @@ class SmotriIE(InfoExtractor):
'uploader_id': 'timoxa40',
'upload_date': '20100404',
'thumbnail': 'http://frame7.loadup.ru/af/3f/1390466.3.3.jpg',
- 'description': 'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1, видео TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1',
},
'params': {
'videopassword': 'qwerty',
},
+ 'skip': 'Video is not approved by moderator',
},
# age limit + video-password
{
@@ -85,11 +84,25 @@ class SmotriIE(InfoExtractor):
'upload_date': '20101001',
'thumbnail': 'http://frame3.loadup.ru/75/75/1540889.1.3.jpg',
'age_limit': 18,
- 'description': 'этот ролик не покажут по ТВ, видео этот ролик не покажут по ТВ',
},
'params': {
'videopassword': '333'
- }
+ },
+ 'skip': 'Video is not approved by moderator',
+ },
+ # not approved by moderator, but available
+ {
+ 'url': 'http://smotri.com/video/view/?id=v28888533b73',
+ 'md5': 'f44bc7adac90af518ef1ecf04893bb34',
+ 'info_dict': {
+ 'id': 'v28888533b73',
+ 'ext': 'mp4',
+ 'title': 'Russian Spies Killed By ISIL Child Soldier',
+ 'uploader': 'Mopeder',
+ 'uploader_id': 'mopeder',
+ 'duration': 71,
+ 'thumbnail': 'http://frame9.loadup.ru/d7/32/2888853.2.3.jpg',
+ },
},
# swf player
{
@@ -102,17 +115,11 @@ class SmotriIE(InfoExtractor):
'uploader': 'HannahL',
'uploader_id': 'lisaha95',
'upload_date': '20090331',
- 'description': 'Shakira - Don\'t Bother, видео Shakira - Don\'t Bother',
'thumbnail': 'http://frame8.loadup.ru/44/0b/918809.7.3.jpg',
},
},
]
- _SUCCESS = 0
- _PASSWORD_NOT_VERIFIED = 1
- _PASSWORD_DETECTED = 2
- _VIDEO_NOT_FOUND = 3
-
@classmethod
def _extract_url(cls, webpage):
mobj = re.search(
@@ -137,44 +144,47 @@ class SmotriIE(InfoExtractor):
return self._html_search_meta(name, html, display_name)
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('videoid')
- real_video_id = mobj.group('realvideoid')
-
- # Download video JSON data
- video_json_url = 'http://smotri.com/vt.php?id=%s' % real_video_id
- video_json_page = self._download_webpage(video_json_url, video_id, 'Downloading video JSON')
- video_json = json.loads(video_json_page)
-
- status = video_json['status']
- if status == self._VIDEO_NOT_FOUND:
- raise ExtractorError('Video %s does not exist' % video_id, expected=True)
- elif status == self._PASSWORD_DETECTED: # The video is protected by a password, retry with
- # video-password set
- video_password = self._downloader.params.get('videopassword', None)
- if not video_password:
- raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True)
- video_json_url += '&md5pass=%s' % hashlib.md5(video_password.encode('utf-8')).hexdigest()
- video_json_page = self._download_webpage(video_json_url, video_id, 'Downloading video JSON (video-password set)')
- video_json = json.loads(video_json_page)
- status = video_json['status']
- if status == self._PASSWORD_NOT_VERIFIED:
- raise ExtractorError('Video password is invalid', expected=True)
-
- if status != self._SUCCESS:
- raise ExtractorError('Unexpected status value %s' % status)
-
- # Extract the URL of the video
- video_url = video_json['file_data']
+ video_id = self._match_id(url)
+
+ video_form = {
+ 'ticket': video_id,
+ 'video_url': '1',
+ 'frame_url': '1',
+ 'devid': 'LoadupFlashPlayer',
+ 'getvideoinfo': '1',
+ }
+
+ request = compat_urllib_request.Request(
+ 'http://smotri.com/video/view/url/bot/', compat_urllib_parse.urlencode(video_form))
+ request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+
+ video = self._download_json(request, video_id, 'Downloading video JSON')
+
+ video_url = video.get('_vidURL') or video.get('_vidURL_mp4')
+
+ if not video_url:
+ if video.get('_moderate_no') or not video.get('moderated'):
+ raise ExtractorError(
+ 'Video %s has not been approved by moderator' % video_id, expected=True)
+
+ if video.get('error'):
+ raise ExtractorError('Video %s does not exist' % video_id, expected=True)
+
+ title = video['title']
+ thumbnail = video['_imgURL']
+ upload_date = unified_strdate(video['added'])
+ uploader = video['userNick']
+ uploader_id = video['userLogin']
+ duration = int_or_none(video['duration'])
# Video JSON does not provide enough meta data
# We will extract some from the video web page instead
- video_page_url = 'http://smotri.com/video/view/?id=%s' % video_id
- video_page = self._download_webpage(video_page_url, video_id, 'Downloading video page')
+ webpage_url = 'http://smotri.com/video/view/?id=%s' % video_id
+ webpage = self._download_webpage(webpage_url, video_id, 'Downloading video page')
# Warning if video is unavailable
warning = self._html_search_regex(
- r'<div class="videoUnModer">(.*?)</div>', video_page,
+ r'<div class="videoUnModer">(.*?)</div>', webpage,
'warning message', default=None)
if warning is not None:
self._downloader.report_warning(
@@ -182,84 +192,32 @@ class SmotriIE(InfoExtractor):
(video_id, warning))
# Adult content
- if re.search('EroConfirmText">', video_page) is not None:
+ if re.search('EroConfirmText">', webpage) is not None:
self.report_age_confirmation()
confirm_string = self._html_search_regex(
r'<a href="/video/view/\?id=%s&confirm=([^"]+)" title="[^"]+">' % video_id,
- video_page, 'confirm string')
- confirm_url = video_page_url + '&confirm=%s' % confirm_string
- video_page = self._download_webpage(confirm_url, video_id, 'Downloading video page (age confirmed)')
+ webpage, 'confirm string')
+ confirm_url = webpage_url + '&confirm=%s' % confirm_string
+ webpage = self._download_webpage(confirm_url, video_id, 'Downloading video page (age confirmed)')
adult_content = True
else:
adult_content = False
- # Extract the rest of meta data
- video_title = self._search_meta('name', video_page, 'title')
- if not video_title:
- video_title = os.path.splitext(url_basename(video_url))[0]
-
- video_description = self._search_meta('description', video_page)
- END_TEXT = ' на сайте Smotri.com'
- if video_description and video_description.endswith(END_TEXT):
- video_description = video_description[:-len(END_TEXT)]
- START_TEXT = 'Смотреть онлайн ролик '
- if video_description and video_description.startswith(START_TEXT):
- video_description = video_description[len(START_TEXT):]
- video_thumbnail = self._search_meta('thumbnail', video_page)
-
- upload_date_str = self._search_meta('uploadDate', video_page, 'upload date')
- if upload_date_str:
- upload_date_m = re.search(r'(?P<year>\d{4})\.(?P<month>\d{2})\.(?P<day>\d{2})T', upload_date_str)
- video_upload_date = (
- (
- upload_date_m.group('year') +
- upload_date_m.group('month') +
- upload_date_m.group('day')
- )
- if upload_date_m else None
- )
- else:
- video_upload_date = None
-
- duration_str = self._search_meta('duration', video_page)
- if duration_str:
- duration_m = re.search(r'T(?P<hours>[0-9]{2})H(?P<minutes>[0-9]{2})M(?P<seconds>[0-9]{2})S', duration_str)
- video_duration = (
- (
- (int(duration_m.group('hours')) * 60 * 60) +
- (int(duration_m.group('minutes')) * 60) +
- int(duration_m.group('seconds'))
- )
- if duration_m else None
- )
- else:
- video_duration = None
-
- video_uploader = self._html_search_regex(
- '<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info[^"]+">(.*?)</a>',
- video_page, 'uploader', fatal=False, flags=re.MULTILINE|re.DOTALL)
-
- video_uploader_id = self._html_search_regex(
- '<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info\\(.*?\'([^\']+)\'\\);">',
- video_page, 'uploader id', fatal=False, flags=re.MULTILINE|re.DOTALL)
-
- video_view_count = self._html_search_regex(
+ view_count = self._html_search_regex(
'Общее количество просмотров.*?<span class="Number">(\\d+)</span>',
- video_page, 'view count', fatal=False, flags=re.MULTILINE|re.DOTALL)
+ webpage, 'view count', fatal=False, flags=re.MULTILINE | re.DOTALL)
return {
'id': video_id,
'url': video_url,
- 'title': video_title,
- 'thumbnail': video_thumbnail,
- 'description': video_description,
- 'uploader': video_uploader,
- 'upload_date': video_upload_date,
- 'uploader_id': video_uploader_id,
- 'duration': video_duration,
- 'view_count': int_or_none(video_view_count),
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'uploader': uploader,
+ 'upload_date': upload_date,
+ 'uploader_id': uploader_id,
+ 'duration': duration,
+ 'view_count': int_or_none(view_count),
'age_limit': 18 if adult_content else 0,
- 'video_page_url': video_page_url
}
@@ -275,7 +233,7 @@ class SmotriCommunityIE(InfoExtractor):
},
'playlist_mincount': 4,
}
-
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
community_id = mobj.group('communityid')
@@ -337,15 +295,18 @@ class SmotriBroadcastIE(InfoExtractor):
broadcast_page = self._download_webpage(broadcast_url, broadcast_id, 'Downloading broadcast page')
if re.search('>Режиссер с логином <br/>"%s"<br/> <span>не существует<' % broadcast_id, broadcast_page) is not None:
- raise ExtractorError('Broadcast %s does not exist' % broadcast_id, expected=True)
+ raise ExtractorError(
+ 'Broadcast %s does not exist' % broadcast_id, expected=True)
# Adult content
if re.search('EroConfirmText">', broadcast_page) is not None:
(username, password) = self._get_login_info()
if username is None:
- raise ExtractorError('Erotic broadcasts allowed only for registered users, '
- 'use --username and --password options to provide account credentials.', expected=True)
+ raise ExtractorError(
+ 'Erotic broadcasts allowed only for registered users, '
+ 'use --username and --password options to provide account credentials.',
+ expected=True)
login_form = {
'login-hint53': '1',
@@ -354,9 +315,11 @@ class SmotriBroadcastIE(InfoExtractor):
'password': password,
}
- request = compat_urllib_request.Request(broadcast_url + '/?no_redirect=1', compat_urllib_parse.urlencode(login_form))
+ request = compat_urllib_request.Request(
+ broadcast_url + '/?no_redirect=1', compat_urllib_parse.urlencode(login_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
- broadcast_page = self._download_webpage(request, broadcast_id, 'Logging in and confirming age')
+ broadcast_page = self._download_webpage(
+ request, broadcast_id, 'Logging in and confirming age')
if re.search('>Неверный логин или пароль<', broadcast_page) is not None:
raise ExtractorError('Unable to log in: bad username or password', expected=True)
@@ -366,7 +329,7 @@ class SmotriBroadcastIE(InfoExtractor):
adult_content = False
ticket = self._html_search_regex(
- 'window\.broadcast_control\.addFlashVar\\(\'file\', \'([^\']+)\'\\);',
+ r"window\.broadcast_control\.addFlashVar\('file'\s*,\s*'([^']+)'\)",
broadcast_page, 'broadcast ticket')
url = 'http://smotri.com/broadcast/view/url/?ticket=%s' % ticket
@@ -375,26 +338,31 @@ class SmotriBroadcastIE(InfoExtractor):
if broadcast_password:
url += '&pass=%s' % hashlib.md5(broadcast_password.encode('utf-8')).hexdigest()
- broadcast_json_page = self._download_webpage(url, broadcast_id, 'Downloading broadcast JSON')
+ broadcast_json_page = self._download_webpage(
+ url, broadcast_id, 'Downloading broadcast JSON')
try:
broadcast_json = json.loads(broadcast_json_page)
protected_broadcast = broadcast_json['_pass_protected'] == 1
if protected_broadcast and not broadcast_password:
- raise ExtractorError('This broadcast is protected by a password, use the --video-password option', expected=True)
+ raise ExtractorError(
+ 'This broadcast is protected by a password, use the --video-password option',
+ expected=True)
broadcast_offline = broadcast_json['is_play'] == 0
if broadcast_offline:
raise ExtractorError('Broadcast %s is offline' % broadcast_id, expected=True)
rtmp_url = broadcast_json['_server']
- if not rtmp_url.startswith('rtmp://'):
+ mobj = re.search(r'^rtmp://[^/]+/(?P<app>.+)/?$', rtmp_url)
+ if not mobj:
raise ExtractorError('Unexpected broadcast rtmp URL')
broadcast_playpath = broadcast_json['_streamName']
+ broadcast_app = '%s/%s' % (mobj.group('app'), broadcast_json['_vidURL'])
broadcast_thumbnail = broadcast_json['_imgURL']
- broadcast_title = broadcast_json['title']
+ broadcast_title = self._live_title(broadcast_json['title'])
broadcast_description = broadcast_json['description']
broadcaster_nick = broadcast_json['nick']
broadcaster_login = broadcast_json['login']
@@ -415,6 +383,9 @@ class SmotriBroadcastIE(InfoExtractor):
'age_limit': 18 if adult_content else 0,
'ext': 'flv',
'play_path': broadcast_playpath,
+ 'player_url': 'http://pics.smotri.com/broadcast_play.swf',
+ 'app': broadcast_app,
'rtmp_live': True,
- 'rtmp_conn': rtmp_conn
+ 'rtmp_conn': rtmp_conn,
+ 'is_live': True,
}
diff --git a/youtube_dl/extractor/sockshare.py b/youtube_dl/extractor/sockshare.py
index c663e56d4..7d3c0e937 100644
--- a/youtube_dl/extractor/sockshare.py
+++ b/youtube_dl/extractor/sockshare.py
@@ -1,13 +1,16 @@
# coding: utf-8
from __future__ import unicode_literals
-from ..utils import (
- ExtractorError,
+import re
+
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
+)
+from ..utils import (
determine_ext,
+ ExtractorError,
)
-import re
from .common import InfoExtractor
@@ -27,9 +30,7 @@ class SockshareIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
url = 'http://sockshare.com/file/%s' % video_id
webpage = self._download_webpage(url, video_id)
diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py
index bebcafb62..c04791997 100644
--- a/youtube_dl/extractor/sohu.py
+++ b/youtube_dl/extractor/sohu.py
@@ -1,23 +1,24 @@
# encoding: utf-8
+from __future__ import unicode_literals
-import json
import re
from .common import InfoExtractor
-from ..utils import ExtractorError
+from .common import compat_str
class SohuIE(InfoExtractor):
_VALID_URL = r'https?://(?P<mytv>my\.)?tv\.sohu\.com/.+?/(?(mytv)|n)(?P<id>\d+)\.shtml.*?'
_TEST = {
- u'url': u'http://tv.sohu.com/20130724/n382479172.shtml#super',
- u'file': u'382479172.mp4',
- u'md5': u'bde8d9a6ffd82c63a1eefaef4eeefec7',
- u'info_dict': {
- u'title': u'MV:Far East Movement《The Illest》',
+ 'url': 'http://tv.sohu.com/20130724/n382479172.shtml#super',
+ 'md5': 'bde8d9a6ffd82c63a1eefaef4eeefec7',
+ 'info_dict': {
+ 'id': '382479172',
+ 'ext': 'mp4',
+ 'title': 'MV:Far East Movement《The Illest》',
},
- u'skip': u'Only available from China',
+ 'skip': 'Only available from China',
}
def _real_extract(self, url):
@@ -26,61 +27,74 @@ class SohuIE(InfoExtractor):
if mytv:
base_data_url = 'http://my.tv.sohu.com/play/videonew.do?vid='
else:
- base_data_url = u'http://hot.vrs.sohu.com/vrs_flash.action?vid='
- data_url = base_data_url + str(vid_id)
- data_json = self._download_webpage(
- data_url, video_id,
- note=u'Downloading JSON data for ' + str(vid_id))
- return json.loads(data_json)
+ base_data_url = 'http://hot.vrs.sohu.com/vrs_flash.action?vid='
+
+ return self._download_json(
+ base_data_url + vid_id, video_id,
+ 'Downloading JSON data for %s' % vid_id)
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
mytv = mobj.group('mytv') is not None
webpage = self._download_webpage(url, video_id)
- raw_title = self._html_search_regex(r'(?s)<title>(.+?)</title>',
- webpage, u'video title')
+ raw_title = self._html_search_regex(
+ r'(?s)<title>(.+?)</title>',
+ webpage, 'video title')
title = raw_title.partition('-')[0].strip()
- vid = self._html_search_regex(r'var vid ?= ?["\'](\d+)["\']', webpage,
- u'video path')
- data = _fetch_data(vid, mytv)
-
- QUALITIES = ('ori', 'super', 'high', 'nor')
- vid_ids = [data['data'][q + 'Vid']
- for q in QUALITIES
- if data['data'][q + 'Vid'] != 0]
- if not vid_ids:
- raise ExtractorError(u'No formats available for this video')
+ vid = self._html_search_regex(
+ r'var vid ?= ?["\'](\d+)["\']',
+ webpage, 'video path')
+ vid_data = _fetch_data(vid, mytv)
- # For now, we just pick the highest available quality
- vid_id = vid_ids[-1]
+ formats_json = {}
+ for format_id in ('nor', 'high', 'super', 'ori', 'h2644k', 'h2654k'):
+ vid_id = vid_data['data'].get('%sVid' % format_id)
+ if not vid_id:
+ continue
+ vid_id = compat_str(vid_id)
+ formats_json[format_id] = vid_data if vid == vid_id else _fetch_data(vid_id, mytv)
- format_data = data if vid == vid_id else _fetch_data(vid_id, mytv)
- part_count = format_data['data']['totalBlocks']
- allot = format_data['allot']
- prot = format_data['prot']
- clipsURL = format_data['data']['clipsURL']
- su = format_data['data']['su']
+ part_count = vid_data['data']['totalBlocks']
playlist = []
for i in range(part_count):
- part_url = ('http://%s/?prot=%s&file=%s&new=%s' %
- (allot, prot, clipsURL[i], su[i]))
- part_str = self._download_webpage(
- part_url, video_id,
- note=u'Downloading part %d of %d' % (i+1, part_count))
-
- part_info = part_str.split('|')
- video_url = '%s%s?key=%s' % (part_info[0], su[i], part_info[3])
-
- video_info = {
- 'id': '%s_part%02d' % (video_id, i + 1),
+ formats = []
+ for format_id, format_data in formats_json.items():
+ allot = format_data['allot']
+ prot = format_data['prot']
+
+ data = format_data['data']
+ clips_url = data['clipsURL']
+ su = data['su']
+
+ part_str = self._download_webpage(
+ 'http://%s/?prot=%s&file=%s&new=%s' %
+ (allot, prot, clips_url[i], su[i]),
+ video_id,
+ 'Downloading %s video URL part %d of %d'
+ % (format_id, i + 1, part_count))
+
+ part_info = part_str.split('|')
+ video_url = '%s%s?key=%s' % (part_info[0], su[i], part_info[3])
+
+ formats.append({
+ 'url': video_url,
+ 'format_id': format_id,
+ 'filesize': data['clipsBytes'][i],
+ 'width': data['width'],
+ 'height': data['height'],
+ 'fps': data['fps'],
+ })
+ self._sort_formats(formats)
+
+ playlist.append({
+ 'id': '%s_part%d' % (video_id, i + 1),
'title': title,
- 'url': video_url,
- 'ext': 'mp4',
- }
- playlist.append(video_info)
+ 'duration': vid_data['data']['clipsDuration'][i],
+ 'formats': formats,
+ })
if len(playlist) == 1:
info = playlist[0]
diff --git a/youtube_dl/extractor/soulanime.py b/youtube_dl/extractor/soulanime.py
new file mode 100644
index 000000000..feef33e27
--- /dev/null
+++ b/youtube_dl/extractor/soulanime.py
@@ -0,0 +1,80 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ HEADRequest,
+ urlhandle_detect_ext,
+)
+
+
+class SoulAnimeWatchingIE(InfoExtractor):
+ IE_NAME = "soulanime:watching"
+ IE_DESC = "SoulAnime video"
+ _TEST = {
+ 'url': 'http://www.soul-anime.net/watching/seirei-tsukai-no-blade-dance-episode-9/',
+ 'md5': '05fae04abf72298098b528e98abf4298',
+ 'info_dict': {
+ 'id': 'seirei-tsukai-no-blade-dance-episode-9',
+ 'ext': 'mp4',
+ 'title': 'seirei-tsukai-no-blade-dance-episode-9',
+ 'description': 'seirei-tsukai-no-blade-dance-episode-9'
+ }
+ }
+ _VALID_URL = r'http://[w.]*soul-anime\.(?P<domain>[^/]+)/watch[^/]*/(?P<id>[^/]+)'
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ domain = mobj.group('domain')
+
+ page = self._download_webpage(url, video_id)
+
+ video_url_encoded = self._html_search_regex(
+ r'<div id="download">[^<]*<a href="(?P<url>[^"]+)"', page, 'url')
+ video_url = "http://www.soul-anime." + domain + video_url_encoded
+
+ ext_req = HEADRequest(video_url)
+ ext_handle = self._request_webpage(
+ ext_req, video_id, note='Determining extension')
+ ext = urlhandle_detect_ext(ext_handle)
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'ext': ext,
+ 'title': video_id,
+ 'description': video_id
+ }
+
+
+class SoulAnimeSeriesIE(InfoExtractor):
+ IE_NAME = "soulanime:series"
+ IE_DESC = "SoulAnime Series"
+
+ _VALID_URL = r'http://[w.]*soul-anime\.(?P<domain>[^/]+)/anime./(?P<id>[^/]+)'
+
+ _EPISODE_REGEX = r'<option value="(/watch[^/]*/[^"]+)">[^<]*</option>'
+
+ _TEST = {
+ 'url': 'http://www.soul-anime.net/anime1/black-rock-shooter-tv/',
+ 'info_dict': {
+ 'id': 'black-rock-shooter-tv'
+ },
+ 'playlist_count': 8
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ series_id = mobj.group('id')
+ domain = mobj.group('domain')
+
+ pattern = re.compile(self._EPISODE_REGEX)
+
+ page = self._download_webpage(url, series_id, "Downloading series page")
+ mobj = pattern.findall(page)
+
+ entries = [self.url_result("http://www.soul-anime." + domain + obj) for obj in mobj]
+
+ return self.playlist_result(entries, series_id)
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py
index 4719ba45c..5d60c4939 100644
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -5,11 +5,12 @@ import re
import itertools
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_str,
compat_urlparse,
compat_urllib_parse,
-
+)
+from ..utils import (
ExtractorError,
int_or_none,
unified_strdate,
@@ -32,7 +33,7 @@ class SoundcloudIE(InfoExtractor):
(?P<title>[\w\d-]+)/?
(?P<token>[^?]+?)?(?:[?].*)?$)
|(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+)
- (?:/?\?secret_token=(?P<secret_token>[^&]+?))?$)
+ (?:/?\?secret_token=(?P<secret_token>[^&]+))?)
|(?P<player>(?:w|player|p.)\.soundcloud\.com/player/?.*?url=.*)
)
'''
@@ -40,14 +41,15 @@ class SoundcloudIE(InfoExtractor):
_TESTS = [
{
'url': 'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy',
- 'file': '62986583.mp3',
'md5': 'ebef0a451b909710ed1d7787dddbf0d7',
'info_dict': {
- "upload_date": "20121011",
- "description": "No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd",
- "uploader": "E.T. ExTerrestrial Music",
- "title": "Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1",
- "duration": 143,
+ 'id': '62986583',
+ 'ext': 'mp3',
+ 'upload_date': '20121011',
+ 'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d',
+ 'uploader': 'E.T. ExTerrestrial Music',
+ 'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1',
+ 'duration': 143,
}
},
# not streamable song
@@ -103,7 +105,7 @@ class SoundcloudIE(InfoExtractor):
'id': '128590877',
'ext': 'mp3',
'title': 'Bus Brakes',
- 'description': 'md5:0170be75dd395c96025d210d261c784e',
+ 'description': 'md5:0053ca6396e8d2fd7b7e1595ef12ab66',
'uploader': 'oddsamples',
'upload_date': '20140109',
'duration': 17,
@@ -140,6 +142,7 @@ class SoundcloudIE(InfoExtractor):
'description': info['description'],
'thumbnail': thumbnail,
'duration': int_or_none(info.get('duration'), 1000),
+ 'webpage_url': info.get('permalink_url'),
}
formats = []
if info.get('downloadable', False):
@@ -157,7 +160,7 @@ class SoundcloudIE(InfoExtractor):
# We have to retrieve the url
streams_url = ('http://api.soundcloud.com/i1/tracks/{0}/streams?'
- 'client_id={1}&secret_token={2}'.format(track_id, self._IPHONE_CLIENT_ID, secret_token))
+ 'client_id={1}&secret_token={2}'.format(track_id, self._IPHONE_CLIENT_ID, secret_token))
format_dict = self._download_json(
streams_url,
track_id, 'Downloading track url')
@@ -222,14 +225,14 @@ class SoundcloudIE(InfoExtractor):
# extract uploader (which is in the url)
uploader = mobj.group('uploader')
# extract simple title (uploader + slug of song title)
- slug_title = mobj.group('title')
+ slug_title = mobj.group('title')
token = mobj.group('token')
full_title = resolve_title = '%s/%s' % (uploader, slug_title)
if token:
resolve_title += '/%s' % token
-
+
self.report_resolve(full_title)
-
+
url = 'http://soundcloud.com/%s' % resolve_title
info_json_url = self._resolv_url(url)
info = self._download_json(info_json_url, full_title, 'Downloading info JSON')
@@ -369,7 +372,7 @@ class SoundcloudPlaylistIE(SoundcloudIE):
entries = [
self._extract_info_dict(t, quiet=True, secret_token=token)
- for t in data['tracks']]
+ for t in data['tracks']]
return {
'_type': 'playlist',
diff --git a/youtube_dl/extractor/space.py b/youtube_dl/extractor/space.py
index d34aefeaa..c2d0d36a6 100644
--- a/youtube_dl/extractor/space.py
+++ b/youtube_dl/extractor/space.py
@@ -33,5 +33,6 @@ class SpaceIE(InfoExtractor):
# Other videos works fine with the info from the object
brightcove_url = BrightcoveIE._extract_brightcove_url(webpage)
if brightcove_url is None:
- raise ExtractorError(u'The webpage does not contain a video', expected=True)
+ raise ExtractorError(
+ 'The webpage does not contain a video', expected=True)
return self.url_result(brightcove_url, BrightcoveIE.ie_key())
diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py
index 94602e89e..b936202f6 100644
--- a/youtube_dl/extractor/spankwire.py
+++ b/youtube_dl/extractor/spankwire.py
@@ -3,12 +3,14 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
+ compat_urllib_parse,
compat_urllib_parse_urlparse,
compat_urllib_request,
- compat_urllib_parse,
- unified_strdate,
+)
+from ..utils import (
str_to_int,
+ unified_strdate,
)
from ..aes import aes_decrypt_text
diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py
index 9ed7d3b39..f345883c7 100644
--- a/youtube_dl/extractor/spiegel.py
+++ b/youtube_dl/extractor/spiegel.py
@@ -4,11 +4,19 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import compat_urlparse
+from ..compat import (
+ compat_urlparse,
+ compat_HTTPError,
+)
+from ..utils import (
+ HEADRequest,
+ ExtractorError,
+)
+from .spiegeltv import SpiegeltvIE
class SpiegelIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
+ _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<id>[0-9]+)(?:-embed)?(?:\.html)?(?:#.*)?$'
_TESTS = [{
'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html',
'md5': '2c2754212136f35fb4b19767d242f66e',
@@ -29,16 +37,28 @@ class SpiegelIE(InfoExtractor):
'description': 'md5:c2322b65e58f385a820c10fa03b2d088',
'duration': 983,
},
+ }, {
+ 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html',
+ 'md5': 'd8eeca6bfc8f1cd6f490eb1f44695d51',
+ 'info_dict': {
+ 'id': '1519126',
+ 'ext': 'mp4',
+ 'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.',
+ 'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"',
+ }
}]
def _real_extract(self, url):
- m = re.match(self._VALID_URL, url)
- video_id = m.group('videoID')
+ video_id = self._match_id(url)
+ webpage, handle = self._download_webpage_handle(url, video_id)
- webpage = self._download_webpage(url, video_id)
+ # 302 to spiegel.tv, like http://www.spiegel.de/video/der-film-zum-wochenende-die-wahrheit-ueber-maenner-video-99003272.html
+ if SpiegeltvIE.suitable(handle.geturl()):
+ return self.url_result(handle.geturl(), 'Spiegeltv')
- title = self._html_search_regex(
- r'<div class="module-title">(.*?)</div>', webpage, 'title')
+ title = re.sub(r'\s+', ' ', self._html_search_regex(
+ r'(?s)<(?:h1|div) class="module-title"[^>]*>(.*?)</(?:h1|div)>',
+ webpage, 'title'))
description = self._html_search_meta('description', webpage, 'description')
base_url = self._search_regex(
@@ -47,21 +67,31 @@ class SpiegelIE(InfoExtractor):
xml_url = base_url + video_id + '.xml'
idoc = self._download_xml(xml_url, video_id)
- formats = [
- {
- 'format_id': n.tag.rpartition('type')[2],
- 'url': base_url + n.find('./filename').text,
- 'width': int(n.find('./width').text),
- 'height': int(n.find('./height').text),
- 'abr': int(n.find('./audiobitrate').text),
- 'vbr': int(n.find('./videobitrate').text),
- 'vcodec': n.find('./codec').text,
- 'acodec': 'MP4A',
- }
- for n in list(idoc)
- # Blacklist type 6, it's extremely LQ and not available on the same server
- if n.tag.startswith('type') and n.tag != 'type6'
- ]
+ formats = []
+ for n in list(idoc):
+ if n.tag.startswith('type') and n.tag != 'type6':
+ format_id = n.tag.rpartition('type')[2]
+ video_url = base_url + n.find('./filename').text
+ # Test video URLs beforehand as some of them are invalid
+ try:
+ self._request_webpage(
+ HEADRequest(video_url), video_id,
+ 'Checking %s video URL' % format_id)
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
+ self.report_warning(
+ '%s video URL is invalid, skipping' % format_id, video_id)
+ continue
+ formats.append({
+ 'format_id': format_id,
+ 'url': video_url,
+ 'width': int(n.find('./width').text),
+ 'height': int(n.find('./height').text),
+ 'abr': int(n.find('./audiobitrate').text),
+ 'vbr': int(n.find('./videobitrate').text),
+ 'vcodec': n.find('./codec').text,
+ 'acodec': 'MP4A',
+ })
duration = float(idoc[0].findall('./duration')[0].text)
self._sort_formats(formats)
@@ -79,7 +109,7 @@ class SpiegelArticleIE(InfoExtractor):
_VALID_URL = 'https?://www\.spiegel\.de/(?!video/)[^?#]*?-(?P<id>[0-9]+)\.html'
IE_NAME = 'Spiegel:Article'
IE_DESC = 'Articles on spiegel.de'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.spiegel.de/sport/sonst/badminton-wm-die-randsportart-soll-populaerer-werden-a-987092.html',
'info_dict': {
'id': '1516455',
@@ -87,20 +117,34 @@ class SpiegelArticleIE(InfoExtractor):
'title': 'Faszination Badminton: Nennt es bloß nicht Federball',
'description': 're:^Patrick Kämnitz gehört.{100,}',
},
- }
+ }, {
+ 'url': 'http://www.spiegel.de/wissenschaft/weltall/astronaut-alexander-gerst-antwortet-spiegel-online-lesern-a-989876.html',
+ 'info_dict': {
+
+ },
+ 'playlist_count': 6,
+ }]
def _real_extract(self, url):
- m = re.match(self._VALID_URL, url)
- video_id = m.group('id')
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
+
+ # Single video on top of the page
video_link = self._search_regex(
r'<a href="([^"]+)" onclick="return spOpenVideo\(this,', webpage,
- 'video page URL')
- video_url = compat_urlparse.urljoin(
- self.http_scheme() + '//spiegel.de/', video_link)
-
- return {
- '_type': 'url',
- 'url': video_url,
- }
+ 'video page URL', default=None)
+ if video_link:
+ video_url = compat_urlparse.urljoin(
+ self.http_scheme() + '//spiegel.de/', video_link)
+ return self.url_result(video_url)
+
+ # Multiple embedded videos
+ embeds = re.findall(
+ r'<div class="vid_holder[0-9]+.*?</div>\s*.*?url\s*=\s*"([^"]+)"',
+ webpage)
+ entries = [
+ self.url_result(compat_urlparse.urljoin(
+ self.http_scheme() + '//spiegel.de/', embed_path))
+ for embed_path in embeds
+ ]
+ return self.playlist_result(entries)
diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py
index 7f388aced..98cf92d89 100644
--- a/youtube_dl/extractor/spiegeltv.py
+++ b/youtube_dl/extractor/spiegeltv.py
@@ -1,13 +1,13 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
+from ..utils import float_or_none
class SpiegeltvIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?spiegel\.tv/filme/(?P<id>[\-a-z0-9]+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?spiegel\.tv/(?:#/)?filme/(?P<id>[\-a-z0-9]+)'
+ _TESTS = [{
'url': 'http://www.spiegel.tv/filme/flug-mh370/',
'info_dict': {
'id': 'flug-mh370',
@@ -20,12 +20,15 @@ class SpiegeltvIE(InfoExtractor):
# rtmp download
'skip_download': True,
}
- }
+ }, {
+ 'url': 'http://www.spiegel.tv/#/filme/alleskino-die-wahrheit-ueber-maenner/',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ if '/#/' in url:
+ url = url.replace('/#/', '/')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(r'<h1.*?>(.*?)</h1>', webpage, 'title')
@@ -61,12 +64,8 @@ class SpiegeltvIE(InfoExtractor):
})
description = media_json['subtitle']
- duration = media_json['duration_in_ms'] / 1000.
-
- if is_wide:
- format = '16x9'
- else:
- format = '4x3'
+ duration = float_or_none(media_json.get('duration_in_ms'), scale=1000)
+ format = '16x9' if is_wide else '4x3'
url = server + 'mp4:' + uuid + '_spiegeltv_0500_' + format + '.m4v'
@@ -78,4 +77,4 @@ class SpiegeltvIE(InfoExtractor):
'description': description,
'duration': duration,
'thumbnails': thumbnails
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/sport5.py b/youtube_dl/extractor/sport5.py
index 3f680bfc6..dfe50ed45 100644
--- a/youtube_dl/extractor/sport5.py
+++ b/youtube_dl/extractor/sport5.py
@@ -89,4 +89,4 @@ class Sport5IE(InfoExtractor):
'duration': duration,
'categories': categories,
'formats': formats,
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py
index 19cc976e3..becdf658f 100644
--- a/youtube_dl/extractor/sportbox.py
+++ b/youtube_dl/extractor/sportbox.py
@@ -7,7 +7,6 @@ from .common import InfoExtractor
from ..utils import (
parse_duration,
parse_iso8601,
- int_or_none,
)
@@ -26,7 +25,6 @@ class SportBoxIE(InfoExtractor):
'timestamp': 1411896237,
'upload_date': '20140928',
'duration': 4846,
- 'view_count': int,
},
'params': {
# m3u8 download
@@ -65,8 +63,6 @@ class SportBoxIE(InfoExtractor):
r'<span itemprop="uploadDate">([^<]+)</span>', webpage, 'timestamp', fatal=False))
duration = parse_duration(self._html_search_regex(
r'<meta itemprop="duration" content="PT([^"]+)">', webpage, 'duration', fatal=False))
- view_count = int_or_none(self._html_search_regex(
- r'<span>Просмотров: (\d+)</span>', player, 'view count', fatal=False))
return {
'id': video_id,
@@ -76,6 +72,5 @@ class SportBoxIE(InfoExtractor):
'thumbnail': thumbnail,
'timestamp': timestamp,
'duration': duration,
- 'view_count': view_count,
'formats': formats,
}
diff --git a/youtube_dl/extractor/sportdeutschland.py b/youtube_dl/extractor/sportdeutschland.py
index abb827783..1a57aebf1 100644
--- a/youtube_dl/extractor/sportdeutschland.py
+++ b/youtube_dl/extractor/sportdeutschland.py
@@ -4,8 +4,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_request,
+)
+from ..utils import (
parse_iso8601,
)
@@ -58,9 +60,10 @@ class SportDeutschlandIE(InfoExtractor):
categories = list(data.get('section', {}).get('tags', {}).values())
asset = data['asset']
+ assets_info = self._download_json(asset['url'], video_id)
formats = []
- smil_url = asset['video']
+ smil_url = assets_info['video']
if '.smil' in smil_url:
m3u8_url = smil_url.replace('.smil', '.m3u8')
formats.extend(
@@ -93,4 +96,3 @@ class SportDeutschlandIE(InfoExtractor):
'rtmp_live': asset.get('live'),
'timestamp': parse_iso8601(asset.get('date')),
}
-
diff --git a/youtube_dl/extractor/srmediathek.py b/youtube_dl/extractor/srmediathek.py
new file mode 100644
index 000000000..666a7dcc8
--- /dev/null
+++ b/youtube_dl/extractor/srmediathek.py
@@ -0,0 +1,43 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+from ..utils import js_to_json
+
+
+class SRMediathekIE(InfoExtractor):
+ IE_DESC = 'Süddeutscher Rundfunk'
+ _VALID_URL = r'https?://sr-mediathek\.sr-online\.de/index\.php\?.*?&id=(?P<id>[0-9]+)'
+
+ _TEST = {
+ 'url': 'http://sr-mediathek.sr-online.de/index.php?seite=7&id=28455',
+ 'info_dict': {
+ 'id': '28455',
+ 'ext': 'mp4',
+ 'title': 'sportarena (26.10.2014)',
+ 'description': 'Ringen: KSV Köllerbach gegen Aachen-Walheim; Frauen-Fußball: 1. FC Saarbrücken gegen Sindelfingen; Motorsport: Rallye in Losheim; dazu: Interview mit Timo Bernhard; Turnen: TG Saar; Reitsport: Deutscher Voltigier-Pokal; Badminton: Interview mit Michael Fuchs ',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ murls = json.loads(js_to_json(self._search_regex(
+ r'var mediaURLs\s*=\s*(.*?);\n', webpage, 'video URLs')))
+ formats = [{'url': murl} for murl in murls]
+ self._sort_formats(formats)
+
+ title = json.loads(js_to_json(self._search_regex(
+ r'var mediaTitles\s*=\s*(.*?);\n', webpage, 'title')))[0]
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ }
diff --git a/youtube_dl/extractor/stanfordoc.py b/youtube_dl/extractor/stanfordoc.py
index 44c52c718..4a3d8bb8f 100644
--- a/youtube_dl/extractor/stanfordoc.py
+++ b/youtube_dl/extractor/stanfordoc.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
@@ -9,24 +11,23 @@ from ..utils import (
class StanfordOpenClassroomIE(InfoExtractor):
- IE_NAME = u'stanfordoc'
- IE_DESC = u'Stanford Open ClassRoom'
- _VALID_URL = r'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
+ IE_NAME = 'stanfordoc'
+ IE_DESC = 'Stanford Open ClassRoom'
+ _VALID_URL = r'https?://openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
_TEST = {
- u'url': u'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100',
- u'file': u'PracticalUnix_intro-environment.mp4',
- u'md5': u'544a9468546059d4e80d76265b0443b8',
- u'info_dict': {
- u"title": u"Intro Environment"
+ 'url': 'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100',
+ 'md5': '544a9468546059d4e80d76265b0443b8',
+ 'info_dict': {
+ 'id': 'PracticalUnix_intro-environment',
+ 'ext': 'mp4',
+ 'title': 'Intro Environment',
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
- if mobj.group('course') and mobj.group('video'): # A specific video
+ if mobj.group('course') and mobj.group('video'): # A specific video
course = mobj.group('course')
video = mobj.group('video')
info = {
@@ -35,7 +36,6 @@ class StanfordOpenClassroomIE(InfoExtractor):
'upload_date': None,
}
- self.report_extraction(info['id'])
baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
xmlUrl = baseUrl + video + '.xml'
mdoc = self._download_xml(xmlUrl, info['id'])
@@ -43,63 +43,49 @@ class StanfordOpenClassroomIE(InfoExtractor):
info['title'] = mdoc.findall('./title')[0].text
info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
except IndexError:
- raise ExtractorError(u'Invalid metadata XML file')
- info['ext'] = info['url'].rpartition('.')[2]
- return [info]
- elif mobj.group('course'): # A course page
+ raise ExtractorError('Invalid metadata XML file')
+ return info
+ elif mobj.group('course'): # A course page
course = mobj.group('course')
info = {
'id': course,
- 'type': 'playlist',
+ '_type': 'playlist',
'uploader': None,
'upload_date': None,
}
- coursepage = self._download_webpage(url, info['id'],
- note='Downloading course info page',
- errnote='Unable to download course info page')
+ coursepage = self._download_webpage(
+ url, info['id'],
+ note='Downloading course info page',
+ errnote='Unable to download course info page')
- info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
+ info['title'] = self._html_search_regex(
+ r'<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
- info['description'] = self._html_search_regex('<description>([^<]+)</description>',
- coursepage, u'description', fatal=False)
+ info['description'] = self._html_search_regex(
+ r'(?s)<description>([^<]+)</description>',
+ coursepage, 'description', fatal=False)
links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
- info['list'] = [
- {
- 'type': 'reference',
- 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
- }
- for vpage in links]
- results = []
- for entry in info['list']:
- assert entry['type'] == 'reference'
- results += self.extract(entry['url'])
- return results
- else: # Root page
+ info['entries'] = [self.url_result(
+ 'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l)
+ ) for l in links]
+ return info
+ else: # Root page
info = {
'id': 'Stanford OpenClassroom',
- 'type': 'playlist',
+ '_type': 'playlist',
'uploader': None,
'upload_date': None,
}
+ info['title'] = info['id']
rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
rootpage = self._download_webpage(rootURL, info['id'],
- errnote=u'Unable to download course info page')
-
- info['title'] = info['id']
+ errnote='Unable to download course info page')
links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
- info['list'] = [
- {
- 'type': 'reference',
- 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
- }
- for cpage in links]
-
- results = []
- for entry in info['list']:
- assert entry['type'] == 'reference'
- results += self.extract(entry['url'])
- return results
+ info['entries'] = [self.url_result(
+ 'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l)
+ ) for l in links]
+ return info
diff --git a/youtube_dl/extractor/streamcloud.py b/youtube_dl/extractor/streamcloud.py
index 172def221..d4e134015 100644
--- a/youtube_dl/extractor/streamcloud.py
+++ b/youtube_dl/extractor/streamcloud.py
@@ -2,10 +2,9 @@
from __future__ import unicode_literals
import re
-import time
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
)
@@ -13,7 +12,7 @@ from ..utils import (
class StreamcloudIE(InfoExtractor):
IE_NAME = 'streamcloud.eu'
- _VALID_URL = r'https?://streamcloud\.eu/(?P<id>[a-zA-Z0-9_-]+)/(?P<fname>[^#?]*)\.html'
+ _VALID_URL = r'https?://streamcloud\.eu/(?P<id>[a-zA-Z0-9_-]+)(?:/(?P<fname>[^#?]*)\.html)?'
_TEST = {
'url': 'http://streamcloud.eu/skp9j99s4bpz/youtube-dl_test_video_____________-BaW_jenozKc.mp4.html',
@@ -27,8 +26,8 @@ class StreamcloudIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
+ url = 'http://streamcloud.eu/%s' % video_id
orig_webpage = self._download_webpage(url, video_id)
@@ -40,8 +39,7 @@ class StreamcloudIE(InfoExtractor):
''', orig_webpage)
post = compat_urllib_parse.urlencode(fields)
- self.to_screen('%s: Waiting for timeout' % video_id)
- time.sleep(12)
+ self._sleep(12, video_id)
headers = {
b'Content-Type': b'application/x-www-form-urlencoded',
}
diff --git a/youtube_dl/extractor/streamcz.py b/youtube_dl/extractor/streamcz.py
index 73efe9542..c3ceb5f76 100644
--- a/youtube_dl/extractor/streamcz.py
+++ b/youtube_dl/extractor/streamcz.py
@@ -1,18 +1,14 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
-import re
-import json
-
from .common import InfoExtractor
from ..utils import (
int_or_none,
- compat_str,
)
class StreamCZIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?stream\.cz/.+/(?P<videoid>.+)'
+ _VALID_URL = r'https?://(?:www\.)?stream\.cz/.+/(?P<id>[0-9]+)'
_TESTS = [{
'url': 'http://www.stream.cz/peklonataliri/765767-ecka-pro-deti',
@@ -21,61 +17,63 @@ class StreamCZIE(InfoExtractor):
'id': '765767',
'ext': 'mp4',
'title': 'Peklo na talíři: Éčka pro děti',
- 'description': 'md5:49ace0df986e95e331d0fe239d421519',
- 'thumbnail': 'http://im.stream.cz/episode/52961d7e19d423f8f06f0100',
+ 'description': 'Taška s grónskou pomazánkou a další pekelnosti ZDE',
+ 'thumbnail': 're:^http://im.stream.cz/episode/52961d7e19d423f8f06f0100',
'duration': 256,
},
}, {
'url': 'http://www.stream.cz/blanik/10002447-tri-roky-pro-mazanka',
- 'md5': '246272e753e26bbace7fcd9deca0650c',
+ 'md5': 'e54a254fb8b871968fd8403255f28589',
'info_dict': {
'id': '10002447',
'ext': 'mp4',
'title': 'Kancelář Blaník: Tři roky pro Mazánka',
- 'description': 'md5:9177695a8b756a0a8ab160de4043b392',
- 'thumbnail': 'http://im.stream.cz/episode/537f838c50c11f8d21320000',
+ 'description': 'md5:3862a00ba7bf0b3e44806b544032c859',
+ 'thumbnail': 're:^http://im.stream.cz/episode/537f838c50c11f8d21320000',
'duration': 368,
},
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('videoid')
-
- webpage = self._download_webpage(url, video_id)
-
- data = self._html_search_regex(r'Stream\.Data\.Episode\((.+?)\);', webpage, 'stream data')
-
- jsonData = json.loads(data)
+ video_id = self._match_id(url)
+ data = self._download_json(
+ 'http://www.stream.cz/API/episode/%s' % video_id, video_id)
formats = []
- for video in jsonData['instances']:
- for video_format in video['instances']:
- format_id = video_format['quality']
-
- if format_id == '240p':
- quality = 0
- elif format_id == '360p':
- quality = 1
- elif format_id == '480p':
- quality = 2
- elif format_id == '720p':
- quality = 3
-
+ for quality, video in enumerate(data['video_qualities']):
+ for f in video['formats']:
+ typ = f['type'].partition('/')[2]
+ qlabel = video.get('quality_label')
formats.append({
- 'format_id': '%s-%s' % (video_format['type'].split('/')[1], format_id),
- 'url': video_format['source'],
+ 'format_note': '%s-%s' % (qlabel, typ) if qlabel else typ,
+ 'format_id': '%s-%s' % (typ, f['quality']),
+ 'url': f['source'],
+ 'height': int_or_none(f['quality'].rstrip('p')),
'quality': quality,
})
-
self._sort_formats(formats)
+ image = data.get('image')
+ if image:
+ thumbnail = self._proto_relative_url(
+ image.replace('{width}', '1240').replace('{height}', '697'),
+ scheme='http:',
+ )
+ else:
+ thumbnail = None
+
+ stream = data.get('_embedded', {}).get('stream:show', {}).get('name')
+ if stream:
+ title = '%s: %s' % (stream, data['name'])
+ else:
+ title = data['name']
+
return {
- 'id': compat_str(jsonData['episode_id']),
- 'title': self._og_search_title(webpage),
- 'thumbnail': jsonData['episode_image_original_url'].replace('//', 'http://'),
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
'formats': formats,
- 'description': self._og_search_description(webpage),
- 'duration': int_or_none(jsonData['duration']),
- 'view_count': int_or_none(jsonData['stats_total']),
+ 'description': data.get('web_site_text'),
+ 'duration': int_or_none(data.get('duration')),
+ 'view_count': int_or_none(data.get('views')),
}
diff --git a/youtube_dl/extractor/streetvoice.py b/youtube_dl/extractor/streetvoice.py
new file mode 100644
index 000000000..6a57fa60a
--- /dev/null
+++ b/youtube_dl/extractor/streetvoice.py
@@ -0,0 +1,51 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import unified_strdate
+
+
+class StreetVoiceIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:.+?\.)?streetvoice\.com/[^/]+/songs/(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'http://streetvoice.com/skippylu/songs/94440/',
+ 'md5': '15974627fc01a29e492c98593c2fd472',
+ 'info_dict': {
+ 'id': '94440',
+ 'ext': 'mp3',
+ 'filesize': 4167053,
+ 'title': '輸',
+ 'description': 'Crispy脆樂團 - 輸',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 260,
+ 'upload_date': '20091018',
+ 'uploader': 'Crispy脆樂團',
+ 'uploader_id': '627810',
+ }
+ }, {
+ 'url': 'http://tw.streetvoice.com/skippylu/songs/94440/',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ song_id = self._match_id(url)
+
+ song = self._download_json(
+ 'http://streetvoice.com/music/api/song/%s' % song_id, song_id)
+
+ title = song['name']
+ author = song['musician']['name']
+
+ return {
+ 'id': song_id,
+ 'url': song['file'],
+ 'filesize': song.get('size'),
+ 'title': title,
+ 'description': '%s - %s' % (author, title),
+ 'thumbnail': self._proto_relative_url(song.get('image'), 'http:'),
+ 'duration': song.get('length'),
+ 'upload_date': unified_strdate(song.get('created_at')),
+ 'uploader': author,
+ 'uploader_id': compat_str(song['musician']['id']),
+ }
diff --git a/youtube_dl/extractor/subtitles.py b/youtube_dl/extractor/subtitles.py
index db33745c1..59a51268d 100644
--- a/youtube_dl/extractor/subtitles.py
+++ b/youtube_dl/extractor/subtitles.py
@@ -1,7 +1,8 @@
+from __future__ import unicode_literals
from .common import InfoExtractor
+from ..compat import compat_str
from ..utils import (
- compat_str,
ExtractorError,
)
@@ -17,10 +18,10 @@ class SubtitlesInfoExtractor(InfoExtractor):
sub_lang_list = self._get_available_subtitles(video_id, webpage)
auto_captions_list = self._get_available_automatic_caption(video_id, webpage)
sub_lang = ",".join(list(sub_lang_list.keys()))
- self.to_screen(u'%s: Available subtitles for video: %s' %
+ self.to_screen('%s: Available subtitles for video: %s' %
(video_id, sub_lang))
auto_lang = ",".join(auto_captions_list.keys())
- self.to_screen(u'%s: Available automatic captions for video: %s' %
+ self.to_screen('%s: Available automatic captions for video: %s' %
(video_id, auto_lang))
def extract_subtitles(self, video_id, webpage):
@@ -50,8 +51,8 @@ class SubtitlesInfoExtractor(InfoExtractor):
sub_lang_list = {}
for sub_lang in requested_langs:
- if not sub_lang in available_subs_list:
- self._downloader.report_warning(u'no closed captions found in the specified language "%s"' % sub_lang)
+ if sub_lang not in available_subs_list:
+ self._downloader.report_warning('no closed captions found in the specified language "%s"' % sub_lang)
continue
sub_lang_list[sub_lang] = available_subs_list[sub_lang]
@@ -70,10 +71,10 @@ class SubtitlesInfoExtractor(InfoExtractor):
try:
sub = self._download_subtitle_url(sub_lang, url)
except ExtractorError as err:
- self._downloader.report_warning(u'unable to download video subtitles for %s: %s' % (sub_lang, compat_str(err)))
+ self._downloader.report_warning('unable to download video subtitles for %s: %s' % (sub_lang, compat_str(err)))
return
if not sub:
- self._downloader.report_warning(u'Did not fetch video subtitles')
+ self._downloader.report_warning('Did not fetch video subtitles')
return
return sub
@@ -94,5 +95,5 @@ class SubtitlesInfoExtractor(InfoExtractor):
Must be redefined by the subclasses that support automatic captions,
otherwise it will return {}
"""
- self._downloader.report_warning(u'Automatic Captions not supported by this server')
+ self._downloader.report_warning('Automatic Captions not supported by this server')
return {}
diff --git a/youtube_dl/extractor/sunporno.py b/youtube_dl/extractor/sunporno.py
index 263f09b46..8a333f1d2 100644
--- a/youtube_dl/extractor/sunporno.py
+++ b/youtube_dl/extractor/sunporno.py
@@ -28,23 +28,27 @@ class SunPornoIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, 'title')
- description = self._html_search_meta('description', webpage, 'description')
+ title = self._html_search_regex(
+ r'<title>([^<]+)</title>', webpage, 'title')
+ description = self._html_search_meta(
+ 'description', webpage, 'description')
thumbnail = self._html_search_regex(
r'poster="([^"]+)"', webpage, 'thumbnail', fatal=False)
duration = parse_duration(self._search_regex(
- r'Duration:\s*(\d+:\d+)\s*<', webpage, 'duration', fatal=False))
+ r'itemprop="duration">\s*(\d+:\d+)\s*<',
+ webpage, 'duration', fatal=False))
view_count = int_or_none(self._html_search_regex(
- r'class="views">\s*(\d+)\s*<', webpage, 'view count', fatal=False))
+ r'class="views">\s*(\d+)\s*<',
+ webpage, 'view count', fatal=False))
comment_count = int_or_none(self._html_search_regex(
- r'(\d+)</b> Comments?', webpage, 'comment count', fatal=False))
+ r'(\d+)</b> Comments?',
+ webpage, 'comment count', fatal=False))
formats = []
quality = qualities(['mp4', 'flv'])
diff --git a/youtube_dl/extractor/swrmediathek.py b/youtube_dl/extractor/swrmediathek.py
index 13c6ea677..58073eefe 100644
--- a/youtube_dl/extractor/swrmediathek.py
+++ b/youtube_dl/extractor/swrmediathek.py
@@ -80,7 +80,7 @@ class SWRMediathekIE(InfoExtractor):
if media_type == 'Video':
fmt.update({
- 'format_note': ['144p', '288p', '544p'][quality-1],
+ 'format_note': ['144p', '288p', '544p', '720p'][quality - 1],
'vcodec': codec,
})
elif media_type == 'Audio':
@@ -101,4 +101,4 @@ class SWRMediathekIE(InfoExtractor):
'uploader': attr['channel_title'],
'uploader_id': attr['channel_idkey'],
'formats': formats,
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/syfy.py b/youtube_dl/extractor/syfy.py
index f76b6e2b2..5ca079f88 100644
--- a/youtube_dl/extractor/syfy.py
+++ b/youtube_dl/extractor/syfy.py
@@ -10,7 +10,6 @@ class SyfyIE(InfoExtractor):
_TESTS = [{
'url': 'http://www.syfy.com/videos/Robot%20Combat%20League/Behind%20the%20Scenes/vid:2631458',
- 'md5': 'e07de1d52c7278adbb9b9b1c93a66849',
'info_dict': {
'id': 'NmqMrGnXvmO1',
'ext': 'flv',
diff --git a/youtube_dl/extractor/sztvhu.py b/youtube_dl/extractor/sztvhu.py
index c9359fafb..aa5964acb 100644
--- a/youtube_dl/extractor/sztvhu.py
+++ b/youtube_dl/extractor/sztvhu.py
@@ -1,27 +1,24 @@
# -*- coding: utf-8 -*-
-
-import re
+from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import determine_ext
class SztvHuIE(InfoExtractor):
- _VALID_URL = r'(?:http://)?(?:(?:www\.)?sztv\.hu|www\.tvszombathely\.hu)/(?:[^/]+)/.+-(?P<id>[0-9]+)'
+ _VALID_URL = r'http://(?:(?:www\.)?sztv\.hu|www\.tvszombathely\.hu)/(?:[^/]+)/.+-(?P<id>[0-9]+)'
_TEST = {
- u'url': u'http://sztv.hu/hirek/cserkeszek-nepszerusitettek-a-kornyezettudatos-eletmodot-a-savaria-teren-20130909',
- u'file': u'20130909.mp4',
- u'md5': u'a6df607b11fb07d0e9f2ad94613375cb',
- u'info_dict': {
- u"title": u"Cserkészek népszerűsítették a környezettudatos életmódot a Savaria téren",
- u"description": u'A zöld nap játékos ismeretterjesztő programjait a Magyar Cserkész Szövetség szervezte, akik az ország nyolc városában adják át tudásukat az érdeklődőknek. A PET...',
+ 'url': 'http://sztv.hu/hirek/cserkeszek-nepszerusitettek-a-kornyezettudatos-eletmodot-a-savaria-teren-20130909',
+ 'md5': 'a6df607b11fb07d0e9f2ad94613375cb',
+ 'info_dict': {
+ 'id': '20130909',
+ 'ext': 'mp4',
+ 'title': 'Cserkészek népszerűsítették a környezettudatos életmódot a Savaria téren',
+ 'description': 'A zöld nap játékos ismeretterjesztő programjait a Magyar Cserkész Szövetség szervezte, akik az ország nyolc városában adják át tudásukat az érdeklődőknek. A PET...',
},
- u'skip': u'Service temporarily disabled as of 2013-11-20'
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
video_file = self._search_regex(
r'file: "...:(.*?)",', webpage, 'video file')
@@ -39,7 +36,6 @@ class SztvHuIE(InfoExtractor):
'id': video_id,
'url': video_url,
'title': title,
- 'ext': determine_ext(video_url),
'description': description,
'thumbnail': thumbnail,
}
diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py
index b87047451..bfe07b024 100644
--- a/youtube_dl/extractor/tagesschau.py
+++ b/youtube_dl/extractor/tagesschau.py
@@ -4,10 +4,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..utils import parse_filesize
class TagesschauIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/video/video(?P<id>-?[0-9]+)\.html'
+ _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?:sendung/ts|video/video)(?P<id>-?[0-9]+)\.html'
_TESTS = [{
'url': 'http://www.tagesschau.de/multimedia/video/video1399128.html',
@@ -19,6 +20,16 @@ class TagesschauIE(InfoExtractor):
'description': 'md5:69da3c61275b426426d711bde96463ab',
'thumbnail': 're:^http:.*\.jpg$',
},
+ }, {
+ 'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html',
+ 'md5': '3c54c1f6243d279b706bde660ceec633',
+ 'info_dict': {
+ 'id': '5727',
+ 'ext': 'mp4',
+ 'description': 'md5:695c01bfd98b7e313c501386327aea59',
+ 'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr',
+ 'thumbnail': 're:^http:.*\.jpg$',
+ }
}]
_FORMATS = {
@@ -28,42 +39,82 @@ class TagesschauIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
- if video_id.startswith('-'):
- display_id = video_id.strip('-')
- else:
- display_id = video_id
-
+ video_id = self._match_id(url)
+ display_id = video_id.lstrip('-')
webpage = self._download_webpage(url, display_id)
- playerpage = self._download_webpage(
- 'http://www.tagesschau.de/multimedia/video/video%s~player_autoplay-true.html' % video_id,
- display_id, 'Downloading player page')
-
- medias = re.findall(
- r'"(http://media.+?)", type:"video/(.+?)", quality:"(.+?)"',
- playerpage)
+ player_url = self._html_search_meta(
+ 'twitter:player', webpage, 'player URL', default=None)
+ if player_url:
+ playerpage = self._download_webpage(
+ player_url, display_id, 'Downloading player page')
- formats = []
- for url, ext, res in medias:
- f = {
- 'format_id': res + '_' + ext,
- 'url': url,
- 'ext': ext,
- }
- f.update(self._FORMATS.get(res, {}))
- formats.append(f)
+ medias = re.findall(
+ r'"(http://media.+?)", type:"video/(.+?)", quality:"(.+?)"',
+ playerpage)
+ formats = []
+ for url, ext, res in medias:
+ f = {
+ 'format_id': res + '_' + ext,
+ 'url': url,
+ 'ext': ext,
+ }
+ f.update(self._FORMATS.get(res, {}))
+ formats.append(f)
+ thumbnail_fn = re.findall(r'"(/multimedia/.+?\.jpg)"', playerpage)[-1]
+ title = self._og_search_title(webpage).strip()
+ description = self._og_search_description(webpage).strip()
+ else:
+ download_text = self._search_regex(
+ r'(?s)<p>Wir bieten dieses Video in folgenden Formaten zum Download an:</p>\s*<div class="controls">(.*?)</div>\s*<p>',
+ webpage, 'download links')
+ links = re.finditer(
+ r'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]+)">(?P<name>.+?)</a></div>',
+ download_text)
+ formats = []
+ for l in links:
+ format_id = self._search_regex(
+ r'.*/[^/.]+\.([^/]+)\.[^/.]+', l.group('url'), 'format ID')
+ format = {
+ 'format_id': format_id,
+ 'url': l.group('url'),
+ 'format_name': l.group('name'),
+ }
+ m = re.match(
+ r'''(?x)
+ Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10;
+ (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10;
+ (?P<vbr>[0-9]+)kbps&\#10;
+ Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10;
+ Gr&ouml;&szlig;e:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''',
+ l.group('title'))
+ if m:
+ format.update({
+ 'format_note': m.group('audio_desc'),
+ 'vcodec': m.group('vcodec'),
+ 'width': int(m.group('width')),
+ 'height': int(m.group('height')),
+ 'abr': int(m.group('abr')),
+ 'vbr': int(m.group('vbr')),
+ 'filesize_approx': parse_filesize(m.group('filesize_approx')),
+ })
+ formats.append(format)
+ thumbnail_fn = self._search_regex(
+ r'(?s)<img alt="Sendungsbild".*?src="([^"]+)"',
+ webpage, 'thumbnail', fatal=False)
+ description = self._html_search_regex(
+ r'(?s)<p class="teasertext">(.*?)</p>',
+ webpage, 'description', fatal=False)
+ title = self._html_search_regex(
+ r'<span class="headline".*?>(.*?)</span>', webpage, 'title')
self._sort_formats(formats)
-
- thumbnail = re.findall(r'"(/multimedia/.+?\.jpg)"', playerpage)[-1]
+ thumbnail = 'http://www.tagesschau.de' + thumbnail_fn
return {
'id': display_id,
- 'title': self._og_search_title(webpage).strip(),
- 'thumbnail': 'http://www.tagesschau.de' + thumbnail,
+ 'title': title,
+ 'thumbnail': thumbnail,
'formats': formats,
- 'description': self._og_search_description(webpage).strip(),
+ 'description': description,
}
diff --git a/youtube_dl/extractor/tapely.py b/youtube_dl/extractor/tapely.py
index 77e056242..f1f43d0a7 100644
--- a/youtube_dl/extractor/tapely.py
+++ b/youtube_dl/extractor/tapely.py
@@ -4,10 +4,12 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_request,
+)
from ..utils import (
- ExtractorError,
clean_html,
- compat_urllib_request,
+ ExtractorError,
float_or_none,
parse_iso8601,
)
@@ -50,6 +52,7 @@ class TapelyIE(InfoExtractor):
request = compat_urllib_request.Request(playlist_url)
request.add_header('X-Requested-With', 'XMLHttpRequest')
request.add_header('Accept', 'application/json')
+ request.add_header('Referer', url)
playlist = self._download_json(request, display_id)
diff --git a/youtube_dl/extractor/tass.py b/youtube_dl/extractor/tass.py
new file mode 100644
index 000000000..c4ef70778
--- /dev/null
+++ b/youtube_dl/extractor/tass.py
@@ -0,0 +1,62 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+ js_to_json,
+ qualities,
+)
+
+
+class TassIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:tass\.ru|itar-tass\.com)/[^/]+/(?P<id>\d+)'
+ _TESTS = [
+ {
+ 'url': 'http://tass.ru/obschestvo/1586870',
+ 'md5': '3b4cdd011bc59174596b6145cda474a4',
+ 'info_dict': {
+ 'id': '1586870',
+ 'ext': 'mp4',
+ 'title': 'Посетителям московского зоопарка показали красную панду',
+ 'description': 'Приехавшую из Дублина Зейну можно увидеть в павильоне "Кошки тропиков"',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ },
+ {
+ 'url': 'http://itar-tass.com/obschestvo/1600009',
+ 'only_matching': True,
+ },
+ ]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ sources = json.loads(js_to_json(self._search_regex(
+ r'(?s)sources\s*:\s*(\[.+?\])', webpage, 'sources')))
+
+ quality = qualities(['sd', 'hd'])
+
+ formats = []
+ for source in sources:
+ video_url = source.get('file')
+ if not video_url or not video_url.startswith('http') or not video_url.endswith('.mp4'):
+ continue
+ label = source.get('label')
+ formats.append({
+ 'url': video_url,
+ 'format_id': label,
+ 'quality': quality(label),
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': self._og_search_title(webpage),
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/teachertube.py b/youtube_dl/extractor/teachertube.py
index 8a95fd656..82675431f 100644
--- a/youtube_dl/extractor/teachertube.py
+++ b/youtube_dl/extractor/teachertube.py
@@ -57,9 +57,7 @@ class TeacherTubeIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = self._html_search_meta('title', webpage, 'title', fatal=True)
@@ -121,7 +119,7 @@ class TeacherTubeUserIE(InfoExtractor):
urls = []
webpage = self._download_webpage(url, user_id)
urls.extend(re.findall(self._MEDIA_RE, webpage))
-
+
pages = re.findall(r'/ajax-user/user-videos/%s\?page=([0-9]+)' % user_id, webpage)[:-1]
for p in pages:
more = 'http://www.teachertube.com/ajax-user/user-videos/%s?page=%s' % (user_id, p)
diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py
index fa796ce72..5fa67eb8d 100644
--- a/youtube_dl/extractor/teamcoco.py
+++ b/youtube_dl/extractor/teamcoco.py
@@ -8,24 +8,23 @@ from .common import InfoExtractor
class TeamcocoIE(InfoExtractor):
_VALID_URL = r'http://teamcoco\.com/video/(?P<video_id>[0-9]+)?/?(?P<display_id>.*)'
_TESTS = [
- {
- 'url': 'http://teamcoco.com/video/80187/conan-becomes-a-mary-kay-beauty-consultant',
- 'file': '80187.mp4',
- 'md5': '3f7746aa0dc86de18df7539903d399ea',
- 'info_dict': {
- 'title': 'Conan Becomes A Mary Kay Beauty Consultant',
- 'description': 'Mary Kay is perhaps the most trusted name in female beauty, so of course Conan is a natural choice to sell their products.'
+ {
+ 'url': 'http://teamcoco.com/video/80187/conan-becomes-a-mary-kay-beauty-consultant',
+ 'file': '80187.mp4',
+ 'md5': '3f7746aa0dc86de18df7539903d399ea',
+ 'info_dict': {
+ 'title': 'Conan Becomes A Mary Kay Beauty Consultant',
+ 'description': 'Mary Kay is perhaps the most trusted name in female beauty, so of course Conan is a natural choice to sell their products.'
+ }
+ }, {
+ 'url': 'http://teamcoco.com/video/louis-ck-interview-george-w-bush',
+ 'file': '19705.mp4',
+ 'md5': 'cde9ba0fa3506f5f017ce11ead928f9a',
+ 'info_dict': {
+ "description": "Louis C.K. got starstruck by George W. Bush, so what? Part one.",
+ "title": "Louis C.K. Interview Pt. 1 11/3/11"
+ }
}
- },
- {
- 'url': 'http://teamcoco.com/video/louis-ck-interview-george-w-bush',
- 'file': '19705.mp4',
- 'md5': 'cde9ba0fa3506f5f017ce11ead928f9a',
- 'info_dict': {
- "description": "Louis C.K. got starstruck by George W. Bush, so what? Part one.",
- "title": "Louis C.K. Interview Pt. 1 11/3/11"
- }
- }
]
def _real_extract(self, url):
@@ -33,7 +32,7 @@ class TeamcocoIE(InfoExtractor):
display_id = mobj.group('display_id')
webpage = self._download_webpage(url, display_id)
-
+
video_id = mobj.group("video_id")
if not video_id:
video_id = self._html_search_regex(
diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py
index cd4af96fd..10b3b706a 100644
--- a/youtube_dl/extractor/ted.py
+++ b/youtube_dl/extractor/ted.py
@@ -5,7 +5,7 @@ import re
from .subtitles import SubtitlesInfoExtractor
-from ..utils import (
+from ..compat import (
compat_str,
)
@@ -13,7 +13,7 @@ from ..utils import (
class TEDIE(SubtitlesInfoExtractor):
_VALID_URL = r'''(?x)
(?P<proto>https?://)
- (?P<type>www|embed)(?P<urlmain>\.ted\.com/
+ (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
(
(?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
|
@@ -33,11 +33,12 @@ class TEDIE(SubtitlesInfoExtractor):
'ext': 'mp4',
'title': 'The illusion of consciousness',
'description': ('Philosopher Dan Dennett makes a compelling '
- 'argument that not only don\'t we understand our own '
- 'consciousness, but that half the time our brains are '
- 'actively fooling us.'),
+ 'argument that not only don\'t we understand our own '
+ 'consciousness, but that half the time our brains are '
+ 'actively fooling us.'),
'uploader': 'Dan Dennett',
'width': 854,
+ 'duration': 1308,
}
}, {
'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
@@ -57,6 +58,7 @@ class TEDIE(SubtitlesInfoExtractor):
'title': 'Be passionate. Be courageous. Be your best.',
'uploader': 'Gabby Giffords and Mark Kelly',
'description': 'md5:5174aed4d0f16021b704120360f72b92',
+ 'duration': 1128,
},
}, {
'url': 'http://www.ted.com/playlists/who_are_the_hackers',
@@ -91,12 +93,12 @@ class TEDIE(SubtitlesInfoExtractor):
def _extract_info(self, webpage):
info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
- webpage, 'info json')
+ webpage, 'info json')
return json.loads(info_json)
def _real_extract(self, url):
m = re.match(self._VALID_URL, url, re.VERBOSE)
- if m.group('type') == 'embed':
+ if m.group('type').startswith('embed'):
desktop_url = m.group('proto') + 'www' + m.group('urlmain')
return self.url_result(desktop_url, 'TED')
name = m.group('name')
@@ -111,7 +113,7 @@ class TEDIE(SubtitlesInfoExtractor):
'''Returns the videos of the playlist'''
webpage = self._download_webpage(url, name,
- 'Downloading playlist webpage')
+ 'Downloading playlist webpage')
info = self._extract_info(webpage)
playlist_info = info['playlist']
@@ -178,6 +180,7 @@ class TEDIE(SubtitlesInfoExtractor):
'description': self._og_search_description(webpage),
'subtitles': video_subtitles,
'formats': formats,
+ 'duration': talk_info.get('duration'),
}
def _get_available_subtitles(self, video_id, talk_info):
@@ -196,8 +199,9 @@ class TEDIE(SubtitlesInfoExtractor):
webpage = self._download_webpage(url, name)
config_json = self._html_search_regex(
- r"data-config='([^']+)", webpage, 'config')
- config = json.loads(config_json)
+ r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
+ webpage, 'config')
+ config = json.loads(config_json)['config']
video_url = config['video']['url']
thumbnail = config.get('image', {}).get('url')
diff --git a/youtube_dl/extractor/telebruxelles.py b/youtube_dl/extractor/telebruxelles.py
new file mode 100644
index 000000000..a3d05f97d
--- /dev/null
+++ b/youtube_dl/extractor/telebruxelles.py
@@ -0,0 +1,60 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class TeleBruxellesIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?telebruxelles\.be/(news|sport|dernier-jt)/?(?P<id>[^/#?]+)'
+ _TESTS = [{
+ 'url': 'http://www.telebruxelles.be/news/auditions-devant-parlement-francken-galant-tres-attendus/',
+ 'md5': '59439e568c9ee42fb77588b2096b214f',
+ 'info_dict': {
+ 'id': '11942',
+ 'display_id': 'auditions-devant-parlement-francken-galant-tres-attendus',
+ 'ext': 'flv',
+ 'title': 'Parlement : Francken et Galant répondent aux interpellations de l’opposition',
+ 'description': 're:Les auditions des ministres se poursuivent*'
+ },
+ 'params': {
+ 'skip_download': 'requires rtmpdump'
+ },
+ }, {
+ 'url': 'http://www.telebruxelles.be/sport/basket-brussels-bat-mons-80-74/',
+ 'md5': '181d3fbdcf20b909309e5aef5c6c6047',
+ 'info_dict': {
+ 'id': '10091',
+ 'display_id': 'basket-brussels-bat-mons-80-74',
+ 'ext': 'flv',
+ 'title': 'Basket : le Brussels bat Mons 80-74',
+ 'description': 're:^Ils l\u2019on fait ! En basket, le B*',
+ },
+ 'params': {
+ 'skip_download': 'requires rtmpdump'
+ },
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+
+ article_id = self._html_search_regex(
+ r"<article id=\"post-(\d+)\"", webpage, 'article ID')
+ title = self._html_search_regex(
+ r'<h1 class=\"entry-title\">(.*?)</h1>', webpage, 'title')
+ description = self._og_search_description(webpage)
+
+ rtmp_url = self._html_search_regex(
+ r"file: \"(rtmp://\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}/vod/mp4:\" \+ \"\w+\" \+ \".mp4)\"",
+ webpage, 'RTMP url')
+ rtmp_url = rtmp_url.replace("\" + \"", "")
+
+ return {
+ 'id': article_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'url': rtmp_url,
+ 'ext': 'flv',
+ 'rtmp_live': True # if rtmpdump is not called with "--live" argument, the download is blocked and can be completed
+ }
diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py
new file mode 100644
index 000000000..be3f72df7
--- /dev/null
+++ b/youtube_dl/extractor/telecinco.py
@@ -0,0 +1,19 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .mitele import MiTeleIE
+
+
+class TelecincoIE(MiTeleIE):
+ IE_NAME = 'telecinco.es'
+ _VALID_URL = r'https?://www\.telecinco\.es/[^/]+/[^/]+/[^/]+/(?P<id>.*?)\.html'
+
+ _TEST = {
+ 'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html',
+ 'info_dict': {
+ 'id': 'MDSVID20141015_0058',
+ 'ext': 'mp4',
+ 'title': 'Con Martín Berasategui, hacer un bacalao al ...',
+ 'duration': 662,
+ },
+ }
diff --git a/youtube_dl/extractor/teletask.py b/youtube_dl/extractor/teletask.py
new file mode 100644
index 000000000..e54145105
--- /dev/null
+++ b/youtube_dl/extractor/teletask.py
@@ -0,0 +1,53 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import unified_strdate
+
+
+class TeleTaskIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?tele-task\.de/archive/video/html5/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://www.tele-task.de/archive/video/html5/26168/',
+ 'info_dict': {
+ 'title': 'Duplicate Detection',
+ },
+ 'playlist': [{
+ 'md5': '290ef69fb2792e481169c3958dbfbd57',
+ 'info_dict': {
+ 'id': '26168-speaker',
+ 'ext': 'mp4',
+ 'title': 'Duplicate Detection',
+ 'upload_date': '20141218',
+ }
+ }, {
+ 'md5': 'e1e7218c5f0e4790015a437fcf6c71b4',
+ 'info_dict': {
+ 'id': '26168-slides',
+ 'ext': 'mp4',
+ 'title': 'Duplicate Detection',
+ 'upload_date': '20141218',
+ }
+ }]
+ }
+
+ def _real_extract(self, url):
+ lecture_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, lecture_id)
+
+ title = self._html_search_regex(
+ r'itemprop="name">([^<]+)</a>', webpage, 'title')
+ upload_date = unified_strdate(self._html_search_regex(
+ r'Date:</td><td>([^<]+)</td>', webpage, 'date', fatal=False))
+
+ entries = [{
+ 'id': '%s-%s' % (lecture_id, format_id),
+ 'url': video_url,
+ 'title': title,
+ 'upload_date': upload_date,
+ } for format_id, video_url in re.findall(
+ r'<video class="([^"]+)"[^>]*>\s*<source src="([^"]+)"', webpage)]
+
+ return self.playlist_result(entries, lecture_id, title)
diff --git a/youtube_dl/extractor/tenplay.py b/youtube_dl/extractor/tenplay.py
index 81ba169fb..466155ef8 100644
--- a/youtube_dl/extractor/tenplay.py
+++ b/youtube_dl/extractor/tenplay.py
@@ -8,7 +8,6 @@ class TenPlayIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?ten(play)?\.com\.au/.+'
_TEST = {
'url': 'http://tenplay.com.au/ten-insider/extra/season-2013/tenplay-tv-your-way',
- #'md5': 'd68703d9f73dc8fccf3320ab34202590',
'info_dict': {
'id': '2695695426001',
'ext': 'flv',
diff --git a/youtube_dl/extractor/testtube.py b/youtube_dl/extractor/testtube.py
new file mode 100644
index 000000000..6a7b5e49d
--- /dev/null
+++ b/youtube_dl/extractor/testtube.py
@@ -0,0 +1,72 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ qualities,
+)
+
+
+class TestTubeIE(InfoExtractor):
+ _VALID_URL = r'https?://testtube\.com/[^/?#]+/(?P<id>[^/?#]+)'
+ _TESTS = [{
+ 'url': 'https://testtube.com/dnews/5-weird-ways-plants-can-eat-animals?utm_source=FB&utm_medium=DNews&utm_campaign=DNewsSocial',
+ 'info_dict': {
+ 'id': '60163',
+ 'display_id': '5-weird-ways-plants-can-eat-animals',
+ 'duration': 275,
+ 'ext': 'mp4',
+ 'title': '5 Weird Ways Plants Can Eat Animals',
+ 'description': 'Why have some plants evolved to eat meat?',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'uploader': 'DNews',
+ 'uploader_id': 'dnews',
+ },
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+ video_id = self._search_regex(
+ r"player\.loadRevision3Item\('video_id',\s*([0-9]+)\);",
+ webpage, 'video ID')
+
+ all_info = self._download_json(
+ 'https://testtube.com/api/getPlaylist.json?api_key=ba9c741bce1b9d8e3defcc22193f3651b8867e62&codecs=h264,vp8,theora&video_id=%s' % video_id,
+ video_id)
+ info = all_info['items'][0]
+
+ formats = []
+ for vcodec, fdatas in info['media'].items():
+ for name, fdata in fdatas.items():
+ formats.append({
+ 'format_id': '%s-%s' % (vcodec, name),
+ 'url': fdata['url'],
+ 'vcodec': vcodec,
+ 'tbr': fdata.get('bitrate'),
+ })
+ self._sort_formats(formats)
+
+ duration = int_or_none(info.get('duration'))
+ images = info.get('images')
+ thumbnails = None
+ preference = qualities(['mini', 'small', 'medium', 'large'])
+ if images:
+ thumbnails = [{
+ 'id': thumbnail_id,
+ 'url': img_url,
+ 'preference': preference(thumbnail_id)
+ } for thumbnail_id, img_url in images.items()]
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': info['title'],
+ 'description': info.get('summary'),
+ 'thumbnails': thumbnails,
+ 'uploader': info.get('show', {}).get('name'),
+ 'uploader_id': info.get('show', {}).get('slug'),
+ 'duration': duration,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py
index fdae17b1b..025d0877c 100644
--- a/youtube_dl/extractor/tf1.py
+++ b/youtube_dl/extractor/tf1.py
@@ -1,15 +1,13 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
class TF1IE(InfoExtractor):
"""TF1 uses the wat.tv player."""
- _VALID_URL = r'http://videos\.tf1\.fr/.*-(?P<id>.*?)\.html'
- _TEST = {
+ _VALID_URL = r'http://(?:videos\.tf1|www\.tfou)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html'
+ _TESTS = {
'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
'info_dict': {
'id': '10635995',
@@ -21,16 +19,28 @@ class TF1IE(InfoExtractor):
# Sometimes wat serves the whole file with the --test option
'skip_download': True,
},
+ }, {
+ 'url': 'http://www.tfou.fr/chuggington/videos/le-grand-mysterioso-chuggington-7085291-739.html',
+ 'info_dict': {
+ 'id': '12043945',
+ 'ext': 'mp4',
+ 'title': 'Le grand Mystérioso - Chuggington',
+ 'description': 'Le grand Mystérioso - Emery rêve qu\'un article lui soit consacré dans le journal.',
+ 'upload_date': '20150103',
+ },
+ 'params': {
+ # Sometimes wat serves the whole file with the --test option
+ 'skip_download': True,
+ },
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
embed_url = self._html_search_regex(
- r'"(https://www.wat.tv/embedframe/.*?)"', webpage, 'embed url')
+ r'["\'](https?://www.wat.tv/embedframe/.*?)["\']', webpage, 'embed url')
embed_page = self._download_webpage(embed_url, video_id,
- 'Downloading embed player page')
+ 'Downloading embed player page')
wat_id = self._search_regex(r'UVID=(.*?)&', embed_page, 'wat id')
wat_info = self._download_json(
'http://www.wat.tv/interface/contentv3/%s' % wat_id, video_id)
diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py
index 0be793b1c..110ed976d 100644
--- a/youtube_dl/extractor/theplatform.py
+++ b/youtube_dl/extractor/theplatform.py
@@ -3,9 +3,12 @@ from __future__ import unicode_literals
import re
import json
-from .common import InfoExtractor
-from ..utils import (
+from .subtitles import SubtitlesInfoExtractor
+from ..compat import (
compat_str,
+)
+from ..utils import (
+ determine_ext,
ExtractorError,
xpath_with_ns,
)
@@ -13,7 +16,7 @@ from ..utils import (
_x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language'})
-class ThePlatformIE(InfoExtractor):
+class ThePlatformIE(SubtitlesInfoExtractor):
_VALID_URL = r'''(?x)
(?:https?://(?:link|player)\.theplatform\.com/[sp]/[^/]+/
(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/)?
@@ -35,9 +38,20 @@ class ThePlatformIE(InfoExtractor):
},
}
- def _get_info(self, video_id, smil_url):
- meta = self._download_xml(smil_url, video_id)
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ if mobj.group('config'):
+ config_url = url + '&form=json'
+ config_url = config_url.replace('swf/', 'config/')
+ config_url = config_url.replace('onsite/', 'onsite/config/')
+ config = self._download_json(config_url, video_id, 'Downloading config')
+ smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4&manifest=f4m'
+ else:
+ smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?'
+ 'format=smil&mbr=true'.format(video_id))
+ meta = self._download_xml(smil_url, video_id)
try:
error_msg = next(
n.attrib['abstract']
@@ -52,6 +66,20 @@ class ThePlatformIE(InfoExtractor):
info_json = self._download_webpage(info_url, video_id)
info = json.loads(info_json)
+ subtitles = {}
+ captions = info.get('captions')
+ if isinstance(captions, list):
+ for caption in captions:
+ lang, src = caption.get('lang'), caption.get('src')
+ if lang and src:
+ subtitles[lang] = src
+
+ if self._downloader.params.get('listsubtitles', False):
+ self._list_available_subtitles(video_id, subtitles)
+ return
+
+ subtitles = self.extract_subtitles(video_id, subtitles)
+
head = meta.find(_x('smil:head'))
body = meta.find(_x('smil:body'))
@@ -89,32 +117,23 @@ class ThePlatformIE(InfoExtractor):
for f in switch.findall(_x('smil:video')):
attr = f.attrib
vbr = int(attr['system-bitrate']) // 1000
+ ext = determine_ext(attr['src'])
+ if ext == 'once':
+ ext = 'mp4'
formats.append({
'format_id': compat_str(vbr),
'url': attr['src'],
'vbr': vbr,
+ 'ext': ext,
})
self._sort_formats(formats)
return {
'id': video_id,
'title': info['title'],
+ 'subtitles': subtitles,
'formats': formats,
'description': info['description'],
'thumbnail': info['defaultThumbnailUrl'],
- 'duration': info['duration']//1000,
+ 'duration': info['duration'] // 1000,
}
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- if mobj.group('config'):
- config_url = url+ '&form=json'
- config_url = config_url.replace('swf/', 'config/')
- config_url = config_url.replace('onsite/', 'onsite/config/')
- config = self._download_json(config_url, video_id, 'Downloading config')
- smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4&manifest=f4m'
- else:
- smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?'
- 'format=smil&mbr=true'.format(video_id))
- return self._get_info(video_id, smil_url)
diff --git a/youtube_dl/extractor/thisav.py b/youtube_dl/extractor/thisav.py
index bfb9d2fc9..7f323c938 100644
--- a/youtube_dl/extractor/thisav.py
+++ b/youtube_dl/extractor/thisav.py
@@ -1,4 +1,4 @@
-#coding: utf-8
+# coding: utf-8
from __future__ import unicode_literals
import re
@@ -36,12 +36,12 @@ class ThisAVIE(InfoExtractor):
r': <a href="http://www.thisav.com/user/[0-9]+/([^"]+)">(?:[^<]+)</a>',
webpage, 'uploader id', fatal=False)
ext = determine_ext(video_url)
-
+
return {
- 'id': video_id,
- 'url': video_url,
- 'uploader': uploader,
+ 'id': video_id,
+ 'url': video_url,
+ 'uploader': uploader,
'uploader_id': uploader_id,
- 'title': title,
- 'ext': ext,
+ 'title': title,
+ 'ext': ext,
}
diff --git a/youtube_dl/extractor/tinypic.py b/youtube_dl/extractor/tinypic.py
index a4aa25f66..e036b8cdf 100644
--- a/youtube_dl/extractor/tinypic.py
+++ b/youtube_dl/extractor/tinypic.py
@@ -9,26 +9,32 @@ from ..utils import ExtractorError
class TinyPicIE(InfoExtractor):
IE_NAME = 'tinypic'
IE_DESC = 'tinypic.com videos'
- _VALID_URL = r'http://tinypic\.com/player\.php\?v=(?P<id>[^&]+)&s=\d+'
-
- _TEST = {
- 'url': 'http://tinypic.com/player.php?v=6xw7tc%3E&s=5#.UtqZmbRFCM8',
- 'md5': '609b74432465364e72727ebc6203f044',
- 'info_dict': {
- 'id': '6xw7tc',
- 'ext': 'flv',
- 'title': 'shadow phenomenon weird',
+ _VALID_URL = r'http://(?:.+?\.)?tinypic\.com/player\.php\?v=(?P<id>[^&]+)&s=\d+'
+
+ _TESTS = [
+ {
+ 'url': 'http://tinypic.com/player.php?v=6xw7tc%3E&s=5#.UtqZmbRFCM8',
+ 'md5': '609b74432465364e72727ebc6203f044',
+ 'info_dict': {
+ 'id': '6xw7tc',
+ 'ext': 'flv',
+ 'title': 'shadow phenomenon weird',
+ },
+ },
+ {
+ 'url': 'http://de.tinypic.com/player.php?v=dy90yh&s=8',
+ 'only_matching': True,
}
- }
+ ]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id, 'Downloading page')
-
+
mobj = re.search(r'(?m)fo\.addVariable\("file",\s"(?P<fileid>[\da-z]+)"\);\n'
- '\s+fo\.addVariable\("s",\s"(?P<serverid>\d+)"\);', webpage)
+ '\s+fo\.addVariable\("s",\s"(?P<serverid>\d+)"\);', webpage)
if mobj is None:
raise ExtractorError('Video %s does not exist' % video_id, expected=True)
@@ -47,4 +53,4 @@ class TinyPicIE(InfoExtractor):
'url': video_url,
'thumbnail': thumbnail,
'title': title
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/tlc.py b/youtube_dl/extractor/tlc.py
index d848ee186..9f9e388c5 100644
--- a/youtube_dl/extractor/tlc.py
+++ b/youtube_dl/extractor/tlc.py
@@ -5,7 +5,7 @@ import re
from .common import InfoExtractor
from .brightcove import BrightcoveIE
from .discovery import DiscoveryIE
-from ..utils import compat_urlparse
+from ..compat import compat_urlparse
class TlcIE(DiscoveryIE):
@@ -36,9 +36,10 @@ class TlcDeIE(InfoExtractor):
'ext': 'mp4',
'title': 'Breaking Amish: Die Welt da draußen',
'uploader': 'Discovery Networks - Germany',
- 'description': 'Vier Amische und eine Mennonitin wagen in New York'
+ 'description': (
+ 'Vier Amische und eine Mennonitin wagen in New York'
' den Sprung in ein komplett anderes Leben. Begleitet sie auf'
- ' ihrem spannenden Weg.',
+ ' ihrem spannenden Weg.'),
},
}
diff --git a/youtube_dl/extractor/tmz.py b/youtube_dl/extractor/tmz.py
new file mode 100644
index 000000000..c5c6fdc51
--- /dev/null
+++ b/youtube_dl/extractor/tmz.py
@@ -0,0 +1,32 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class TMZIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?tmz\.com/videos/(?P<id>[^/]+)/?'
+ _TEST = {
+ 'url': 'http://www.tmz.com/videos/0_okj015ty/',
+ 'md5': '791204e3bf790b1426cb2db0706184c0',
+ 'info_dict': {
+ 'id': '0_okj015ty',
+ 'url': 'http://tmz.vo.llnwd.net/o28/2014-03/13/0_okj015ty_0_rt8ro3si_2.mp4',
+ 'ext': 'mp4',
+ 'title': 'Kim Kardashian\'s Boobs Unlock a Mystery!',
+ 'description': 'Did Kim Kardasain try to one-up Khloe by one-upping Kylie??? Or is she just showing off her amazing boobs?',
+ 'thumbnail': r're:http://cdnbakmi\.kaltura\.com/.*thumbnail.*',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ return {
+ 'id': video_id,
+ 'url': self._html_search_meta('VideoURL', webpage, fatal=True),
+ 'title': self._og_search_title(webpage),
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': self._html_search_meta('ThumbURL', webpage),
+ }
diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py
index 4956f8577..d48cbbf14 100644
--- a/youtube_dl/extractor/tnaflix.py
+++ b/youtube_dl/extractor/tnaflix.py
@@ -12,7 +12,7 @@ from ..utils import (
class TNAFlixIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?tnaflix\.com/(?P<cat_id>[\w-]+)/(?P<display_id>[\w-]+)/video(?P<id>\d+)'
- _TITLE_REGEX = None
+ _TITLE_REGEX = r'<title>(.+?) - TNAFlix Porn Videos</title>'
_DESCRIPTION_REGEX = r'<h3 itemprop="description">([^<]+)</h3>'
_CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"'
@@ -49,8 +49,8 @@ class TNAFlixIE(InfoExtractor):
if duration:
duration = parse_duration(duration[1:])
- cfg_url = self._html_search_regex(
- self._CONFIG_REGEX, webpage, 'flashvars.config')
+ cfg_url = self._proto_relative_url(self._html_search_regex(
+ self._CONFIG_REGEX, webpage, 'flashvars.config'), 'http:')
cfg_xml = self._download_xml(
cfg_url, display_id, note='Downloading metadata',
@@ -71,7 +71,7 @@ class TNAFlixIE(InfoExtractor):
fmt['height'] = int(m.group(1))
formats.append(fmt)
self._sort_formats(formats)
-
+
return {
'id': video_id,
'display_id': display_id,
diff --git a/youtube_dl/extractor/traileraddict.py b/youtube_dl/extractor/traileraddict.py
index 11407428b..1c53a3fd0 100644
--- a/youtube_dl/extractor/traileraddict.py
+++ b/youtube_dl/extractor/traileraddict.py
@@ -25,7 +25,7 @@ class TrailerAddictIE(InfoExtractor):
webpage = self._download_webpage(url, name)
title = self._search_regex(r'<title>(.+?)</title>',
- webpage, 'video title').replace(' - Trailer Addict','')
+ webpage, 'video title').replace(' - Trailer Addict', '')
view_count_str = self._search_regex(
r'<span class="views_n">([0-9,.]+)</span>',
webpage, 'view count', fatal=False)
@@ -43,12 +43,12 @@ class TrailerAddictIE(InfoExtractor):
fvar = "fvar"
info_url = "http://www.traileraddict.com/%s.php?tid=%s" % (fvar, str(video_id))
- info_webpage = self._download_webpage(info_url, video_id , "Downloading the info webpage")
+ info_webpage = self._download_webpage(info_url, video_id, "Downloading the info webpage")
final_url = self._search_regex(r'&fileurl=(.+)',
- info_webpage, 'Download url').replace('%3F','?')
+ info_webpage, 'Download url').replace('%3F', '?')
thumbnail_url = self._search_regex(r'&image=(.+?)&',
- info_webpage, 'thumbnail url')
+ info_webpage, 'thumbnail url')
description = self._html_search_regex(
r'(?s)<div class="synopsis">.*?<div class="movie_label_info"[^>]*>(.*?)</div>',
diff --git a/youtube_dl/extractor/trilulilu.py b/youtube_dl/extractor/trilulilu.py
index d64aaa41f..220a05b7b 100644
--- a/youtube_dl/extractor/trilulilu.py
+++ b/youtube_dl/extractor/trilulilu.py
@@ -1,28 +1,28 @@
+from __future__ import unicode_literals
+
import json
-import re
from .common import InfoExtractor
class TriluliluIE(InfoExtractor):
- _VALID_URL = r'(?x)(?:https?://)?(?:www\.)?trilulilu\.ro/video-(?P<category>[^/]+)/(?P<video_id>[^/]+)'
+ _VALID_URL = r'https?://(?:www\.)?trilulilu\.ro/video-[^/]+/(?P<id>[^/]+)'
_TEST = {
- u"url": u"http://www.trilulilu.ro/video-animatie/big-buck-bunny-1",
- u'file': u"big-buck-bunny-1.mp4",
- u'info_dict': {
- u"title": u"Big Buck Bunny",
- u"description": u":) pentru copilul din noi",
+ 'url': 'http://www.trilulilu.ro/video-animatie/big-buck-bunny-1',
+ 'info_dict': {
+ 'id': 'big-buck-bunny-1',
+ 'ext': 'mp4',
+ 'title': 'Big Buck Bunny',
+ 'description': ':) pentru copilul din noi',
},
# Server ignores Range headers (--test)
- u"params": {
- u"skip_download": True
+ 'params': {
+ 'skip_download': True
}
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('video_id')
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = self._og_search_title(webpage)
@@ -30,20 +30,20 @@ class TriluliluIE(InfoExtractor):
description = self._og_search_description(webpage)
log_str = self._search_regex(
- r'block_flash_vars[ ]=[ ]({[^}]+})', webpage, u'log info')
+ r'block_flash_vars[ ]=[ ]({[^}]+})', webpage, 'log info')
log = json.loads(log_str)
- format_url = (u'http://fs%(server)s.trilulilu.ro/%(hash)s/'
- u'video-formats2' % log)
+ format_url = ('http://fs%(server)s.trilulilu.ro/%(hash)s/'
+ 'video-formats2' % log)
format_doc = self._download_xml(
format_url, video_id,
- note=u'Downloading formats',
- errnote=u'Error while downloading formats')
-
+ note='Downloading formats',
+ errnote='Error while downloading formats')
+
video_url_template = (
- u'http://fs%(server)s.trilulilu.ro/stream.php?type=video'
- u'&source=site&hash=%(hash)s&username=%(userid)s&'
- u'key=ministhebest&format=%%s&sig=&exp=' %
+ 'http://fs%(server)s.trilulilu.ro/stream.php?type=video'
+ '&source=site&hash=%(hash)s&username=%(userid)s&'
+ 'key=ministhebest&format=%%s&sig=&exp=' %
log)
formats = [
{
@@ -63,4 +63,3 @@ class TriluliluIE(InfoExtractor):
'description': description,
'thumbnail': thumbnail,
}
-
diff --git a/youtube_dl/extractor/trutube.py b/youtube_dl/extractor/trutube.py
index 57f956683..e7b79243a 100644
--- a/youtube_dl/extractor/trutube.py
+++ b/youtube_dl/extractor/trutube.py
@@ -1,13 +1,12 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
+from ..utils import xpath_text
class TruTubeIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?trutube\.tv/video/(?P<id>[0-9]+)/.*'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?trutube\.tv/(?:video/|nuevo/player/embed\.php\?v=)(?P<id>[0-9]+)'
+ _TESTS = [{
'url': 'http://trutube.tv/video/14880/Ramses-II-Proven-To-Be-A-Red-Headed-Caucasoid-',
'md5': 'c5b6e301b0a2040b074746cbeaa26ca1',
'info_dict': {
@@ -16,29 +15,26 @@ class TruTubeIE(InfoExtractor):
'title': 'Ramses II - Proven To Be A Red Headed Caucasoid',
'thumbnail': 're:^http:.*\.jpg$',
}
- }
+ }, {
+ 'url': 'https://trutube.tv/nuevo/player/embed.php?v=14880',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
- video_title = self._og_search_title(webpage).strip()
- thumbnail = self._search_regex(
- r"var splash_img = '([^']+)';", webpage, 'thumbnail', fatal=False)
+ config = self._download_xml(
+ 'https://trutube.tv/nuevo/player/config.php?v=%s' % video_id,
+ video_id, transform_source=lambda s: s.strip())
- all_formats = re.finditer(
- r"var (?P<key>[a-z]+)_video_file\s*=\s*'(?P<url>[^']+)';", webpage)
- formats = [{
- 'format_id': m.group('key'),
- 'quality': -i,
- 'url': m.group('url'),
- } for i, m in enumerate(all_formats)]
- self._sort_formats(formats)
+ # filehd is always 404
+ video_url = xpath_text(config, './file', 'video URL', fatal=True)
+ title = xpath_text(config, './title', 'title').strip()
+ thumbnail = xpath_text(config, './image', ' thumbnail')
return {
'id': video_id,
- 'title': video_title,
- 'formats': formats,
+ 'url': video_url,
+ 'title': title,
'thumbnail': thumbnail,
}
diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py
index 64a1e9030..d73ad3762 100644
--- a/youtube_dl/extractor/tube8.py
+++ b/youtube_dl/extractor/tube8.py
@@ -4,9 +4,11 @@ import json
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse_urlparse,
compat_urllib_request,
+)
+from ..utils import (
int_or_none,
str_to_int,
)
diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py
index dcd823d08..c89de5ba4 100644
--- a/youtube_dl/extractor/tudou.py
+++ b/youtube_dl/extractor/tudou.py
@@ -9,7 +9,7 @@ from .common import InfoExtractor
class TudouIE(InfoExtractor):
- _VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(?:listplay|programs|albumplay)/(?:view|(.+?))/(?:([^/]+)|([^/]+))(?:\.html)?'
+ _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:listplay|programs(?:/view)?|albumplay)/.*?/(?P<id>[^/?#]+?)(?:\.html)?/?(?:$|[?#])'
_TESTS = [{
'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',
'md5': '140a49ed444bd22f93330985d8475fcb',
@@ -27,26 +27,18 @@ class TudouIE(InfoExtractor):
'title': 'La Sylphide-Bolshoi-Ekaterina Krysanova & Vyacheslav Lopatin 2012',
'thumbnail': 're:^https?://.*\.jpg$',
}
- }, {
- 'url': 'http://www.tudou.com/albumplay/TenTw_JgiPM/PzsAs5usU9A.html',
- 'info_dict': {
- 'title': 'todo.mp4',
- },
- 'add_ie': ['Youku'],
- 'skip': 'Only works from China'
}]
- def _url_for_id(self, id, quality = None):
- info_url = "http://v2.tudou.com/f?id="+str(id)
+ def _url_for_id(self, id, quality=None):
+ info_url = "http://v2.tudou.com/f?id=" + str(id)
if quality:
info_url += '&hd' + quality
webpage = self._download_webpage(info_url, id, "Opening the info webpage")
- final_url = self._html_search_regex('>(.+?)</f>',webpage, 'video url')
+ final_url = self._html_search_regex('>(.+?)</f>', webpage, 'video url')
return final_url
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group(2)
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
m = re.search(r'vcode:\s*[\'"](.+?)[\'"]', webpage)
@@ -73,7 +65,7 @@ class TudouIE(InfoExtractor):
result = []
len_parts = len(parts)
if len_parts > 1:
- self.to_screen(u'%s: found %s parts' % (video_id, len_parts))
+ self.to_screen('%s: found %s parts' % (video_id, len_parts))
for part in parts:
part_id = part['k']
final_url = self._url_for_id(part_id, quality)
@@ -87,4 +79,9 @@ class TudouIE(InfoExtractor):
}
result.append(part_info)
- return result
+ return {
+ '_type': 'multi_video',
+ 'entries': result,
+ 'id': video_id,
+ 'title': title,
+ }
diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py
index 306fe8974..2a1ae5a71 100644
--- a/youtube_dl/extractor/tumblr.py
+++ b/youtube_dl/extractor/tumblr.py
@@ -4,9 +4,6 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
-)
class TumblrIE(InfoExtractor):
@@ -18,7 +15,7 @@ class TumblrIE(InfoExtractor):
'id': '54196191430',
'ext': 'mp4',
'title': 'tatiana maslany news, Orphan Black || DVD extra - behind the scenes ↳...',
- 'description': 'md5:dfac39636969fe6bf1caa2d50405f069',
+ 'description': 'md5:37db8211e40b50c7c44e95da14f630b7',
'thumbnail': 're:http://.*\.jpg',
}
}, {
@@ -27,7 +24,7 @@ class TumblrIE(InfoExtractor):
'info_dict': {
'id': '90208453769',
'ext': 'mp4',
- 'title': '5SOS STRUM ;)',
+ 'title': '5SOS STRUM ;]',
'description': 'md5:dba62ac8639482759c8eb10ce474586a',
'thumbnail': 're:http://.*\.jpg',
}
@@ -41,18 +38,12 @@ class TumblrIE(InfoExtractor):
url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
webpage = self._download_webpage(url, video_id)
- re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
- video = re.search(re_video, webpage)
- if video is None:
- raise ExtractorError('Unable to extract video')
- video_url = video.group('video_url')
- ext = video.group('ext')
-
- video_thumbnail = self._search_regex(
- r'posters.*?\[\\x22(.*?)\\x22',
- webpage, 'thumbnail', fatal=False) # We pick the first poster
- if video_thumbnail:
- video_thumbnail = video_thumbnail.replace('\\\\/', '/')
+ iframe_url = self._search_regex(
+ r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'',
+ webpage, 'iframe url')
+ iframe = self._download_webpage(iframe_url, video_id)
+ video_url = self._search_regex(r'<source src="([^"]+)"',
+ iframe, 'video url')
# The only place where you can get a title, it's not complete,
# but searching in other places doesn't work for all videos
@@ -62,9 +53,9 @@ class TumblrIE(InfoExtractor):
return {
'id': video_id,
- 'url': video_url,
- 'title': video_title,
- 'description': self._html_search_meta('description', webpage),
- 'thumbnail': video_thumbnail,
- 'ext': ext,
+ 'url': video_url,
+ 'ext': 'mp4',
+ 'title': video_title,
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
}
diff --git a/youtube_dl/extractor/tunein.py b/youtube_dl/extractor/tunein.py
new file mode 100644
index 000000000..b6b1f2568
--- /dev/null
+++ b/youtube_dl/extractor/tunein.py
@@ -0,0 +1,106 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class TuneInIE(InfoExtractor):
+ _VALID_URL = r'''(?x)https?://(?:www\.)?
+ (?:
+ tunein\.com/
+ (?:
+ radio/.*?-s|
+ station/.*?StationId\=
+ )(?P<id>[0-9]+)
+ |tun\.in/(?P<redirect_id>[A-Za-z0-9]+)
+ )
+ '''
+ _API_URL_TEMPLATE = 'http://tunein.com/tuner/tune/?stationId={0:}&tuneType=Station'
+
+ _INFO_DICT = {
+ 'id': '34682',
+ 'title': 'Jazz 24 on 88.5 Jazz24 - KPLU-HD2',
+ 'ext': 'aac',
+ 'thumbnail': 're:^https?://.*\.png$',
+ 'location': 'Tacoma, WA',
+ }
+ _TESTS = [
+ {
+ 'url': 'http://tunein.com/radio/Jazz24-885-s34682/',
+ 'info_dict': _INFO_DICT,
+ 'params': {
+ 'skip_download': True, # live stream
+ },
+ },
+ { # test redirection
+ 'url': 'http://tun.in/ser7s',
+ 'info_dict': _INFO_DICT,
+ 'params': {
+ 'skip_download': True, # live stream
+ },
+ },
+ ]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ redirect_id = mobj.group('redirect_id')
+ if redirect_id:
+ # The server doesn't support HEAD requests
+ urlh = self._request_webpage(
+ url, redirect_id, note='Downloading redirect page')
+ url = urlh.geturl()
+ self.to_screen('Following redirect: %s' % url)
+ mobj = re.match(self._VALID_URL, url)
+ station_id = mobj.group('id')
+
+ station_info = self._download_json(
+ self._API_URL_TEMPLATE.format(station_id),
+ station_id, note='Downloading station JSON')
+
+ title = station_info['Title']
+ thumbnail = station_info.get('Logo')
+ location = station_info.get('Location')
+ streams_url = station_info.get('StreamUrl')
+ if not streams_url:
+ raise ExtractorError('No downloadable streams found',
+ expected=True)
+ stream_data = self._download_webpage(
+ streams_url, station_id, note='Downloading stream data')
+ streams = json.loads(self._search_regex(
+ r'\((.*)\);', stream_data, 'stream info'))['Streams']
+
+ is_live = None
+ formats = []
+ for stream in streams:
+ if stream.get('Type') == 'Live':
+ is_live = True
+ reliability = stream.get('Reliability')
+ format_note = (
+ 'Reliability: %d%%' % reliability
+ if reliability is not None else None)
+ formats.append({
+ 'preference': (
+ 0 if reliability is None or reliability > 90
+ else 1),
+ 'abr': stream.get('Bandwidth'),
+ 'ext': stream.get('MediaType').lower(),
+ 'acodec': stream.get('MediaType'),
+ 'vcodec': 'none',
+ 'url': stream.get('Url'),
+ 'source_preference': reliability,
+ 'format_note': format_note,
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': station_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': thumbnail,
+ 'location': location,
+ 'is_live': is_live,
+ }
diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py
index d516b6427..4de0aac52 100644
--- a/youtube_dl/extractor/tutv.py
+++ b/youtube_dl/extractor/tutv.py
@@ -1,10 +1,9 @@
from __future__ import unicode_literals
import base64
-import re
from .common import InfoExtractor
-from ..utils import compat_parse_qs
+from ..compat import compat_parse_qs
class TutvIE(InfoExtractor):
@@ -20,10 +19,9 @@ class TutvIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
+
internal_id = self._search_regex(r'codVideo=([0-9]+)', webpage, 'internal video ID')
data_content = self._download_webpage(
diff --git a/youtube_dl/extractor/tvigle.py b/youtube_dl/extractor/tvigle.py
index 27962b5fe..ba65996dc 100644
--- a/youtube_dl/extractor/tvigle.py
+++ b/youtube_dl/extractor/tvigle.py
@@ -1,32 +1,30 @@
# encoding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..utils import (
float_or_none,
- str_to_int,
+ parse_age_limit,
)
class TvigleIE(InfoExtractor):
IE_NAME = 'tvigle'
IE_DESC = 'Интернет-телевидение Tvigle.ru'
- _VALID_URL = r'http://(?:www\.)?tvigle\.ru/(?:[^/]+/)+(?P<display_id>[^/]+)/$'
+ _VALID_URL = r'http://(?:www\.)?tvigle\.ru/(?:[^/]+/)+(?P<id>[^/]+)/$'
_TESTS = [
{
- 'url': 'http://www.tvigle.ru/video/brat/',
- 'md5': 'ff4344a4894b0524441fb6f8218dc716',
+ 'url': 'http://www.tvigle.ru/video/sokrat/',
+ 'md5': '36514aed3657d4f70b4b2cef8eb520cd',
'info_dict': {
- 'id': '5118490',
- 'display_id': 'brat',
- 'ext': 'mp4',
- 'title': 'Брат',
- 'description': 'md5:d16ac7c0b47052ea51fddb92c4e413eb',
- 'duration': 5722.6,
- 'age_limit': 16,
+ 'id': '1848932',
+ 'display_id': 'sokrat',
+ 'ext': 'flv',
+ 'title': 'Сократ',
+ 'description': 'md5:a05bd01be310074d5833efc6743be95e',
+ 'duration': 6586,
+ 'age_limit': 0,
},
},
{
@@ -44,8 +42,7 @@ class TvigleIE(InfoExtractor):
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- display_id = mobj.group('display_id')
+ display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
@@ -60,8 +57,8 @@ class TvigleIE(InfoExtractor):
title = item['title']
description = item['description']
thumbnail = item['thumbnail']
- duration = float_or_none(item['durationMilliseconds'], 1000)
- age_limit = str_to_int(item['ageRestrictions'])
+ duration = float_or_none(item.get('durationMilliseconds'), 1000)
+ age_limit = parse_age_limit(item.get('ageRestrictions'))
formats = []
for vcodec, fmts in item['videos'].items():
@@ -84,4 +81,4 @@ class TvigleIE(InfoExtractor):
'duration': duration,
'age_limit': age_limit,
'formats': formats,
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py
index bfed9dd04..f57d609d4 100644
--- a/youtube_dl/extractor/tvp.py
+++ b/youtube_dl/extractor/tvp.py
@@ -1,42 +1,139 @@
-import json
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
class TvpIE(InfoExtractor):
- IE_NAME = u'tvp.pl'
- _VALID_URL = r'https?://www\.tvp\.pl/.*?wideo/(?P<date>\d+)/(?P<id>\d+)'
-
- _TEST = {
- u'url': u'http://www.tvp.pl/warszawa/magazyny/campusnews/wideo/31102013/12878238',
- u'md5': u'148408967a6a468953c0a75cbdaf0d7a',
- u'file': u'12878238.wmv',
- u'info_dict': {
- u'title': u'31.10.2013 - Odcinek 2',
- u'description': u'31.10.2013 - Odcinek 2',
+ IE_NAME = 'tvp.pl'
+ _VALID_URL = r'https?://(?:vod|www)\.tvp\.pl/.*/(?P<id>\d+)$'
+
+ _TESTS = [{
+ 'url': 'http://vod.tvp.pl/filmy-fabularne/filmy-za-darmo/ogniem-i-mieczem/wideo/odc-2/4278035',
+ 'md5': 'cdd98303338b8a7f7abab5cd14092bf2',
+ 'info_dict': {
+ 'id': '4278035',
+ 'ext': 'wmv',
+ 'title': 'Ogniem i mieczem, odc. 2',
+ },
+ }, {
+ 'url': 'http://vod.tvp.pl/seriale/obyczajowe/czas-honoru/sezon-1-1-13/i-seria-odc-13/194536',
+ 'md5': '8aa518c15e5cc32dfe8db400dc921fbb',
+ 'info_dict': {
+ 'id': '194536',
+ 'ext': 'mp4',
+ 'title': 'Czas honoru, I seria – odc. 13',
},
- u'skip': u'Download has to use same server IP as extraction. Therefore, a good (load-balancing) DNS resolver will make the download fail.'
- }
+ }, {
+ 'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176',
+ 'md5': 'c3b15ed1af288131115ff17a17c19dda',
+ 'info_dict': {
+ 'id': '17916176',
+ 'ext': 'mp4',
+ 'title': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata',
+ },
+ }, {
+ 'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272',
+ 'md5': 'c3b15ed1af288131115ff17a17c19dda',
+ 'info_dict': {
+ 'id': '17834272',
+ 'ext': 'mp4',
+ 'title': 'Na sygnale, odc. 39',
+ },
+ }]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- webpage = self._download_webpage(url, video_id)
- json_url = 'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id
- json_params = self._download_webpage(
- json_url, video_id, u"Downloading video metadata")
-
- params = json.loads(json_params)
- self.report_extraction(video_id)
- video_url = params['video_url']
-
- title = self._og_search_title(webpage, fatal=True)
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(
+ 'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id)
+
+ title = self._search_regex(
+ r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1',
+ webpage, 'title', group='title')
+ series_title = self._search_regex(
+ r'name\s*:\s*([\'"])SeriesTitle\1\s*,\s*value\s*:\s*\1(?P<series>.+?)\1',
+ webpage, 'series', group='series', default=None)
+ if series_title:
+ title = '%s, %s' % (series_title, title)
+
+ thumbnail = self._search_regex(
+ r"poster\s*:\s*'([^']+)'", webpage, 'thumbnail', default=None)
+
+ video_url = self._search_regex(
+ r'0:{src:([\'"])(?P<url>.*?)\1', webpage, 'formats', group='url', default=None)
+ if not video_url:
+ video_url = self._download_json(
+ 'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id,
+ video_id)['video_url']
+
+ ext = video_url.rsplit('.', 1)[-1]
+ if ext != 'ism/manifest':
+ if '/' in ext:
+ ext = 'mp4'
+ formats = [{
+ 'format_id': 'direct',
+ 'url': video_url,
+ 'ext': ext,
+ }]
+ else:
+ m3u8_url = re.sub('([^/]*)\.ism/manifest', r'\1.ism/\1.m3u8', video_url)
+ formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4')
+
+ self._sort_formats(formats)
+
return {
'id': video_id,
'title': title,
- 'ext': 'wmv',
- 'url': video_url,
- 'description': self._og_search_description(webpage),
- 'thumbnail': self._og_search_thumbnail(webpage),
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ }
+
+
+class TvpSeriesIE(InfoExtractor):
+ IE_NAME = 'tvp.pl:Series'
+ _VALID_URL = r'https?://vod\.tvp\.pl/(?:[^/]+/){2}(?P<id>[^/]+)/?$'
+
+ _TESTS = [{
+ 'url': 'http://vod.tvp.pl/filmy-fabularne/filmy-za-darmo/ogniem-i-mieczem',
+ 'info_dict': {
+ 'title': 'Ogniem i mieczem',
+ 'id': '4278026',
+ },
+ 'playlist_count': 4,
+ }, {
+ 'url': 'http://vod.tvp.pl/audycje/podroze/boso-przez-swiat',
+ 'info_dict': {
+ 'title': 'Boso przez świat',
+ 'id': '9329207',
+ },
+ 'playlist_count': 86,
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id, tries=5)
+
+ title = self._html_search_regex(
+ r'(?s) id=[\'"]path[\'"]>(?:.*? / ){2}(.*?)</span>', webpage, 'series')
+ playlist_id = self._search_regex(r'nodeId:\s*(\d+)', webpage, 'playlist id')
+ playlist = self._download_webpage(
+ 'http://vod.tvp.pl/vod/seriesAjax?type=series&nodeId=%s&recommend'
+ 'edId=0&sort=&page=0&pageSize=10000' % playlist_id, display_id, tries=5,
+ note='Downloading playlist')
+
+ videos_paths = re.findall(
+ '(?s)class="shortTitle">.*?href="(/[^"]+)', playlist)
+ entries = [
+ self.url_result('http://vod.tvp.pl%s' % v_path, ie=TvpIE.ie_key())
+ for v_path in videos_paths]
+
+ return {
+ '_type': 'playlist',
+ 'id': playlist_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'entries': entries,
}
diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py
index 445e0ec41..9a53a3c74 100644
--- a/youtube_dl/extractor/tvplay.py
+++ b/youtube_dl/extractor/tvplay.py
@@ -4,9 +4,8 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_str
from ..utils import (
- ExtractorError,
- compat_str,
parse_iso8601,
qualities,
)
@@ -176,15 +175,14 @@ class TVPlayIE(InfoExtractor):
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
video = self._download_json(
'http://playapi.mtgx.tv/v1/videos/%s' % video_id, video_id, 'Downloading video JSON')
if video['is_geo_blocked']:
- raise ExtractorError(
- 'This content is not available in your country due to copyright reasons', expected=True)
+ self.report_warning(
+ 'This content might not be available in your country due to copyright reasons')
streams = self._download_json(
'http://playapi.mtgx.tv/v1/videos/stream/%s' % video_id, video_id, 'Downloading streams JSON')
@@ -208,6 +206,10 @@ class TVPlayIE(InfoExtractor):
'app': m.group('app'),
'play_path': m.group('playpath'),
})
+ elif video_url.endswith('.f4m'):
+ formats.extend(self._extract_f4m_formats(
+ video_url + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81', video_id))
+ continue
else:
fmt.update({
'url': video_url,
diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py
new file mode 100644
index 000000000..67e8bfea0
--- /dev/null
+++ b/youtube_dl/extractor/twentyfourvideo.py
@@ -0,0 +1,109 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_iso8601,
+ int_or_none,
+)
+
+
+class TwentyFourVideoIE(InfoExtractor):
+ IE_NAME = '24video'
+ _VALID_URL = r'https?://(?:www\.)?24video\.net/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P<id>\d+)'
+
+ _TESTS = [
+ {
+ 'url': 'http://www.24video.net/video/view/1044982',
+ 'md5': '48dd7646775690a80447a8dca6a2df76',
+ 'info_dict': {
+ 'id': '1044982',
+ 'ext': 'mp4',
+ 'title': 'Эротика каменного века',
+ 'description': 'Как смотрели порно в каменном веке.',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'uploader': 'SUPERTELO',
+ 'duration': 31,
+ 'timestamp': 1275937857,
+ 'upload_date': '20100607',
+ 'age_limit': 18,
+ 'like_count': int,
+ 'dislike_count': int,
+ },
+ },
+ {
+ 'url': 'http://www.24video.net/player/new24_play.swf?id=1044982',
+ 'only_matching': True,
+ }
+ ]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(
+ 'http://www.24video.net/video/view/%s' % video_id, video_id)
+
+ title = self._og_search_title(webpage)
+ description = self._html_search_regex(
+ r'<span itemprop="description">([^<]+)</span>', webpage, 'description', fatal=False)
+ thumbnail = self._og_search_thumbnail(webpage)
+ duration = int_or_none(self._og_search_property(
+ 'duration', webpage, 'duration', fatal=False))
+ timestamp = parse_iso8601(self._search_regex(
+ r'<time id="video-timeago" datetime="([^"]+)" itemprop="uploadDate">',
+ webpage, 'upload date'))
+
+ uploader = self._html_search_regex(
+ r'Загрузил\s*<a href="/jsecUser/movies/[^"]+" class="link">([^<]+)</a>',
+ webpage, 'uploader', fatal=False)
+
+ view_count = int_or_none(self._html_search_regex(
+ r'<span class="video-views">(\d+) просмотр',
+ webpage, 'view count', fatal=False))
+ comment_count = int_or_none(self._html_search_regex(
+ r'<div class="comments-title" id="comments-count">(\d+) комментари',
+ webpage, 'comment count', fatal=False))
+
+ formats = []
+
+ pc_video = self._download_xml(
+ 'http://www.24video.net/video/xml/%s?mode=play' % video_id,
+ video_id, 'Downloading PC video URL').find('.//video')
+
+ formats.append({
+ 'url': pc_video.attrib['url'],
+ 'format_id': 'pc',
+ 'quality': 1,
+ })
+
+ like_count = int_or_none(pc_video.get('ratingPlus'))
+ dislike_count = int_or_none(pc_video.get('ratingMinus'))
+ age_limit = 18 if pc_video.get('adult') == 'true' else 0
+
+ mobile_video = self._download_xml(
+ 'http://www.24video.net/video/xml/%s' % video_id,
+ video_id, 'Downloading mobile video URL').find('.//video')
+
+ formats.append({
+ 'url': mobile_video.attrib['url'],
+ 'format_id': 'mobile',
+ 'quality': 0,
+ })
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'uploader': uploader,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'view_count': view_count,
+ 'comment_count': comment_count,
+ 'like_count': like_count,
+ 'dislike_count': dislike_count,
+ 'age_limit': age_limit,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py
index 36aa1ad6e..87290d002 100644
--- a/youtube_dl/extractor/twitch.py
+++ b/youtube_dl/extractor/twitch.py
@@ -1,51 +1,28 @@
+# coding: utf-8
from __future__ import unicode_literals
import itertools
import re
+import random
from .common import InfoExtractor
+from ..compat import (
+ compat_str,
+ compat_urllib_parse,
+ compat_urllib_request,
+)
from ..utils import (
ExtractorError,
parse_iso8601,
)
-class TwitchIE(InfoExtractor):
- # TODO: One broadcast may be split into multiple videos. The key
- # 'broadcast_id' is the same for all parts, and 'broadcast_part'
- # starts at 1 and increases. Can we treat all parts as one video?
- _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?twitch\.tv/
- (?:
- (?P<channelid>[^/]+)|
- (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
- (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
- )
- /?(?:\#.*)?$
- """
- _PAGE_LIMIT = 100
+class TwitchBaseIE(InfoExtractor):
+ _VALID_URL_BASE = r'https?://(?:www\.)?twitch\.tv'
+
_API_BASE = 'https://api.twitch.tv'
- _TESTS = [{
- 'url': 'http://www.twitch.tv/riotgames/b/577357806',
- 'info_dict': {
- 'id': 'a577357806',
- 'title': 'Worlds Semifinals - Star Horn Royal Club vs. OMG',
- },
- 'playlist_mincount': 12,
- }, {
- 'url': 'http://www.twitch.tv/acracingleague/c/5285812',
- 'info_dict': {
- 'id': 'c5285812',
- 'title': 'ACRL Off Season - Sports Cars @ Nordschleife',
- },
- 'playlist_mincount': 3,
- }, {
- 'url': 'http://www.twitch.tv/vanillatv',
- 'info_dict': {
- 'id': 'vanillatv',
- 'title': 'VanillaTV',
- },
- 'playlist_mincount': 412,
- }]
+ _USHER_BASE = 'http://usher.twitch.tv'
+ _LOGIN_URL = 'https://secure.twitch.tv/user/login'
def _handle_error(self, response):
if not isinstance(response, dict):
@@ -57,21 +34,60 @@ class TwitchIE(InfoExtractor):
expected=True)
def _download_json(self, url, video_id, note='Downloading JSON metadata'):
- response = super(TwitchIE, self)._download_json(url, video_id, note)
+ response = super(TwitchBaseIE, self)._download_json(url, video_id, note)
self._handle_error(response)
return response
- def _extract_media(self, item, item_id):
- ITEMS = {
- 'a': 'video',
- 'c': 'chapter',
+ def _real_initialize(self):
+ self._login()
+
+ def _login(self):
+ (username, password) = self._get_login_info()
+ if username is None:
+ return
+
+ login_page = self._download_webpage(
+ self._LOGIN_URL, None, 'Downloading login page')
+
+ authenticity_token = self._search_regex(
+ r'<input name="authenticity_token" type="hidden" value="([^"]+)"',
+ login_page, 'authenticity token')
+
+ login_form = {
+ 'utf8': '✓'.encode('utf-8'),
+ 'authenticity_token': authenticity_token,
+ 'redirect_on_login': '',
+ 'embed_form': 'false',
+ 'mp_source_action': '',
+ 'follow': '',
+ 'user[login]': username,
+ 'user[password]': password,
}
- info = self._extract_info(self._download_json(
+
+ request = compat_urllib_request.Request(
+ self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
+ request.add_header('Referer', self._LOGIN_URL)
+ response = self._download_webpage(
+ request, None, 'Logging in as %s' % username)
+
+ m = re.search(
+ r"id=([\"'])login_error_message\1[^>]*>(?P<msg>[^<]+)", response)
+ if m:
+ raise ExtractorError(
+ 'Unable to login: %s' % m.group('msg').strip(), expected=True)
+
+
+class TwitchItemBaseIE(TwitchBaseIE):
+ def _download_info(self, item, item_id):
+ return self._extract_info(self._download_json(
'%s/kraken/videos/%s%s' % (self._API_BASE, item, item_id), item_id,
- 'Downloading %s info JSON' % ITEMS[item]))
+ 'Downloading %s info JSON' % self._ITEM_TYPE))
+
+ def _extract_media(self, item_id):
+ info = self._download_info(self._ITEM_SHORTCUT, item_id)
response = self._download_json(
- '%s/api/videos/%s%s' % (self._API_BASE, item, item_id), item_id,
- 'Downloading %s playlist JSON' % ITEMS[item])
+ '%s/api/videos/%s%s' % (self._API_BASE, self._ITEM_SHORTCUT, item_id), item_id,
+ 'Downloading %s playlist JSON' % self._ITEM_TYPE)
entries = []
chunks = response['chunks']
qualities = list(chunks.keys())
@@ -110,78 +126,257 @@ class TwitchIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- if mobj.group('chapterid'):
- return self._extract_media('c', mobj.group('chapterid'))
+ return self._extract_media(self._match_id(url))
+
+
+class TwitchVideoIE(TwitchItemBaseIE):
+ IE_NAME = 'twitch:video'
+ _VALID_URL = r'%s/[^/]+/b/(?P<id>[^/]+)' % TwitchBaseIE._VALID_URL_BASE
+ _ITEM_TYPE = 'video'
+ _ITEM_SHORTCUT = 'a'
+
+ _TEST = {
+ 'url': 'http://www.twitch.tv/riotgames/b/577357806',
+ 'info_dict': {
+ 'id': 'a577357806',
+ 'title': 'Worlds Semifinals - Star Horn Royal Club vs. OMG',
+ },
+ 'playlist_mincount': 12,
+ }
+
+
+class TwitchChapterIE(TwitchItemBaseIE):
+ IE_NAME = 'twitch:chapter'
+ _VALID_URL = r'%s/[^/]+/c/(?P<id>[^/]+)' % TwitchBaseIE._VALID_URL_BASE
+ _ITEM_TYPE = 'chapter'
+ _ITEM_SHORTCUT = 'c'
+
+ _TESTS = [{
+ 'url': 'http://www.twitch.tv/acracingleague/c/5285812',
+ 'info_dict': {
+ 'id': 'c5285812',
+ 'title': 'ACRL Off Season - Sports Cars @ Nordschleife',
+ },
+ 'playlist_mincount': 3,
+ }, {
+ 'url': 'http://www.twitch.tv/tsm_theoddone/c/2349361',
+ 'only_matching': True,
+ }]
+
+
+class TwitchVodIE(TwitchItemBaseIE):
+ IE_NAME = 'twitch:vod'
+ _VALID_URL = r'%s/[^/]+/v/(?P<id>[^/]+)' % TwitchBaseIE._VALID_URL_BASE
+ _ITEM_TYPE = 'vod'
+ _ITEM_SHORTCUT = 'v'
+
+ _TEST = {
+ 'url': 'http://www.twitch.tv/ksptv/v/3622000',
+ 'info_dict': {
+ 'id': 'v3622000',
+ 'ext': 'mp4',
+ 'title': '''KSPTV: Squadcast: "Everyone's on vacation so here's Dahud" Edition!''',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 6951,
+ 'timestamp': 1419028564,
+ 'upload_date': '20141219',
+ 'uploader': 'KSPTV',
+ 'uploader_id': 'ksptv',
+ 'view_count': int,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ item_id = self._match_id(url)
+ info = self._download_info(self._ITEM_SHORTCUT, item_id)
+ access_token = self._download_json(
+ '%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id,
+ 'Downloading %s access token' % self._ITEM_TYPE)
+ formats = self._extract_m3u8_formats(
+ '%s/vod/%s?nauth=%s&nauthsig=%s'
+ % (self._USHER_BASE, item_id, access_token['token'], access_token['sig']),
+ item_id, 'mp4')
+ info['formats'] = formats
+ return info
+
+
+class TwitchPlaylistBaseIE(TwitchBaseIE):
+ _PLAYLIST_URL = '%s/kraken/channels/%%s/videos/?offset=%%d&limit=%%d' % TwitchBaseIE._API_BASE
+ _PAGE_LIMIT = 100
+
+ def _extract_playlist(self, channel_id):
+ info = self._download_json(
+ '%s/kraken/channels/%s' % (self._API_BASE, channel_id),
+ channel_id, 'Downloading channel info JSON')
+ channel_name = info.get('display_name') or info.get('name')
+ entries = []
+ offset = 0
+ limit = self._PAGE_LIMIT
+ for counter in itertools.count(1):
+ response = self._download_json(
+ self._PLAYLIST_URL % (channel_id, offset, limit),
+ channel_id, 'Downloading %s videos JSON page %d' % (self._PLAYLIST_TYPE, counter))
+ page_entries = self._extract_playlist_page(response)
+ if not page_entries:
+ break
+ entries.extend(page_entries)
+ offset += limit
+ return self.playlist_result(
+ [self.url_result(entry) for entry in set(entries)],
+ channel_id, channel_name)
+
+ def _extract_playlist_page(self, response):
+ videos = response.get('videos')
+ return [video['url'] for video in videos] if videos else []
+
+ def _real_extract(self, url):
+ return self._extract_playlist(self._match_id(url))
+
+
+class TwitchProfileIE(TwitchPlaylistBaseIE):
+ IE_NAME = 'twitch:profile'
+ _VALID_URL = r'%s/(?P<id>[^/]+)/profile/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE
+ _PLAYLIST_TYPE = 'profile'
+
+ _TEST = {
+ 'url': 'http://www.twitch.tv/vanillatv/profile',
+ 'info_dict': {
+ 'id': 'vanillatv',
+ 'title': 'VanillaTV',
+ },
+ 'playlist_mincount': 412,
+ }
+
+
+class TwitchPastBroadcastsIE(TwitchPlaylistBaseIE):
+ IE_NAME = 'twitch:past_broadcasts'
+ _VALID_URL = r'%s/(?P<id>[^/]+)/profile/past_broadcasts/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE
+ _PLAYLIST_URL = TwitchPlaylistBaseIE._PLAYLIST_URL + '&broadcasts=true'
+ _PLAYLIST_TYPE = 'past broadcasts'
+
+ _TEST = {
+ 'url': 'http://www.twitch.tv/spamfish/profile/past_broadcasts',
+ 'info_dict': {
+ 'id': 'spamfish',
+ 'title': 'Spamfish',
+ },
+ 'playlist_mincount': 54,
+ }
+
+
+class TwitchBookmarksIE(TwitchPlaylistBaseIE):
+ IE_NAME = 'twitch:bookmarks'
+ _VALID_URL = r'%s/(?P<id>[^/]+)/profile/bookmarks/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE
+ _PLAYLIST_URL = '%s/api/bookmark/?user=%%s&offset=%%d&limit=%%d' % TwitchBaseIE._API_BASE
+ _PLAYLIST_TYPE = 'bookmarks'
+
+ _TEST = {
+ 'url': 'http://www.twitch.tv/ognos/profile/bookmarks',
+ 'info_dict': {
+ 'id': 'ognos',
+ 'title': 'Ognos',
+ },
+ 'playlist_mincount': 3,
+ }
+
+ def _extract_playlist_page(self, response):
+ entries = []
+ for bookmark in response.get('bookmarks', []):
+ video = bookmark.get('video')
+ if not video:
+ continue
+ entries.append(video['url'])
+ return entries
+
+
+class TwitchStreamIE(TwitchBaseIE):
+ IE_NAME = 'twitch:stream'
+ _VALID_URL = r'%s/(?P<id>[^/]+)/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE
+
+ _TEST = {
+ 'url': 'http://www.twitch.tv/shroomztv',
+ 'info_dict': {
+ 'id': '12772022048',
+ 'display_id': 'shroomztv',
+ 'ext': 'mp4',
+ 'title': 're:^ShroomzTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'description': 'H1Z1 - lonewolfing with ShroomzTV | A3 Battle Royale later - @ShroomzTV',
+ 'is_live': True,
+ 'timestamp': 1421928037,
+ 'upload_date': '20150122',
+ 'uploader': 'ShroomzTV',
+ 'uploader_id': 'shroomztv',
+ 'view_count': int,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ channel_id = self._match_id(url)
+
+ stream = self._download_json(
+ '%s/kraken/streams/%s' % (self._API_BASE, channel_id), channel_id,
+ 'Downloading stream JSON').get('stream')
+
+ # Fallback on profile extraction if stream is offline
+ if not stream:
+ return self.url_result(
+ 'http://www.twitch.tv/%s/profile' % channel_id,
+ 'TwitchProfile', channel_id)
+
+ access_token = self._download_json(
+ '%s/api/channels/%s/access_token' % (self._API_BASE, channel_id), channel_id,
+ 'Downloading channel access token')
+
+ query = {
+ 'allow_source': 'true',
+ 'p': random.randint(1000000, 10000000),
+ 'player': 'twitchweb',
+ 'segment_preference': '4',
+ 'sig': access_token['sig'],
+ 'token': access_token['token'],
+ }
+
+ formats = self._extract_m3u8_formats(
+ '%s/api/channel/hls/%s.m3u8?%s'
+ % (self._USHER_BASE, channel_id, compat_urllib_parse.urlencode(query).encode('utf-8')),
+ channel_id, 'mp4')
+
+ view_count = stream.get('viewers')
+ timestamp = parse_iso8601(stream.get('created_at'))
+
+ channel = stream['channel']
+ title = self._live_title(channel.get('display_name') or channel.get('name'))
+ description = channel.get('status')
- """
- webpage = self._download_webpage(url, chapter_id)
- m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
+ thumbnails = []
+ for thumbnail_key, thumbnail_url in stream['preview'].items():
+ m = re.search(r'(?P<width>\d+)x(?P<height>\d+)\.jpg$', thumbnail_key)
if not m:
- raise ExtractorError('Cannot find archive of a chapter')
- archive_id = m.group(1)
-
- api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
- doc = self._download_xml(
- api, chapter_id,
- note='Downloading chapter information',
- errnote='Chapter information download failed')
- for a in doc.findall('.//archive'):
- if archive_id == a.find('./id').text:
- break
- else:
- raise ExtractorError('Could not find chapter in chapter information')
-
- video_url = a.find('./video_file_url').text
- video_ext = video_url.rpartition('.')[2] or 'flv'
-
- chapter_api_url = 'https://api.twitch.tv/kraken/videos/c' + chapter_id
- chapter_info = self._download_json(
- chapter_api_url, 'c' + chapter_id,
- note='Downloading chapter metadata',
- errnote='Download of chapter metadata failed')
-
- bracket_start = int(doc.find('.//bracket_start').text)
- bracket_end = int(doc.find('.//bracket_end').text)
-
- # TODO determine start (and probably fix up file)
- # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
- #video_url += '?start=' + TODO:start_timestamp
- # bracket_start is 13290, but we want 51670615
- self._downloader.report_warning('Chapter detected, but we can just download the whole file. '
- 'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
-
- info = {
- 'id': 'c' + chapter_id,
- 'url': video_url,
- 'ext': video_ext,
- 'title': chapter_info['title'],
- 'thumbnail': chapter_info['preview'],
- 'description': chapter_info['description'],
- 'uploader': chapter_info['channel']['display_name'],
- 'uploader_id': chapter_info['channel']['name'],
- }
- return info
- """
- elif mobj.group('videoid'):
- return self._extract_media('a', mobj.group('videoid'))
- elif mobj.group('channelid'):
- channel_id = mobj.group('channelid')
- info = self._download_json(
- '%s/kraken/channels/%s' % (self._API_BASE, channel_id),
- channel_id, 'Downloading channel info JSON')
- channel_name = info.get('display_name') or info.get('name')
- entries = []
- offset = 0
- limit = self._PAGE_LIMIT
- for counter in itertools.count(1):
- response = self._download_json(
- '%s/kraken/channels/%s/videos/?offset=%d&limit=%d'
- % (self._API_BASE, channel_id, offset, limit),
- channel_id, 'Downloading channel videos JSON page %d' % counter)
- videos = response['videos']
- if not videos:
- break
- entries.extend([self.url_result(video['url'], 'Twitch') for video in videos])
- offset += limit
- return self.playlist_result(entries, channel_id, channel_name)
+ continue
+ thumbnails.append({
+ 'url': thumbnail_url,
+ 'width': int(m.group('width')),
+ 'height': int(m.group('height')),
+ })
+
+ return {
+ 'id': compat_str(stream['_id']),
+ 'display_id': channel_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnails': thumbnails,
+ 'uploader': channel.get('display_name'),
+ 'uploader_id': channel.get('name'),
+ 'timestamp': timestamp,
+ 'view_count': view_count,
+ 'formats': formats,
+ 'is_live': True,
+ }
diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py
index 054f42725..4667ed83b 100644
--- a/youtube_dl/extractor/udemy.py
+++ b/youtube_dl/extractor/udemy.py
@@ -3,9 +3,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
+)
+from ..utils import (
ExtractorError,
)
@@ -40,8 +42,24 @@ class UdemyIE(InfoExtractor):
error_str += ' - %s' % error_data.get('formErrors')
raise ExtractorError(error_str, expected=True)
- def _download_json(self, url, video_id, note='Downloading JSON metadata'):
- response = super(UdemyIE, self)._download_json(url, video_id, note)
+ def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata'):
+ headers = {
+ 'X-Udemy-Snail-Case': 'true',
+ 'X-Requested-With': 'XMLHttpRequest',
+ }
+ for cookie in self._downloader.cookiejar:
+ if cookie.name == 'client_id':
+ headers['X-Udemy-Client-Id'] = cookie.value
+ elif cookie.name == 'access_token':
+ headers['X-Udemy-Bearer-Token'] = cookie.value
+
+ if isinstance(url_or_request, compat_urllib_request.Request):
+ for header, value in headers.items():
+ url_or_request.add_header(header, value)
+ else:
+ url_or_request = compat_urllib_request.Request(url_or_request, headers=headers)
+
+ response = super(UdemyIE, self)._download_json(url_or_request, video_id, note)
self._handle_error(response)
return response
@@ -62,7 +80,9 @@ class UdemyIE(InfoExtractor):
if login_popup == '<div class="run-command close-popup redirect" data-url="https://www.udemy.com/"></div>':
return
- csrf = self._html_search_regex(r'<input type="hidden" name="csrf" value="(.+?)"', login_popup, 'csrf token')
+ csrf = self._html_search_regex(
+ r'<input type="hidden" name="csrf" value="(.+?)"',
+ login_popup, 'csrf token')
login_form = {
'email': username,
@@ -71,42 +91,49 @@ class UdemyIE(InfoExtractor):
'displayType': 'json',
'isSubmitted': '1',
}
- request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
- response = self._download_json(request, None, 'Logging in as %s' % username)
+ request = compat_urllib_request.Request(
+ self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
+ response = self._download_json(
+ request, None, 'Logging in as %s' % username)
if 'returnUrl' not in response:
raise ExtractorError('Unable to log in')
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- lecture_id = mobj.group('id')
+ lecture_id = self._match_id(url)
lecture = self._download_json(
- 'https://www.udemy.com/api-1.1/lectures/%s' % lecture_id, lecture_id, 'Downloading lecture JSON')
+ 'https://www.udemy.com/api-1.1/lectures/%s' % lecture_id,
+ lecture_id, 'Downloading lecture JSON')
- if lecture['assetType'] != 'Video':
- raise ExtractorError('Lecture %s is not a video' % lecture_id, expected=True)
+ asset_type = lecture.get('assetType') or lecture.get('asset_type')
+ if asset_type != 'Video':
+ raise ExtractorError(
+ 'Lecture %s is not a video' % lecture_id, expected=True)
asset = lecture['asset']
- stream_url = asset['streamUrl']
+ stream_url = asset.get('streamUrl') or asset.get('stream_url')
mobj = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', stream_url)
if mobj:
return self.url_result(mobj.group(1), 'Youtube')
video_id = asset['id']
- thumbnail = asset['thumbnailUrl']
+ thumbnail = asset.get('thumbnailUrl') or asset.get('thumbnail_url')
duration = asset['data']['duration']
- download_url = asset['downloadUrl']
+ download_url = asset.get('downloadUrl') or asset.get('download_url')
+
+ video = download_url.get('Video') or download_url.get('video')
+ video_480p = download_url.get('Video480p') or download_url.get('video_480p')
formats = [
{
- 'url': download_url['Video480p'][0],
+ 'url': video_480p[0],
'format_id': '360p',
},
{
- 'url': download_url['Video'][0],
+ 'url': video[0],
'format_id': '720p',
},
]
@@ -140,25 +167,29 @@ class UdemyCourseIE(UdemyIE):
course_path = mobj.group('coursepath')
response = self._download_json(
- 'https://www.udemy.com/api-1.1/courses/%s' % course_path, course_path, 'Downloading course JSON')
+ 'https://www.udemy.com/api-1.1/courses/%s' % course_path,
+ course_path, 'Downloading course JSON')
course_id = int(response['id'])
course_title = response['title']
webpage = self._download_webpage(
- 'https://www.udemy.com/course/subscribe/?courseId=%s' % course_id, course_id, 'Enrolling in the course')
+ 'https://www.udemy.com/course/subscribe/?courseId=%s' % course_id,
+ course_id, 'Enrolling in the course')
if self._SUCCESSFULLY_ENROLLED in webpage:
self.to_screen('%s: Successfully enrolled in' % course_id)
elif self._ALREADY_ENROLLED in webpage:
self.to_screen('%s: Already enrolled in' % course_id)
- response = self._download_json('https://www.udemy.com/api-1.1/courses/%s/curriculum' % course_id,
+ response = self._download_json(
+ 'https://www.udemy.com/api-1.1/courses/%s/curriculum' % course_id,
course_id, 'Downloading course curriculum')
entries = [
- self.url_result('https://www.udemy.com/%s/#/lecture/%s' % (course_path, asset['id']), 'Udemy')
- for asset in response if asset.get('assetType') == 'Video'
+ self.url_result(
+ 'https://www.udemy.com/%s/#/lecture/%s' % (course_path, asset['id']), 'Udemy')
+ for asset in response if (asset.get('assetType') or asset.get('asset_type')) == 'Video'  # parenthesised: `==` binds tighter than `or`
]
- return self.playlist_result(entries, course_id, course_title) \ No newline at end of file
+ return self.playlist_result(entries, course_id, course_title)
diff --git a/youtube_dl/extractor/urort.py b/youtube_dl/extractor/urort.py
index 5d06fcc9e..8872cfcb2 100644
--- a/youtube_dl/extractor/urort.py
+++ b/youtube_dl/extractor/urort.py
@@ -1,11 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
+)
+from ..utils import (
unified_strdate,
)
@@ -18,11 +18,10 @@ class UrortIE(InfoExtractor):
'url': 'https://urort.p3.no/#!/Band/Gerilja',
'md5': '5ed31a924be8a05e47812678a86e127b',
'info_dict': {
- 'id': '33124-4',
+ 'id': '33124-24',
'ext': 'mp3',
'title': 'The Bomb',
'thumbnail': 're:^https?://.+\.jpg',
- 'like_count': int,
'uploader': 'Gerilja',
'uploader_id': 'Gerilja',
'upload_date': '20100323',
@@ -33,25 +32,31 @@ class UrortIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- playlist_id = mobj.group('id')
+ playlist_id = self._match_id(url)
fstr = compat_urllib_parse.quote("InternalBandUrl eq '%s'" % playlist_id)
- json_url = 'http://urort.p3.no/breeze/urort/TrackDtos?$filter=' + fstr
+ json_url = 'http://urort.p3.no/breeze/urort/TrackDTOViews?$filter=%s&$orderby=Released%%20desc&$expand=Tags%%2CFiles' % fstr
songs = self._download_json(json_url, playlist_id)
- print(songs[0])
-
- entries = [{
- 'id': '%d-%s' % (s['BandId'], s['$id']),
- 'title': s['Title'],
- 'url': s['TrackUrl'],
- 'ext': 'mp3',
- 'uploader_id': playlist_id,
- 'uploader': s.get('BandName', playlist_id),
- 'like_count': s.get('LikeCount'),
- 'thumbnail': 'http://urort.p3.no/cloud/images/%s' % s['Image'],
- 'upload_date': unified_strdate(s.get('Released')),
- } for s in songs]
+ entries = []
+ for s in songs:
+ formats = [{
+ 'tbr': f.get('Quality'),
+ 'ext': f['FileType'],
+ 'format_id': '%s-%s' % (f['FileType'], f.get('Quality', '')),
+ 'url': 'http://p3urort.blob.core.windows.net/tracks/%s' % f['FileRef'],
+ 'preference': 3 if f['FileType'] == 'mp3' else 2,
+ } for f in s['Files']]
+ self._sort_formats(formats)
+ e = {
+ 'id': '%d-%s' % (s['BandId'], s['$id']),
+ 'title': s['Title'],
+ 'uploader_id': playlist_id,
+ 'uploader': s.get('BandName', playlist_id),
+ 'thumbnail': 'http://urort.p3.no/cloud/images/%s' % s['Image'],
+ 'upload_date': unified_strdate(s.get('Released')),
+ 'formats': formats,
+ }
+ entries.append(e)
return {
'_type': 'playlist',
diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py
index 994b60a76..68d03b999 100644
--- a/youtube_dl/extractor/ustream.py
+++ b/youtube_dl/extractor/ustream.py
@@ -3,9 +3,8 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urlparse,
- get_meta_content,
)
@@ -46,13 +45,13 @@ class UstreamIE(InfoExtractor):
self.report_extraction(video_id)
video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
- webpage, 'title')
+ webpage, 'title')
uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
- webpage, 'uploader', fatal=False, flags=re.DOTALL)
+ webpage, 'uploader', fatal=False, flags=re.DOTALL)
thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
- webpage, 'thumbnail', fatal=False)
+ webpage, 'thumbnail', fatal=False)
return {
'id': video_id,
@@ -72,14 +71,14 @@ class UstreamChannelIE(InfoExtractor):
'info_dict': {
'id': '10874166',
},
- 'playlist_mincount': 54,
+ 'playlist_mincount': 17,
}
def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
display_id = m.group('slug')
webpage = self._download_webpage(url, display_id)
- channel_id = get_meta_content('ustream:channel_id', webpage)
+ channel_id = self._html_search_meta('ustream:channel_id', webpage)
BASE = 'http://www.ustream.tv'
next_url = '/ajax/socialstream/videos/%s/1.json' % channel_id
diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py
index ebd64f0f5..dd026748d 100644
--- a/youtube_dl/extractor/vbox7.py
+++ b/youtube_dl/extractor/vbox7.py
@@ -1,19 +1,18 @@
# encoding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
-
+)
+from ..utils import (
ExtractorError,
)
class Vbox7IE(InfoExtractor):
- _VALID_URL = r'http://(www\.)?vbox7\.com/play:(?P<id>[^/]+)'
+ _VALID_URL = r'http://(?:www\.)?vbox7\.com/play:(?P<id>[^/]+)'
_TEST = {
'url': 'http://vbox7.com/play:249bb972c2',
'md5': '99f65c0c9ef9b682b97313e052734c3f',
@@ -25,18 +24,17 @@ class Vbox7IE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
redirect_page, urlh = self._download_webpage_handle(url, video_id)
new_location = self._search_regex(r'window\.location = \'(.*)\';',
- redirect_page, 'redirect location')
+ redirect_page, 'redirect location')
redirect_url = urlh.geturl() + new_location
webpage = self._download_webpage(redirect_url, video_id,
- 'Downloading redirect page')
+ 'Downloading redirect page')
title = self._html_search_regex(r'<title>(.*)</title>',
- webpage, 'title').split('/')[0].strip()
+ webpage, 'title').split('/')[0].strip()
info_url = "http://vbox7.com/play/magare.do"
data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
diff --git a/youtube_dl/extractor/veehd.py b/youtube_dl/extractor/veehd.py
index 77b1f91ce..96353f525 100644
--- a/youtube_dl/extractor/veehd.py
+++ b/youtube_dl/extractor/veehd.py
@@ -4,10 +4,13 @@ import re
import json
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urlparse,
- get_element_by_id,
+)
+from ..utils import (
+ ExtractorError,
clean_html,
+ get_element_by_id,
)
@@ -15,24 +18,27 @@ class VeeHDIE(InfoExtractor):
_VALID_URL = r'https?://veehd\.com/video/(?P<id>\d+)'
_TEST = {
- 'url': 'http://veehd.com/video/4686958',
+ 'url': 'http://veehd.com/video/4639434_Solar-Sinter',
'info_dict': {
- 'id': '4686958',
+ 'id': '4639434',
'ext': 'mp4',
- 'title': 'Time Lapse View from Space ( ISS)',
- 'uploader_id': 'spotted',
- 'description': 'md5:f0094c4cf3a72e22bc4e4239ef767ad7',
+ 'title': 'Solar Sinter',
+ 'uploader_id': 'VideoEyes',
+ 'description': 'md5:46a840e8692ddbaffb5f81d9885cb457',
},
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
# VeeHD seems to send garbage on the first request.
# See https://github.com/rg3/youtube-dl/issues/2102
self._download_webpage(url, video_id, 'Requesting webpage')
webpage = self._download_webpage(url, video_id)
+
+ if 'This video has been removed<' in webpage:
+ raise ExtractorError('Video %s has been removed' % video_id, expected=True)
+
player_path = self._search_regex(
r'\$\("#playeriframe"\).attr\({src : "(.+?)"',
webpage, 'player path')
@@ -41,17 +47,34 @@ class VeeHDIE(InfoExtractor):
self._download_webpage(player_url, video_id, 'Requesting player page')
player_page = self._download_webpage(
player_url, video_id, 'Downloading player page')
+
config_json = self._search_regex(
- r'value=\'config=({.+?})\'', player_page, 'config json')
- config = json.loads(config_json)
+ r'value=\'config=({.+?})\'', player_page, 'config json', default=None)
+
+ if config_json:
+ config = json.loads(config_json)
+ video_url = compat_urlparse.unquote(config['clip']['url'])
+ else:
+ iframe_src = self._search_regex(
+ r'<iframe[^>]+src="/?([^"]+)"', player_page, 'iframe url')
+ iframe_url = 'http://veehd.com/%s' % iframe_src
+
+ self._download_webpage(iframe_url, video_id, 'Requesting iframe page')
+ iframe_page = self._download_webpage(
+ iframe_url, video_id, 'Downloading iframe page')
+
+ video_url = self._search_regex(
+ r"file\s*:\s*'([^']+)'", iframe_page, 'video url')
- video_url = compat_urlparse.unquote(config['clip']['url'])
title = clean_html(get_element_by_id('videoName', webpage).rpartition('|')[0])
- uploader_id = self._html_search_regex(r'<a href="/profile/\d+">(.+?)</a>',
+ uploader_id = self._html_search_regex(
+ r'<a href="/profile/\d+">(.+?)</a>',
webpage, 'uploader')
- thumbnail = self._search_regex(r'<img id="veehdpreview" src="(.+?)"',
+ thumbnail = self._search_regex(
+ r'<img id="veehdpreview" src="(.+?)"',
webpage, 'thumbnail')
- description = self._html_search_regex(r'<td class="infodropdown".*?<div>(.*?)<ul',
+ description = self._html_search_regex(
+ r'<td class="infodropdown".*?<div>(.*?)<ul',
webpage, 'description', flags=re.DOTALL)
return {
diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py
index a7953a7e7..01e258e32 100644
--- a/youtube_dl/extractor/veoh.py
+++ b/youtube_dl/extractor/veoh.py
@@ -4,8 +4,10 @@ import re
import json
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_request,
+)
+from ..utils import (
int_or_none,
ExtractorError,
)
diff --git a/youtube_dl/extractor/vesti.py b/youtube_dl/extractor/vesti.py
index 27f9acb67..a0c59a2e0 100644
--- a/youtube_dl/extractor/vesti.py
+++ b/youtube_dl/extractor/vesti.py
@@ -112,10 +112,10 @@ class VestiIE(InfoExtractor):
if mobj:
video_id = mobj.group('id')
page = self._download_webpage('http://www.vesti.ru/only_video.html?vid=%s' % video_id, video_id,
- 'Downloading video page')
+ 'Downloading video page')
rutv_url = RUTVIE._extract_url(page)
if rutv_url:
return self.url_result(rutv_url, 'RUTV')
- raise ExtractorError('No video found', expected=True) \ No newline at end of file
+ raise ExtractorError('No video found', expected=True)
diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py
index 5b1a3ec78..43f6b029d 100644
--- a/youtube_dl/extractor/vevo.py
+++ b/youtube_dl/extractor/vevo.py
@@ -4,8 +4,10 @@ import re
import xml.etree.ElementTree
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_request,
+)
+from ..utils import (
ExtractorError,
)
@@ -13,7 +15,7 @@ from ..utils import (
class VevoIE(InfoExtractor):
"""
Accepts urls from vevo.com or in the format 'vevo:{id}'
- (currently used by MTVIE)
+ (currently used by MTVIE and MySpaceIE)
"""
_VALID_URL = r'''(?x)
(?:https?://www\.vevo\.com/watch/(?:[^/]+/(?:[^/]+/)?)?|
diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py
index 964470070..2f111bf7e 100644
--- a/youtube_dl/extractor/vgtv.py
+++ b/youtube_dl/extractor/vgtv.py
@@ -17,7 +17,7 @@ class VGTVIE(InfoExtractor):
'info_dict': {
'id': '84196',
'ext': 'mp4',
- 'title': 'Hevnen er søt episode 10: Abu',
+ 'title': 'Hevnen er søt: Episode 10 - Abu',
'description': 'md5:e25e4badb5f544b04341e14abdc72234',
'thumbnail': 're:^https?://.*\.jpg',
'duration': 648.000,
@@ -35,7 +35,7 @@ class VGTVIE(InfoExtractor):
'title': 'OPPTAK: VGTV følger EM-kvalifiseringen',
'description': 'md5:3772d9c0dc2dff92a886b60039a7d4d3',
'thumbnail': 're:^https?://.*\.jpg',
- 'duration': 9056.000,
+ 'duration': 9103.0,
'timestamp': 1410113864,
'upload_date': '20140907',
'view_count': int,
@@ -67,9 +67,7 @@ class VGTVIE(InfoExtractor):
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
data = self._download_json(
'http://svp.vg.no/svp/api/v1/vgtv/assets/%s?appName=vgtv-website' % video_id,
video_id, 'Downloading media JSON')
@@ -116,4 +114,4 @@ class VGTVIE(InfoExtractor):
'duration': float_or_none(data['duration'], 1000),
'view_count': data['displays'],
'formats': formats,
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/vh1.py b/youtube_dl/extractor/vh1.py
index 2f77e3898..6be3774b7 100644
--- a/youtube_dl/extractor/vh1.py
+++ b/youtube_dl/extractor/vh1.py
@@ -121,4 +121,7 @@ class VH1IE(MTVIE):
idoc = self._download_xml(
doc_url, video_id,
'Downloading info', transform_source=fix_xml_ampersands)
- return [self._get_video_info(item) for item in idoc.findall('.//item')]
+ return self.playlist_result(
+ [self._get_video_info(item) for item in idoc.findall('.//item')],
+ playlist_id=video_id,
+ )
diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py
new file mode 100644
index 000000000..71f520fb5
--- /dev/null
+++ b/youtube_dl/extractor/vice.py
@@ -0,0 +1,37 @@
+from __future__ import unicode_literals
+import re
+
+from .common import InfoExtractor
+from .ooyala import OoyalaIE
+from ..utils import ExtractorError
+
+
+class ViceIE(InfoExtractor):
+    """Extractor for vice.com pages whose video is served by an Ooyala embed."""
+    _VALID_URL = r'http://www\.vice\.com/.*?/(?P<name>.+)'
+
+    _TEST = {
+        'url': 'http://www.vice.com/Fringes/cowboy-capitalists-part-1',
+        'info_dict': {
+            'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp',
+            'ext': 'mp4',
+            'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
+        },
+        'params': {
+            # Requires ffmpeg (m3u8 manifest)
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        """Find the Ooyala embed code in the page and delegate to the Ooyala extractor."""
+        name = re.match(self._VALID_URL, url).group('name')
+        webpage = self._download_webpage(url, name)
+        # A page without an embed code is reported as an expected, user-facing error.
+        try:
+            embed_code = self._search_regex(
+                r'embedCode=([^&\'"]+)', webpage, 'ooyala embed code')
+            ooyala_url = OoyalaIE._url_for_embed_code(embed_code)
+        except ExtractorError:
+            raise ExtractorError('The page doesn\'t contain a video', expected=True)
+        return self.url_result(ooyala_url, ie='Ooyala')
diff --git a/youtube_dl/extractor/viddler.py b/youtube_dl/extractor/viddler.py
index 9328ef4a2..0faa729c6 100644
--- a/youtube_dl/extractor/viddler.py
+++ b/youtube_dl/extractor/viddler.py
@@ -1,55 +1,85 @@
-import json
-import re
+from __future__ import unicode_literals
from .common import InfoExtractor
+from ..utils import (
+ float_or_none,
+ int_or_none,
+)
class ViddlerIE(InfoExtractor):
- _VALID_URL = r'(?P<domain>https?://(?:www\.)?viddler\.com)/(?:v|embed|player)/(?P<id>[a-z0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?viddler\.com/(?:v|embed|player)/(?P<id>[a-z0-9]+)'
_TEST = {
- u"url": u"http://www.viddler.com/v/43903784",
- u'file': u'43903784.mp4',
- u'md5': u'fbbaedf7813e514eb7ca30410f439ac9',
- u'info_dict': {
- u"title": u"Video Made Easy",
- u"uploader": u"viddler",
- u"duration": 100.89,
+ "url": "http://www.viddler.com/v/43903784",
+ 'md5': 'ae43ad7cb59431ce043f0ff7fa13cbf4',
+ 'info_dict': {
+ 'id': '43903784',
+ 'ext': 'mp4',
+ "title": "Video Made Easy",
+ 'description': 'You don\'t need to be a professional to make high-quality video content. Viddler provides some quick and easy tips on how to produce great video content with limited resources. ',
+ "uploader": "viddler",
+ 'timestamp': 1335371429,
+ 'upload_date': '20120425',
+ "duration": 100.89,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'view_count': int,
+ 'categories': ['video content', 'high quality video', 'video made easy', 'how to produce video with limited resources', 'viddler'],
}
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
- embed_url = mobj.group('domain') + u'/embed/' + video_id
- webpage = self._download_webpage(embed_url, video_id)
-
- video_sources_code = self._search_regex(
- r"(?ms)sources\s*:\s*(\{.*?\})", webpage, u'video URLs')
- video_sources = json.loads(video_sources_code.replace("'", '"'))
-
- formats = [{
- 'url': video_url,
- 'format': format_id,
- } for video_url, format_id in video_sources.items()]
-
- title = self._html_search_regex(
- r"title\s*:\s*'([^']*)'", webpage, u'title')
- uploader = self._html_search_regex(
- r"authorName\s*:\s*'([^']*)'", webpage, u'uploader', fatal=False)
- duration_s = self._html_search_regex(
- r"duration\s*:\s*([0-9.]*)", webpage, u'duration', fatal=False)
- duration = float(duration_s) if duration_s else None
- thumbnail = self._html_search_regex(
- r"thumbnail\s*:\s*'([^']*)'",
- webpage, u'thumbnail', fatal=False)
+ video_id = self._match_id(url)
+
+ json_url = (
+ 'http://api.viddler.com/api/v2/viddler.videos.getPlaybackDetails.json?video_id=%s&key=v0vhrt7bg2xq1vyxhkct' %
+ video_id)
+ data = self._download_json(json_url, video_id)['video']
+
+ formats = []
+ for filed in data['files']:
+ if filed.get('status', 'ready') != 'ready':
+ continue
+ f = {
+ 'format_id': filed['profile_id'],
+ 'format_note': filed['profile_name'],
+ 'url': self._proto_relative_url(filed['url']),
+ 'width': int_or_none(filed.get('width')),
+ 'height': int_or_none(filed.get('height')),
+ 'filesize': int_or_none(filed.get('size')),
+ 'ext': filed.get('ext'),
+ 'source_preference': -1,
+ }
+ formats.append(f)
+
+ if filed.get('cdn_url'):
+ f = f.copy()
+ f['url'] = self._proto_relative_url(filed['cdn_url'])
+ f['format_id'] = filed['profile_id'] + '-cdn'
+ f['source_preference'] = 1
+ formats.append(f)
+
+ if filed.get('html5_video_source'):
+ f = f.copy()
+ f['url'] = self._proto_relative_url(
+ filed['html5_video_source'])
+ f['format_id'] = filed['profile_id'] + '-html5'
+ f['source_preference'] = 0
+ formats.append(f)
+ self._sort_formats(formats)
+
+ categories = [
+ t.get('text') for t in data.get('tags', []) if 'text' in t]
return {
'_type': 'video',
'id': video_id,
- 'title': title,
- 'thumbnail': thumbnail,
- 'uploader': uploader,
- 'duration': duration,
+ 'title': data['title'],
'formats': formats,
+ 'description': data.get('description'),
+ 'timestamp': int_or_none(data.get('upload_time')),
+ 'thumbnail': self._proto_relative_url(data.get('thumbnail_url')),
+ 'uploader': data.get('author'),
+ 'duration': float_or_none(data.get('length')),
+ 'view_count': int_or_none(data.get('view_count')),
+ 'categories': categories,
}
diff --git a/youtube_dl/extractor/videobam.py b/youtube_dl/extractor/videobam.py
index fed95ef71..0eb3d9414 100644
--- a/youtube_dl/extractor/videobam.py
+++ b/youtube_dl/extractor/videobam.py
@@ -78,4 +78,4 @@ class VideoBamIE(InfoExtractor):
'view_count': view_count,
'formats': formats,
'age_limit': 18,
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/videodetective.py b/youtube_dl/extractor/videodetective.py
index ac6c25537..0ffc7ff7d 100644
--- a/youtube_dl/extractor/videodetective.py
+++ b/youtube_dl/extractor/videodetective.py
@@ -1,10 +1,8 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
+from ..compat import compat_urlparse
from .internetvideoarchive import InternetVideoArchiveIE
-from ..utils import compat_urlparse
class VideoDetectiveIE(InfoExtractor):
@@ -17,13 +15,12 @@ class VideoDetectiveIE(InfoExtractor):
'ext': 'mp4',
'title': 'KICK-ASS 2',
'description': 'md5:65ba37ad619165afac7d432eaded6013',
- 'duration': 135,
+ 'duration': 138,
},
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
og_video = self._og_search_video_url(webpage)
query = compat_urlparse.urlparse(og_video).query
diff --git a/youtube_dl/extractor/videofyme.py b/youtube_dl/extractor/videofyme.py
index f75169041..94f9e9be9 100644
--- a/youtube_dl/extractor/videofyme.py
+++ b/youtube_dl/extractor/videofyme.py
@@ -1,46 +1,50 @@
-import re
+from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
find_xpath_attr,
- determine_ext,
+ int_or_none,
)
+
class VideofyMeIE(InfoExtractor):
- _VALID_URL = r'https?://(www\.videofy\.me/.+?|p\.videofy\.me/v)/(?P<id>\d+)(&|#|$)'
- IE_NAME = u'videofy.me'
+ _VALID_URL = r'https?://(?:www\.videofy\.me/.+?|p\.videofy\.me/v)/(?P<id>\d+)(&|#|$)'
+ IE_NAME = 'videofy.me'
_TEST = {
- u'url': u'http://www.videofy.me/thisisvideofyme/1100701',
- u'file': u'1100701.mp4',
- u'md5': u'c77d700bdc16ae2e9f3c26019bd96143',
- u'info_dict': {
- u'title': u'This is VideofyMe',
- u'description': None,
- u'uploader': u'VideofyMe',
- u'uploader_id': u'thisisvideofyme',
+ 'url': 'http://www.videofy.me/thisisvideofyme/1100701',
+ 'md5': 'c77d700bdc16ae2e9f3c26019bd96143',
+ 'info_dict': {
+ 'id': '1100701',
+ 'ext': 'mp4',
+ 'title': 'This is VideofyMe',
+ 'description': None,
+ 'uploader': 'VideofyMe',
+ 'uploader_id': 'thisisvideofyme',
+ 'view_count': int,
},
-
+
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
config = self._download_xml('http://sunshine.videofy.me/?videoId=%s' % video_id,
- video_id)
+ video_id)
video = config.find('video')
sources = video.find('sources')
- url_node = next(node for node in [find_xpath_attr(sources, 'source', 'id', 'HQ %s' % key)
- for key in ['on', 'av', 'off']] if node is not None)
+ url_node = next(node for node in [find_xpath_attr(sources, 'source', 'id', 'HQ %s' % key)
+ for key in ['on', 'av', 'off']] if node is not None)
video_url = url_node.find('url').text
+ view_count = int_or_none(self._search_regex(
+ r'([0-9]+)', video.find('views').text, 'view count', fatal=False))
- return {'id': video_id,
- 'title': video.find('title').text,
- 'url': video_url,
- 'ext': determine_ext(video_url),
- 'thumbnail': video.find('thumb').text,
- 'description': video.find('description').text,
- 'uploader': config.find('blog/name').text,
- 'uploader_id': video.find('identifier').text,
- 'view_count': re.search(r'\d+', video.find('views').text).group(),
- }
+ return {
+ 'id': video_id,
+ 'title': video.find('title').text,
+ 'url': video_url,
+ 'thumbnail': video.find('thumb').text,
+ 'description': video.find('description').text,
+ 'uploader': config.find('blog/name').text,
+ 'uploader_id': video.find('identifier').text,
+ 'view_count': view_count,
+ }
diff --git a/youtube_dl/extractor/videomega.py b/youtube_dl/extractor/videomega.py
index 29c4e0101..273030316 100644
--- a/youtube_dl/extractor/videomega.py
+++ b/youtube_dl/extractor/videomega.py
@@ -4,8 +4,12 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
+ compat_urllib_request,
+)
+from ..utils import (
+ ExtractorError,
remove_start,
)
@@ -16,36 +20,40 @@ class VideoMegaIE(InfoExtractor):
(?:iframe\.php)?\?ref=(?P<id>[A-Za-z0-9]+)
'''
_TEST = {
- 'url': 'http://videomega.tv/?ref=GKeGPVedBe',
- 'md5': '240fb5bcf9199961f48eb17839b084d6',
+ 'url': 'http://videomega.tv/?ref=QR0HCUHI1661IHUCH0RQ',
+ 'md5': 'bf5c2f95c4c917536e80936af7bc51e1',
'info_dict': {
- 'id': 'GKeGPVedBe',
+ 'id': 'QR0HCUHI1661IHUCH0RQ',
'ext': 'mp4',
- 'title': 'XXL - All Sports United',
+ 'title': 'Big Buck Bunny',
'thumbnail': 're:^https?://.*\.jpg$',
}
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
+
+ iframe_url = 'http://videomega.tv/iframe.php?ref={0:}'.format(video_id)
+ req = compat_urllib_request.Request(iframe_url)
+ req.add_header('Referer', url)
+ webpage = self._download_webpage(req, video_id)
- url = 'http://videomega.tv/iframe.php?ref={0:}'.format(video_id)
- webpage = self._download_webpage(url, video_id)
+ try:
+ escaped_data = re.findall(r'unescape\("([^"]+)"\)', webpage)[-1]
+ except IndexError:
+ raise ExtractorError('Unable to extract escaped data')
- escaped_data = self._search_regex(
- r'unescape\("([^"]+)"\)', webpage, 'escaped data')
playlist = compat_urllib_parse.unquote(escaped_data)
thumbnail = self._search_regex(
r'image:\s*"([^"]+)"', playlist, 'thumbnail', fatal=False)
- url = self._search_regex(r'file:\s*"([^"]+)"', playlist, 'URL')
+ video_url = self._search_regex(r'file:\s*"([^"]+)"', playlist, 'URL')
title = remove_start(self._html_search_regex(
r'<title>(.*?)</title>', webpage, 'title'), 'VideoMega.tv - ')
formats = [{
'format_id': 'sd',
- 'url': url,
+ 'url': video_url,
}]
self._sort_formats(formats)
@@ -54,4 +62,7 @@ class VideoMegaIE(InfoExtractor):
'title': title,
'formats': formats,
'thumbnail': thumbnail,
+ 'http_headers': {
+ 'Referer': iframe_url,
+ },
}
diff --git a/youtube_dl/extractor/videopremium.py b/youtube_dl/extractor/videopremium.py
index 65463c733..3176e3b9d 100644
--- a/youtube_dl/extractor/videopremium.py
+++ b/youtube_dl/extractor/videopremium.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
import re
import random
@@ -5,23 +7,22 @@ from .common import InfoExtractor
class VideoPremiumIE(InfoExtractor):
- _VALID_URL = r'(?:https?://)?(?:www\.)?videopremium\.(?:tv|me)/(?P<id>\w+)(?:/.*)?'
+ _VALID_URL = r'https?://(?:www\.)?videopremium\.(?:tv|me)/(?P<id>\w+)(?:/.*)?'
_TEST = {
- u'url': u'http://videopremium.tv/4w7oadjsf156',
- u'file': u'4w7oadjsf156.f4v',
- u'info_dict': {
- u"title": u"youtube-dl_test_video____a_________-BaW_jenozKc.mp4.mp4"
+ 'url': 'http://videopremium.tv/4w7oadjsf156',
+ 'info_dict': {
+ 'id': '4w7oadjsf156',
+ 'ext': 'f4v',
+ 'title': 'youtube-dl_test_video____a_________-BaW_jenozKc.mp4.mp4'
},
- u'params': {
- u'skip_download': True,
+ 'params': {
+ 'skip_download': True,
},
- u'skip': u'Test file has been deleted.',
+ 'skip': 'Test file has been deleted.',
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
-
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage_url = 'http://videopremium.tv/' + video_id
webpage = self._download_webpage(webpage_url, video_id)
@@ -29,17 +30,17 @@ class VideoPremiumIE(InfoExtractor):
# Download again, we need a cookie
webpage = self._download_webpage(
webpage_url, video_id,
- note=u'Downloading webpage again (with cookie)')
+ note='Downloading webpage again (with cookie)')
video_title = self._html_search_regex(
- r'<h2(?:.*?)>\s*(.+?)\s*<', webpage, u'video title')
+ r'<h2(?:.*?)>\s*(.+?)\s*<', webpage, 'video title')
return {
- 'id': video_id,
- 'url': "rtmp://e%d.md.iplay.md/play" % random.randint(1, 16),
- 'play_path': "mp4:%s.f4v" % video_id,
- 'page_url': "http://videopremium.tv/" + video_id,
- 'player_url': "http://videopremium.tv/uplayer/uppod.swf",
- 'ext': 'f4v',
- 'title': video_title,
+ 'id': video_id,
+ 'url': "rtmp://e%d.md.iplay.md/play" % random.randint(1, 16),
+ 'play_path': "mp4:%s.f4v" % video_id,
+ 'page_url': "http://videopremium.tv/" + video_id,
+ 'player_url': "http://videopremium.tv/uplayer/uppod.swf",
+ 'ext': 'f4v',
+ 'title': video_title,
}
diff --git a/youtube_dl/extractor/videott.py b/youtube_dl/extractor/videott.py
index a647807d0..ececc7ee0 100644
--- a/youtube_dl/extractor/videott.py
+++ b/youtube_dl/extractor/videott.py
@@ -13,9 +13,9 @@ from ..utils import (
class VideoTtIE(InfoExtractor):
ID_NAME = 'video.tt'
IE_DESC = 'video.tt - Your True Tube'
- _VALID_URL = r'http://(?:www\.)?video\.tt/(?:video/|watch_video\.php\?v=)(?P<id>[\da-zA-Z]{9})'
+ _VALID_URL = r'http://(?:www\.)?video\.tt/(?:(?:video|embed)/|watch_video\.php\?v=)(?P<id>[\da-zA-Z]{9})'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.video.tt/watch_video.php?v=amd5YujV8',
'md5': 'b13aa9e2f267effb5d1094443dff65ba',
'info_dict': {
@@ -26,7 +26,10 @@ class VideoTtIE(InfoExtractor):
'upload_date': '20130827',
'uploader': 'joseph313',
}
- }
+ }, {
+ 'url': 'http://video.tt/embed/amd5YujV8',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -58,4 +61,4 @@ class VideoTtIE(InfoExtractor):
'like_count': int_or_none(video['liked']),
'dislike_count': int_or_none(video['disliked']),
'formats': formats,
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/videoweed.py b/youtube_dl/extractor/videoweed.py
index 4a08ddd43..ca2e50935 100644
--- a/youtube_dl/extractor/videoweed.py
+++ b/youtube_dl/extractor/videoweed.py
@@ -23,4 +23,4 @@ class VideoWeedIE(NovaMovIE):
'title': 'optical illusion dissapeared image magic illusion',
'description': ''
},
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py
new file mode 100644
index 000000000..08a5a7b8d
--- /dev/null
+++ b/youtube_dl/extractor/vidzi.py
@@ -0,0 +1,32 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class VidziIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?vidzi\.tv/(?P<id>\w+)'
+ _TEST = {
+ 'url': 'http://vidzi.tv/cghql9yq6emu.html',
+ 'md5': '4f16c71ca0c8c8635ab6932b5f3f1660',
+ 'info_dict': {
+ 'id': 'cghql9yq6emu',
+ 'ext': 'mp4',
+ 'title': 'youtube-dl test video 1\\\\2\'3/4<5\\\\6ä7↭',
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+ video_url = self._html_search_regex(
+ r'{\s*file\s*:\s*"([^"]+)"\s*}', webpage, 'video url')
+ title = self._html_search_regex(
+ r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'url': video_url,
+ }
diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py
new file mode 100644
index 000000000..619039e51
--- /dev/null
+++ b/youtube_dl/extractor/vier.py
@@ -0,0 +1,118 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class VierIE(InfoExtractor):
+ IE_NAME = 'vier'
+ _VALID_URL = r'https?://(?:www\.)?vier\.be/(?:[^/]+/videos/(?P<display_id>[^/]+)(?:/(?P<id>\d+))?|video/v3/embed/(?P<embed_id>\d+))'
+ _TESTS = [{
+ 'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129',
+ 'info_dict': {
+ 'id': '16129',
+ 'display_id': 'het-wordt-warm-de-moestuin',
+ 'ext': 'mp4',
+ 'title': 'Het wordt warm in De Moestuin',
+ 'description': 'De vele uren werk eisen hun tol. Wim droomt van assistentie...',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://www.vier.be/planb/videos/mieren-herders-van-de-bladluizen',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.vier.be/video/v3/embed/16129',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ embed_id = mobj.group('embed_id')
+ display_id = mobj.group('display_id') or embed_id
+
+ webpage = self._download_webpage(url, display_id)
+
+ video_id = self._search_regex(
+ r'"nid"\s*:\s*"(\d+)"', webpage, 'video id')
+ application = self._search_regex(
+ r'"application"\s*:\s*"([^"]+)"', webpage, 'application', default='vier_vod')
+ filename = self._search_regex(
+ r'"filename"\s*:\s*"([^"]+)"', webpage, 'filename')
+
+ playlist_url = 'http://vod.streamcloud.be/%s/mp4:_definst_/%s.mp4/playlist.m3u8' % (application, filename)
+ formats = self._extract_m3u8_formats(playlist_url, display_id, 'mp4')
+
+ title = self._og_search_title(webpage, default=display_id)
+ description = self._og_search_description(webpage, default=None)
+ thumbnail = self._og_search_thumbnail(webpage, default=None)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ }
+
+
+class VierVideosIE(InfoExtractor):
+ IE_NAME = 'vier:videos'
+ _VALID_URL = r'https?://(?:www\.)?vier\.be/(?P<program>[^/]+)/videos(?:\?.*\bpage=(?P<page>\d+)|$)'
+ _TESTS = [{
+ 'url': 'http://www.vier.be/demoestuin/videos',
+ 'info_dict': {
+ 'id': 'demoestuin',
+ },
+ 'playlist_mincount': 153,
+ }, {
+ 'url': 'http://www.vier.be/demoestuin/videos?page=6',
+ 'info_dict': {
+ 'id': 'demoestuin-page6',
+ },
+ 'playlist_mincount': 20,
+ }, {
+ 'url': 'http://www.vier.be/demoestuin/videos?page=7',
+ 'info_dict': {
+ 'id': 'demoestuin-page7',
+ },
+ 'playlist_mincount': 13,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ program = mobj.group('program')
+
+ webpage = self._download_webpage(url, program)
+
+ page_id = mobj.group('page')
+ if page_id:
+ page_id = int(page_id)
+ start_page = page_id
+ last_page = start_page + 1
+ playlist_id = '%s-page%d' % (program, page_id)
+ else:
+ start_page = 0
+ last_page = int(self._search_regex(
+ r'videos\?page=(\d+)">laatste</a>',
+ webpage, 'last page', default=0)) + 1
+ playlist_id = program
+
+ entries = []
+ for current_page_id in range(start_page, last_page):
+ current_page = self._download_webpage(
+ 'http://www.vier.be/%s/videos?page=%d' % (program, current_page_id),
+ program,
+ 'Downloading page %d' % (current_page_id + 1)) if current_page_id != page_id else webpage
+ page_entries = [
+ self.url_result('http://www.vier.be' + video_url, 'Vier')
+ for video_url in re.findall(
+ r'<h3><a href="(/[^/]+/videos/[^/]+(?:/\d+)?)">', current_page)]
+ entries.extend(page_entries)
+
+ return self.playlist_result(entries, playlist_id)
diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py
index 15f315298..944901e14 100644
--- a/youtube_dl/extractor/viki.py
+++ b/youtube_dl/extractor/viki.py
@@ -17,7 +17,6 @@ class VikiIE(SubtitlesInfoExtractor):
_VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)'
_TEST = {
'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14',
- 'md5': 'a21454021c2646f5433514177e2caa5f',
'info_dict': {
'id': '1023585v',
'ext': 'mp4',
@@ -31,8 +30,7 @@ class VikiIE(SubtitlesInfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group(1)
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = self._og_search_title(webpage)
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index 07959d3fe..06b0bed41 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -7,14 +7,14 @@ import itertools
from .common import InfoExtractor
from .subtitles import SubtitlesInfoExtractor
-from ..utils import (
- clean_html,
+from ..compat import (
compat_HTTPError,
compat_urllib_parse,
compat_urllib_request,
compat_urlparse,
+)
+from ..utils import (
ExtractorError,
- get_element_by_attribute,
InAdvancePagedList,
int_or_none,
RegexNotFoundError,
@@ -260,7 +260,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
else:
config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});']
config = self._search_regex(config_re, webpage, 'info section',
- flags=re.DOTALL)
+ flags=re.DOTALL)
config = json.loads(config)
except Exception as e:
if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
@@ -514,7 +514,7 @@ class VimeoReviewIE(InfoExtractor):
'info_dict': {
'id': '91613211',
'ext': 'mp4',
- 'title': 'Death by dogma versus assembling agile - Sander Hoogendoorn',
+ 'title': 're:(?i)^Death by dogma versus assembling agile . Sander Hoogendoorn',
'uploader': 'DevWeek Events',
'duration': 2773,
'thumbnail': 're:^https?://.*\.jpg$',
diff --git a/youtube_dl/extractor/vimple.py b/youtube_dl/extractor/vimple.py
index 33d370e1c..ee3d86117 100644
--- a/youtube_dl/extractor/vimple.py
+++ b/youtube_dl/extractor/vimple.py
@@ -14,28 +14,17 @@ class VimpleIE(InfoExtractor):
IE_DESC = 'Vimple.ru'
_VALID_URL = r'https?://(player.vimple.ru/iframe|vimple.ru)/(?P<id>[a-f0-9]{10,})'
_TESTS = [
- # Quality: Large, from iframe
{
- 'url': 'http://player.vimple.ru/iframe/b132bdfd71b546d3972f9ab9a25f201c',
+ 'url': 'http://vimple.ru/c0f6b1687dcd4000a97ebe70068039cf',
+ 'md5': '2e750a330ed211d3fd41821c6ad9a279',
'info_dict': {
- 'id': 'b132bdfd71b546d3972f9ab9a25f201c',
- 'title': 'great-escape-minecraft.flv',
+ 'id': 'c0f6b1687dcd4000a97ebe70068039cf',
'ext': 'mp4',
- 'duration': 352,
- 'webpage_url': 'http://vimple.ru/b132bdfd71b546d3972f9ab9a25f201c',
+ 'title': 'Sunset',
+ 'duration': 20,
+ 'thumbnail': 're:https?://.*?\.jpg',
},
},
- # Quality: Medium, from mainpage
- {
- 'url': 'http://vimple.ru/a15950562888453b8e6f9572dc8600cd',
- 'info_dict': {
- 'id': 'a15950562888453b8e6f9572dc8600cd',
- 'title': 'DB 01',
- 'ext': 'flv',
- 'duration': 1484,
- 'webpage_url': 'http://vimple.ru/a15950562888453b8e6f9572dc8600cd',
- }
- },
]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py
index e7754158d..0b58fe0fe 100644
--- a/youtube_dl/extractor/vine.py
+++ b/youtube_dl/extractor/vine.py
@@ -17,6 +17,7 @@ class VineIE(InfoExtractor):
'id': 'b9KOOWX7HUx',
'ext': 'mp4',
'title': 'Chicken.',
+ 'alt_title': 'Vine by Jack Dorsey',
'description': 'Chicken.',
'upload_date': '20130519',
'uploader': 'Jack Dorsey',
@@ -25,30 +26,26 @@ class VineIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)
data = json.loads(self._html_search_regex(
r'window\.POST_DATA = { %s: ({.+?}) }' % video_id, webpage, 'vine data'))
- formats = [
- {
- 'url': data['videoLowURL'],
- 'ext': 'mp4',
- 'format_id': 'low',
- },
- {
- 'url': data['videoUrl'],
- 'ext': 'mp4',
- 'format_id': 'standard',
- }
- ]
+ formats = [{
+ 'url': data['videoLowURL'],
+ 'ext': 'mp4',
+ 'format_id': 'low',
+ }, {
+ 'url': data['videoUrl'],
+ 'ext': 'mp4',
+ 'format_id': 'standard',
+ }]
return {
'id': video_id,
'title': self._og_search_title(webpage),
+ 'alt_title': self._og_search_description(webpage),
'description': data['description'],
'thumbnail': data['thumbnailUrl'],
'upload_date': unified_strdate(data['created']),
@@ -63,29 +60,36 @@ class VineIE(InfoExtractor):
class VineUserIE(InfoExtractor):
IE_NAME = 'vine:user'
- _VALID_URL = r'(?:https?://)?vine\.co/(?P<user>[^/]+)/?(\?.*)?$'
+ _VALID_URL = r'(?:https?://)?vine\.co/(?P<u>u/)?(?P<user>[^/]+)/?(\?.*)?$'
_VINE_BASE_URL = "https://vine.co/"
- _TEST = {
- 'url': 'https://vine.co/Visa',
- 'info_dict': {
- 'id': 'Visa',
+ _TESTS = [
+ {
+ 'url': 'https://vine.co/Visa',
+ 'info_dict': {
+ 'id': 'Visa',
+ },
+ 'playlist_mincount': 46,
},
- 'playlist_mincount': 47,
- }
+ {
+ 'url': 'https://vine.co/u/941705360593584128',
+ 'only_matching': True,
+ },
+ ]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
user = mobj.group('user')
+ u = mobj.group('u')
- profile_url = "%sapi/users/profiles/vanity/%s" % (
- self._VINE_BASE_URL, user)
+ profile_url = "%sapi/users/profiles/%s%s" % (
+ self._VINE_BASE_URL, 'vanity/' if not u else '', user)
profile_data = self._download_json(
profile_url, user, note='Downloading user profile data')
user_id = profile_data['data']['userId']
timeline_data = []
for pagenum in itertools.count(1):
- timeline_url = "%sapi/timelines/users/%s?page=%s" % (
+ timeline_url = "%sapi/timelines/users/%s?page=%s&size=100" % (
self._VINE_BASE_URL, user_id, pagenum)
timeline_page = self._download_json(
timeline_url, user, note='Downloading page %d' % pagenum)
diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py
index 918bd1098..81e02a624 100644
--- a/youtube_dl/extractor/vk.py
+++ b/youtube_dl/extractor/vk.py
@@ -5,18 +5,22 @@ import re
import json
from .common import InfoExtractor
+from ..compat import (
+ compat_str,
+ compat_urllib_parse,
+ compat_urllib_request,
+)
from ..utils import (
ExtractorError,
- compat_urllib_request,
- compat_urllib_parse,
- compat_str,
+ orderedSet,
unescapeHTML,
+ unified_strdate,
)
class VKIE(InfoExtractor):
IE_NAME = 'vk.com'
- _VALID_URL = r'https?://(?:m\.)?vk\.com/(?:video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)|(?:.+?\?.*?z=)?video(?P<videoid>.*?)(?:\?|%2F|$))'
+ _VALID_URL = r'https?://(?:m\.)?vk\.com/(?:video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)|(?:.+?\?.*?z=)?video(?P<videoid>[^s].*?)(?:\?|%2F|$))'
_NETRC_MACHINE = 'vk'
_TESTS = [
@@ -29,17 +33,19 @@ class VKIE(InfoExtractor):
'title': 'ProtivoGunz - Хуёвая песня',
'uploader': 're:Noize MC.*',
'duration': 195,
+ 'upload_date': '20120212',
},
},
{
- 'url': 'http://vk.com/video4643923_163339118',
- 'md5': 'f79bccb5cd182b1f43502ca5685b2b36',
+ 'url': 'http://vk.com/video205387401_165548505',
+ 'md5': '6c0aeb2e90396ba97035b9cbde548700',
'info_dict': {
- 'id': '163339118',
+ 'id': '165548505',
'ext': 'mp4',
- 'uploader': 'Elya Iskhakova',
- 'title': 'Dream Theater - Hollow Years Live at Budokan 720*',
- 'duration': 558,
+ 'uploader': 'Tom Cruise',
+ 'title': 'No name',
+ 'duration': 9,
+ 'upload_date': '20130721'
}
},
{
@@ -52,9 +58,12 @@ class VKIE(InfoExtractor):
'uploader': 'Vladimir Gavrin',
'title': 'Lin Dan',
'duration': 101,
+ 'upload_date': '20120730',
}
},
{
+ # VIDEO NOW REMOVED
+ # please update if you find a video whose URL follows the same pattern
'url': 'http://vk.com/video-8871596_164049491',
'md5': 'a590bcaf3d543576c9bd162812387666',
'note': 'Only available for registered users',
@@ -64,18 +73,7 @@ class VKIE(InfoExtractor):
'uploader': 'Триллеры',
'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]',
'duration': 8352,
- },
- 'skip': 'Requires vk account credentials',
- },
- {
- 'url': 'http://vk.com/feed?z=video-43215063_166094326%2Fbb50cacd3177146d7a',
- 'md5': 'd82c22e449f036282d1d3f7f4d276869',
- 'info_dict': {
- 'id': '166094326',
- 'ext': 'mp4',
- 'uploader': 'Киномания - лучшее из мира кино',
- 'title': 'Запах женщины (1992)',
- 'duration': 9392,
+ 'upload_date': '20121218'
},
'skip': 'Requires vk account credentials',
},
@@ -88,6 +86,7 @@ class VKIE(InfoExtractor):
'uploader': 'Киномания - лучшее из мира кино',
'title': ' ',
'duration': 7291,
+ 'upload_date': '20140328',
},
'skip': 'Requires vk account credentials',
},
@@ -100,9 +99,15 @@ class VKIE(InfoExtractor):
'ext': 'mp4',
'title': 'Книга Илая',
'duration': 6771,
+ 'upload_date': '20140626',
},
'skip': 'Only works from Russia',
},
+ {
+ # removed video, just testing that we match the pattern
+ 'url': 'http://vk.com/feed?z=video-43215063_166094326%2Fbb50cacd3177146d7a',
+ 'only_matching': True,
+ },
]
def _login(self):
@@ -119,7 +124,7 @@ class VKIE(InfoExtractor):
}
request = compat_urllib_request.Request('https://login.vk.com/?act=login',
- compat_urllib_parse.urlencode(login_form).encode('utf-8'))
+ compat_urllib_parse.urlencode(login_form).encode('utf-8'))
login_page = self._download_webpage(request, None, note='Logging in as %s' % username)
if re.search(r'onLoginFailed', login_page):
@@ -138,15 +143,35 @@ class VKIE(InfoExtractor):
info_url = 'http://vk.com/al_video.php?act=show&al=1&video=%s' % video_id
info_page = self._download_webpage(info_url, video_id)
- if re.search(r'<!>Please log in or <', info_page):
- raise ExtractorError('This video is only available for registered users, '
- 'use --username and --password options to provide account credentials.', expected=True)
+ ERRORS = {
+ r'>Видеозапись .*? была изъята из публичного доступа в связи с обращением правообладателя.<':
+ 'Video %s has been removed from public access due to rightholder complaint.',
+
+ r'<!>Please log in or <':
+ 'Video %s is only available for registered users, '
+ 'use --username and --password options to provide account credentials.',
+
+ r'<!>Unknown error':
+ 'Video %s does not exist.'
+ }
+
+ for error_re, error_msg in ERRORS.items():
+ if re.search(error_re, info_page):
+ raise ExtractorError(error_msg % video_id, expected=True)
m_yt = re.search(r'src="(http://www.youtube.com/.*?)"', info_page)
if m_yt is not None:
self.to_screen('Youtube video detected')
return self.url_result(m_yt.group(1), 'Youtube')
+ m_rutube = re.search(
+ r'\ssrc="((?:https?:)?//rutube\.ru\\?/video\\?/embed(?:.*?))\\?"', info_page)
+ if m_rutube is not None:
+ self.to_screen('rutube video detected')
+ rutube_url = self._proto_relative_url(
+ m_rutube.group(1).replace('\\', ''))
+ return self.url_result(rutube_url)
+
m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.*?});', info_page)
if m_opts:
m_opts_url = re.search(r"url\s*:\s*'([^']+)", m_opts.group(1))
@@ -159,6 +184,13 @@ class VKIE(InfoExtractor):
data_json = self._search_regex(r'var vars = ({.*?});', info_page, 'vars')
data = json.loads(data_json)
+ # Extract upload date
+ upload_date = None
+ mobj = re.search(r'id="mv_date_wrap".*?Added ([a-zA-Z]+ [0-9]+), ([0-9]+) at', info_page)
+ if mobj is not None:
+ mobj.group(1) + ' ' + mobj.group(2)
+ upload_date = unified_strdate(mobj.group(1) + ' ' + mobj.group(2))
+
formats = [{
'format_id': k,
'url': v,
@@ -173,5 +205,28 @@ class VKIE(InfoExtractor):
'title': unescapeHTML(data['md_title']),
'thumbnail': data.get('jpg'),
'uploader': data.get('md_author'),
- 'duration': data.get('duration')
+ 'duration': data.get('duration'),
+ 'upload_date': upload_date,
}
+
+
+class VKUserVideosIE(InfoExtractor):
+ IE_NAME = 'vk.com:user-videos'
+ IE_DESC = 'vk.com:All of a user\'s videos'
+ _VALID_URL = r'https?://vk\.com/videos(?P<id>[0-9]+)(?:m\?.*)?'
+ _TEMPLATE_URL = 'https://vk.com/videos'
+ _TEST = {
+ 'url': 'http://vk.com/videos205387401',
+ 'playlist_mincount': 4,
+ }
+
+ def _real_extract(self, url):
+ page_id = self._match_id(url)
+ page = self._download_webpage(url, page_id)
+ video_ids = orderedSet(
+ m.group(1) for m in re.finditer(r'href="/video([0-9_]+)"', page))
+ url_entries = [
+ self.url_result(
+ 'http://vk.com/video' + video_id, 'VK', video_id=video_id)
+ for video_id in video_ids]
+ return self.playlist_result(url_entries, page_id)
diff --git a/youtube_dl/extractor/vodlocker.py b/youtube_dl/extractor/vodlocker.py
index affef6507..1c0966a79 100644
--- a/youtube_dl/extractor/vodlocker.py
+++ b/youtube_dl/extractor/vodlocker.py
@@ -2,8 +2,9 @@
from __future__ import unicode_literals
import re
+
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urllib_request,
)
@@ -24,8 +25,7 @@ class VodlockerIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
fields = dict(re.findall(r'''(?x)<input\s+
diff --git a/youtube_dl/extractor/vrt.py b/youtube_dl/extractor/vrt.py
new file mode 100644
index 000000000..bbd3bbf7b
--- /dev/null
+++ b/youtube_dl/extractor/vrt.py
@@ -0,0 +1,95 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import float_or_none
+
+
+class VRTIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:deredactie|sporza|cobra)\.be/cm/(?:[^/]+/)+(?P<id>[^/]+)/*'
+ _TESTS = [
+ # deredactie.be
+ {
+ 'url': 'http://deredactie.be/cm/vrtnieuws/videozone/programmas/journaal/EP_141025_JOL',
+ 'md5': '4cebde1eb60a53782d4f3992cbd46ec8',
+ 'info_dict': {
+ 'id': '2129880',
+ 'ext': 'flv',
+ 'title': 'Het journaal L - 25/10/14',
+ 'description': None,
+ 'timestamp': 1414271750.949,
+ 'upload_date': '20141025',
+ 'duration': 929,
+ }
+ },
+ # sporza.be
+ {
+ 'url': 'http://sporza.be/cm/sporza/videozone/programmas/extratime/EP_141020_Extra_time',
+ 'md5': '11f53088da9bf8e7cfc42456697953ff',
+ 'info_dict': {
+ 'id': '2124639',
+ 'ext': 'flv',
+ 'title': 'Bekijk Extra Time van 20 oktober',
+ 'description': 'md5:83ac5415a4f1816c6a93f8138aef2426',
+ 'timestamp': 1413835980.560,
+ 'upload_date': '20141020',
+ 'duration': 3238,
+ }
+ },
+ # cobra.be
+ {
+ 'url': 'http://cobra.be/cm/cobra/videozone/rubriek/film-videozone/141022-mv-ellis-cafecorsari',
+ 'md5': '78a2b060a5083c4f055449a72477409d',
+ 'info_dict': {
+ 'id': '2126050',
+ 'ext': 'flv',
+ 'title': 'Bret Easton Ellis in Café Corsari',
+ 'description': 'md5:f699986e823f32fd6036c1855a724ee9',
+ 'timestamp': 1413967500.494,
+ 'upload_date': '20141022',
+ 'duration': 661,
+ }
+ },
+ ]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_id = self._search_regex(
+ r'data-video-id="([^"]+)_[^"]+"', webpage, 'video id', fatal=False)
+
+ formats = []
+ mobj = re.search(
+ r'data-video-iphone-server="(?P<server>[^"]+)"\s+data-video-iphone-path="(?P<path>[^"]+)"',
+ webpage)
+ if mobj:
+ formats.extend(self._extract_m3u8_formats(
+ '%s/%s' % (mobj.group('server'), mobj.group('path')),
+ video_id, 'mp4'))
+ mobj = re.search(r'data-video-src="(?P<src>[^"]+)"', webpage)
+ if mobj:
+ formats.extend(self._extract_f4m_formats(
+ '%s/manifest.f4m' % mobj.group('src'), video_id))
+ self._sort_formats(formats)
+
+ title = self._og_search_title(webpage)
+ description = self._og_search_description(webpage, default=None)
+ thumbnail = self._og_search_thumbnail(webpage)
+ timestamp = float_or_none(self._search_regex(
+ r'data-video-sitestat-pubdate="(\d+)"', webpage, 'timestamp', fatal=False), 1000)
+ duration = float_or_none(self._search_regex(
+ r'data-video-duration="(\d+)"', webpage, 'duration', fatal=False), 1000)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/vube.py b/youtube_dl/extractor/vube.py
index 1b2f731e9..405cb9db4 100644
--- a/youtube_dl/extractor/vube.py
+++ b/youtube_dl/extractor/vube.py
@@ -3,9 +3,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import (
+ compat_str,
+)
from ..utils import (
int_or_none,
- compat_str,
ExtractorError,
)
diff --git a/youtube_dl/extractor/vuclip.py b/youtube_dl/extractor/vuclip.py
index ec3c010ad..c3fde53f5 100644
--- a/youtube_dl/extractor/vuclip.py
+++ b/youtube_dl/extractor/vuclip.py
@@ -3,8 +3,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse_urlparse,
+)
+from ..utils import (
ExtractorError,
parse_duration,
qualities,
@@ -25,10 +27,9 @@ class VuClipIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
+
ad_m = re.search(
r'''value="No.*?" onClick="location.href='([^"']+)'"''', webpage)
if ad_m:
diff --git a/youtube_dl/extractor/washingtonpost.py b/youtube_dl/extractor/washingtonpost.py
index 88bbbb219..c17bebd6e 100644
--- a/youtube_dl/extractor/washingtonpost.py
+++ b/youtube_dl/extractor/washingtonpost.py
@@ -10,14 +10,14 @@ from ..utils import (
class WashingtonPostIE(InfoExtractor):
- _VALID_URL = r'^https?://(?:www\.)?washingtonpost\.com/.*?/(?P<id>[^/]+)/(?:$|[?#])'
+ _VALID_URL = r'https?://(?:www\.)?washingtonpost\.com/.*?/(?P<id>[^/]+)/(?:$|[?#])'
_TEST = {
'url': 'http://www.washingtonpost.com/sf/national/2014/03/22/sinkhole-of-bureaucracy/',
'info_dict': {
'title': 'Sinkhole of bureaucracy',
},
'playlist': [{
- 'md5': 'c3f4b4922ffa259243f68e928db2db8c',
+ 'md5': '79132cc09ec5309fa590ae46e4cc31bc',
'info_dict': {
'id': 'fc433c38-b146-11e3-b8b3-44b1d1cd4c1f',
'ext': 'mp4',
@@ -29,7 +29,7 @@ class WashingtonPostIE(InfoExtractor):
'upload_date': '20140322',
},
}, {
- 'md5': 'f645a07652c2950cd9134bb852c5f5eb',
+ 'md5': 'e1d5734c06865cc504ad99dc2de0d443',
'info_dict': {
'id': '41255e28-b14a-11e3-b8b3-44b1d1cd4c1f',
'ext': 'mp4',
@@ -44,10 +44,9 @@ class WashingtonPostIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- page_id = mobj.group('id')
-
+ page_id = self._match_id(url)
webpage = self._download_webpage(url, page_id)
+
title = self._og_search_title(webpage)
uuids = re.findall(r'data-video-uuid="([^"]+)"', webpage)
entries = []
diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py
index 54d37da61..313b9c15d 100644
--- a/youtube_dl/extractor/wdr.py
+++ b/youtube_dl/extractor/wdr.py
@@ -1,12 +1,15 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
+import itertools
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_parse_qs,
compat_urlparse,
+)
+from ..utils import (
determine_ext,
unified_strdate,
)
@@ -65,6 +68,10 @@ class WDRIE(InfoExtractor):
'upload_date': '20140717',
},
},
+ {
+ 'url': 'http://www1.wdr.de/mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100.html',
+ 'playlist_mincount': 146,
+ }
]
def _real_extract(self, url):
@@ -79,6 +86,27 @@ class WDRIE(InfoExtractor):
self.url_result(page_url + href, 'WDR')
for href in re.findall(r'<a href="/?(.+?%s\.html)" rel="nofollow"' % self._PLAYER_REGEX, webpage)
]
+
+ if entries: # Playlist page
+ return self.playlist_result(entries, page_id)
+
+ # Overview page
+ entries = []
+ for page_num in itertools.count(2):
+ hrefs = re.findall(
+ r'<li class="mediathekvideo"\s*>\s*<img[^>]*>\s*<a href="(/mediathek/video/[^"]+)"',
+ webpage)
+ entries.extend(
+ self.url_result(page_url + href, 'WDR')
+ for href in hrefs)
+ next_url_m = re.search(
+ r'<li class="nextToLast">\s*<a href="([^"]+)"', webpage)
+ if not next_url_m:
+ break
+ next_url = page_url + next_url_m.group(1)
+ webpage = self._download_webpage(
+ next_url, page_id,
+ note='Downloading playlist page %d' % page_num)
return self.playlist_result(entries, page_id)
flashvars = compat_parse_qs(
@@ -141,8 +169,9 @@ class WDRMobileIE(InfoExtractor):
'title': mobj.group('title'),
'age_limit': int(mobj.group('age_limit')),
'url': url,
- 'ext': determine_ext(url),
- 'user_agent': 'mobile',
+ 'http_headers': {
+ 'User-Agent': 'mobile',
+ },
}
@@ -171,8 +200,7 @@ class WDRMausIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
param_code = self._html_search_regex(
@@ -223,5 +251,3 @@ class WDRMausIE(InfoExtractor):
'thumbnail': thumbnail,
'upload_date': upload_date,
}
-
-# TODO test _1 \ No newline at end of file
diff --git a/youtube_dl/extractor/webofstories.py b/youtube_dl/extractor/webofstories.py
new file mode 100644
index 000000000..396cf4e83
--- /dev/null
+++ b/youtube_dl/extractor/webofstories.py
@@ -0,0 +1,102 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class WebOfStoriesIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?webofstories\.com/play/(?:[^/]+/)?(?P<id>[0-9]+)'
+ _VIDEO_DOMAIN = 'http://eu-mobile.webofstories.com/'
+ _GREAT_LIFE_STREAMER = 'rtmp://eu-cdn1.webofstories.com/cfx/st/'
+ _USER_STREAMER = 'rtmp://eu-users.webofstories.com/cfx/st/'
+ _TESTS = [
+ {
+ 'url': 'http://www.webofstories.com/play/hans.bethe/71',
+ 'md5': '373e4dd915f60cfe3116322642ddf364',
+ 'info_dict': {
+ 'id': '4536',
+ 'ext': 'mp4',
+ 'title': 'The temperature of the sun',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'description': 'Hans Bethe talks about calculating the temperature of the sun',
+ 'duration': 238,
+ }
+ },
+ {
+ 'url': 'http://www.webofstories.com/play/55908',
+ 'md5': '2985a698e1fe3211022422c4b5ed962c',
+ 'info_dict': {
+ 'id': '55908',
+ 'ext': 'mp4',
+ 'title': 'The story of Gemmata obscuriglobus',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'description': 'Planctomycete talks about The story of Gemmata obscuriglobus',
+ 'duration': 169,
+ }
+ },
+ ]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+ title = self._og_search_title(webpage)
+ description = self._html_search_meta('description', webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
+
+ story_filename = self._search_regex(
+ r'\.storyFileName\("([^"]+)"\)', webpage, 'story filename')
+ speaker_id = self._search_regex(
+ r'\.speakerId\("([^"]+)"\)', webpage, 'speaker ID')
+ story_id = self._search_regex(
+ r'\.storyId\((\d+)\)', webpage, 'story ID')
+ speaker_type = self._search_regex(
+ r'\.speakerType\("([^"]+)"\)', webpage, 'speaker type')
+ great_life = self._search_regex(
+ r'isGreatLifeStory\s*=\s*(true|false)', webpage, 'great life story')
+ is_great_life_series = great_life == 'true'
+ duration = int_or_none(self._search_regex(
+ r'\.duration\((\d+)\)', webpage, 'duration', fatal=False))
+
+ # URL building, see: http://www.webofstories.com/scripts/player.js
+ ms_prefix = ''
+ if speaker_type.lower() == 'ms':
+ ms_prefix = 'mini_sites/'
+
+ if is_great_life_series:
+ mp4_url = '{0:}lives/{1:}/{2:}.mp4'.format(
+ self._VIDEO_DOMAIN, speaker_id, story_filename)
+ rtmp_ext = 'flv'
+ streamer = self._GREAT_LIFE_STREAMER
+ play_path = 'stories/{0:}/{1:}'.format(
+ speaker_id, story_filename)
+ else:
+ mp4_url = '{0:}{1:}{2:}/{3:}.mp4'.format(
+ self._VIDEO_DOMAIN, ms_prefix, speaker_id, story_filename)
+ rtmp_ext = 'mp4'
+ streamer = self._USER_STREAMER
+ play_path = 'mp4:{0:}{1:}/{2}.mp4'.format(
+ ms_prefix, speaker_id, story_filename)
+
+ formats = [{
+ 'format_id': 'mp4_sd',
+ 'url': mp4_url,
+ }, {
+ 'format_id': 'rtmp_sd',
+ 'page_url': url,
+ 'url': streamer,
+ 'ext': rtmp_ext,
+ 'play_path': play_path,
+ }]
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': story_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': thumbnail,
+ 'description': description,
+ 'duration': duration,
+ }
diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py
index b24297a40..20bb039d3 100644
--- a/youtube_dl/extractor/weibo.py
+++ b/youtube_dl/extractor/weibo.py
@@ -41,7 +41,7 @@ class WeiboIE(InfoExtractor):
videos_urls = sorted(videos_urls, key=lambda u: 'video.sina.com' in u)
player_url = videos_urls[-1]
m_sina = re.match(r'https?://video\.sina\.com\.cn/v/b/(\d+)-\d+\.html',
- player_url)
+ player_url)
if m_sina is not None:
self.to_screen('Sina video detected')
sina_id = m_sina.group(1)
diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py
index c27dda944..d6dec25ca 100644
--- a/youtube_dl/extractor/wimp.py
+++ b/youtube_dl/extractor/wimp.py
@@ -37,7 +37,7 @@ class WimpIE(InfoExtractor):
video_id = mobj.group(1)
webpage = self._download_webpage(url, video_id)
video_url = self._search_regex(
- r's1\.addVariable\("file",\s*"([^"]+)"\);', webpage, 'video URL')
+ r"[\"']file[\"']\s*[:,]\s*[\"'](.+?)[\"']", webpage, 'video URL')
if YoutubeIE.suitable(video_url):
self.to_screen('Found YouTube video')
return {
diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py
index 748443f81..13a079151 100644
--- a/youtube_dl/extractor/wistia.py
+++ b/youtube_dl/extractor/wistia.py
@@ -1,9 +1,8 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import ExtractorError, compat_urllib_request
+from ..compat import compat_urllib_request
+from ..utils import ExtractorError
class WistiaIE(InfoExtractor):
@@ -22,8 +21,7 @@ class WistiaIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
request = compat_urllib_request.Request(self._API_URL.format(video_id))
request.add_header('Referer', url) # Some videos require this.
diff --git a/youtube_dl/extractor/worldstarhiphop.py b/youtube_dl/extractor/worldstarhiphop.py
index bda3870db..d5c26a032 100644
--- a/youtube_dl/extractor/worldstarhiphop.py
+++ b/youtube_dl/extractor/worldstarhiphop.py
@@ -51,4 +51,3 @@ class WorldStarHipHopIE(InfoExtractor):
'title': video_title,
'thumbnail': thumbnail,
}
-
diff --git a/youtube_dl/extractor/wrzuta.py b/youtube_dl/extractor/wrzuta.py
index 34dd6d952..c42764921 100644
--- a/youtube_dl/extractor/wrzuta.py
+++ b/youtube_dl/extractor/wrzuta.py
@@ -27,15 +27,15 @@ class WrzutaIE(InfoExtractor):
'description': 'md5:7fb5ef3c21c5893375fda51d9b15d9cd',
},
}, {
- 'url': 'http://w729.wrzuta.pl/audio/9oXJqdcndqv/david_guetta_amp_showtek_ft._vassy_-_bad',
- 'md5': '1e546a18e1c22ac6e9adce17b8961ff5',
+ 'url': 'http://jolka85.wrzuta.pl/audio/063jOPX5ue2/liber_natalia_szroeder_-_teraz_ty',
+ 'md5': 'bc78077859bea7bcfe4295d7d7fc9025',
'info_dict': {
- 'id': '9oXJqdcndqv',
+ 'id': '063jOPX5ue2',
'ext': 'ogg',
- 'title': 'David Guetta & Showtek ft. Vassy - Bad',
- 'duration': 270,
- 'uploader_id': 'w729',
- 'description': 'md5:4628f01c666bbaaecefa83476cfa794a',
+ 'title': 'Liber & Natalia Szroeder - Teraz Ty',
+ 'duration': 203,
+ 'uploader_id': 'jolka85',
+ 'description': 'md5:2d2b6340f9188c8c4cd891580e481096',
},
}]
@@ -49,16 +49,17 @@ class WrzutaIE(InfoExtractor):
quality = qualities(['SD', 'MQ', 'HQ', 'HD'])
- audio_table = {'flv': 'mp3', 'webm': 'ogg'}
+ audio_table = {'flv': 'mp3', 'webm': 'ogg', '???': 'mp3'}
embedpage = self._download_json('http://www.wrzuta.pl/npp/embed/%s/%s' % (uploader, video_id), video_id)
formats = []
for media in embedpage['url']:
+ fmt = media['type'].split('@')[0]
if typ == 'audio':
- ext = audio_table[media['type'].split('@')[0]]
+ ext = audio_table.get(fmt, fmt)
else:
- ext = media['type'].split('@')[0]
+ ext = fmt
formats.append({
'format_id': '%s_%s' % (ext, media['quality'].lower()),
diff --git a/youtube_dl/extractor/xbef.py b/youtube_dl/extractor/xbef.py
index 71bd7c463..80c48c37d 100644
--- a/youtube_dl/extractor/xbef.py
+++ b/youtube_dl/extractor/xbef.py
@@ -1,9 +1,7 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
)
@@ -23,10 +21,9 @@ class XBefIE(InfoExtractor):
}
def _real_extract(self, url):
- m = re.match(self._VALID_URL, url)
- video_id = m.group('id')
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
+
title = self._html_search_regex(
r'<h1[^>]*>(.*?)</h1>', webpage, 'title')
@@ -47,4 +44,3 @@ class XBefIE(InfoExtractor):
'thumbnail': thumbnail,
'age_limit': 18,
}
-
diff --git a/youtube_dl/extractor/xboxclips.py b/youtube_dl/extractor/xboxclips.py
index a9aa72e73..236ff403b 100644
--- a/youtube_dl/extractor/xboxclips.py
+++ b/youtube_dl/extractor/xboxclips.py
@@ -1,46 +1,42 @@
# encoding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..utils import (
- parse_iso8601,
- float_or_none,
int_or_none,
+ parse_filesize,
+ unified_strdate,
)
class XboxClipsIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?xboxclips\.com/video\.php\?.*vid=(?P<id>[\w-]{36})'
+ _VALID_URL = r'https?://(?:www\.)?xboxclips\.com/(?:video\.php\?.*vid=|[^/]+/)(?P<id>[\w-]{36})'
_TEST = {
'url': 'https://xboxclips.com/video.php?uid=2533274823424419&gamertag=Iabdulelah&vid=074a69a9-5faf-46aa-b93b-9909c1720325',
'md5': 'fbe1ec805e920aeb8eced3c3e657df5d',
'info_dict': {
'id': '074a69a9-5faf-46aa-b93b-9909c1720325',
'ext': 'mp4',
- 'title': 'Iabdulelah playing Upload Studio',
- 'filesize_approx': 28101836.8,
- 'timestamp': 1407388500,
+ 'title': 'Iabdulelah playing Titanfall',
+ 'filesize_approx': 26800000,
'upload_date': '20140807',
'duration': 56,
}
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
video_url = self._html_search_regex(
- r'>Link: <a href="([^"]+)">', webpage, 'video URL')
+ r'>(?:Link|Download): <a[^>]+href="([^"]+)"', webpage, 'video URL')
title = self._html_search_regex(
r'<title>XboxClips \| ([^<]+)</title>', webpage, 'title')
- timestamp = parse_iso8601(self._html_search_regex(
+ upload_date = unified_strdate(self._html_search_regex(
r'>Recorded: ([^<]+)<', webpage, 'upload date', fatal=False))
- filesize = float_or_none(self._html_search_regex(
- r'>Size: ([\d\.]+)MB<', webpage, 'file size', fatal=False), invscale=1024 * 1024)
+ filesize = parse_filesize(self._html_search_regex(
+ r'>Size: ([^<]+)<', webpage, 'file size', fatal=False))
duration = int_or_none(self._html_search_regex(
r'>Duration: (\d+) Seconds<', webpage, 'duration', fatal=False))
view_count = int_or_none(self._html_search_regex(
@@ -50,7 +46,7 @@ class XboxClipsIE(InfoExtractor):
'id': video_id,
'url': video_url,
'title': title,
- 'timestamp': timestamp,
+ 'upload_date': upload_date,
'filesize_approx': filesize,
'duration': duration,
'view_count': view_count,
diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py
index 4e8fbde8d..4527567f8 100644
--- a/youtube_dl/extractor/xhamster.py
+++ b/youtube_dl/extractor/xhamster.py
@@ -14,7 +14,7 @@ from ..utils import (
class XHamsterIE(InfoExtractor):
"""Information Extractor for xHamster"""
- _VALID_URL = r'http://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'
+ _VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'
_TESTS = [
{
'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html',
@@ -39,10 +39,14 @@ class XHamsterIE(InfoExtractor):
'duration': 200,
'age_limit': 18,
}
- }
+ },
+ {
+ 'url': 'https://xhamster.com/movies/2272726/amber_slayed_by_the_knight.html',
+ 'only_matching': True,
+ },
]
- def _real_extract(self,url):
+ def _real_extract(self, url):
def extract_video_url(webpage):
mp4 = re.search(r'<video\s+.*?file="([^"]+)".*?>', webpage)
if mp4 is None:
@@ -57,7 +61,8 @@ class XHamsterIE(InfoExtractor):
video_id = mobj.group('id')
seo = mobj.group('seo')
- mrss_url = 'http://xhamster.com/movies/%s/%s.html' % (video_id, seo)
+ proto = mobj.group('proto')
+ mrss_url = '%s://xhamster.com/movies/%s/%s.html' % (proto, video_id, seo)
webpage = self._download_webpage(mrss_url, video_id)
title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage, 'title')
@@ -67,17 +72,17 @@ class XHamsterIE(InfoExtractor):
description = mobj.group(1) if mobj else None
upload_date = self._html_search_regex(r'hint=\'(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}:\d{2} [A-Z]{3,4}\'',
- webpage, 'upload date', fatal=False)
+ webpage, 'upload date', fatal=False)
if upload_date:
upload_date = unified_strdate(upload_date)
uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
- webpage, 'uploader id', default='anonymous')
+ webpage, 'uploader id', default='anonymous')
thumbnail = self._html_search_regex(r'<video\s+.*?poster="([^"]+)".*?>', webpage, 'thumbnail', fatal=False)
duration = parse_duration(self._html_search_regex(r'<span>Runtime:</span> (\d+:\d+)</div>',
- webpage, 'duration', fatal=False))
+ webpage, 'duration', fatal=False))
view_count = self._html_search_regex(r'<span>Views:</span> ([^<]+)</div>', webpage, 'view count', fatal=False)
if view_count:
diff --git a/youtube_dl/extractor/xminus.py b/youtube_dl/extractor/xminus.py
new file mode 100644
index 000000000..8c6241aed
--- /dev/null
+++ b/youtube_dl/extractor/xminus.py
@@ -0,0 +1,76 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_chr,
+ compat_ord,
+)
+from ..utils import (
+ int_or_none,
+ parse_filesize,
+)
+
+
+class XMinusIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?x-minus\.org/track/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://x-minus.org/track/4542/%D0%BF%D0%B5%D1%81%D0%B5%D0%BD%D0%BA%D0%B0-%D1%88%D0%BE%D1%84%D0%B5%D1%80%D0%B0.html',
+ 'md5': '401a15f2d2dcf6d592cb95528d72a2a8',
+ 'info_dict': {
+ 'id': '4542',
+ 'ext': 'mp3',
+ 'title': 'Леонид Агутин-Песенка шофера',
+ 'duration': 156,
+ 'tbr': 320,
+ 'filesize_approx': 5900000,
+ 'view_count': int,
+ 'description': 'md5:03238c5b663810bc79cf42ef3c03e371',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ artist = self._html_search_regex(
+ r'minus_track\.artist="(.+?)"', webpage, 'artist')
+ title = artist + '-' + self._html_search_regex(
+ r'minus_track\.title="(.+?)"', webpage, 'title')
+ duration = int_or_none(self._html_search_regex(
+ r'minus_track\.dur_sec=\'([0-9]*?)\'',
+ webpage, 'duration', fatal=False))
+ filesize_approx = parse_filesize(self._html_search_regex(
+ r'<div class="filesize[^"]*"></div>\s*([0-9.]+\s*[a-zA-Z][bB])',
+ webpage, 'approximate filesize', fatal=False))
+ tbr = int_or_none(self._html_search_regex(
+ r'<div class="quality[^"]*"></div>\s*([0-9]+)\s*kbps',
+ webpage, 'bitrate', fatal=False))
+ view_count = int_or_none(self._html_search_regex(
+ r'<div class="quality.*?► ([0-9]+)',
+ webpage, 'view count', fatal=False))
+ description = self._html_search_regex(
+ r'(?s)<div id="song_texts">(.*?)</div><br',
+ webpage, 'song lyrics', fatal=False)
+ if description:
+ description = re.sub(' *\r *', '\n', description)
+
+ enc_token = self._html_search_regex(
+ r'minus_track\.tkn="(.+?)"', webpage, 'enc_token')
+ token = ''.join(
+ c if pos == 3 else compat_chr(compat_ord(c) - 1)
+ for pos, c in enumerate(reversed(enc_token)))
+ video_url = 'http://x-minus.org/dwlf/%s/%s.mp3' % (video_id, token)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'url': video_url,
+ 'duration': duration,
+ 'filesize_approx': filesize_approx,
+ 'tbr': tbr,
+ 'view_count': view_count,
+ 'description': description,
+ }
diff --git a/youtube_dl/extractor/xnxx.py b/youtube_dl/extractor/xnxx.py
index 7a73b2430..79ed6c744 100644
--- a/youtube_dl/extractor/xnxx.py
+++ b/youtube_dl/extractor/xnxx.py
@@ -1,10 +1,8 @@
# encoding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
)
@@ -23,21 +21,18 @@ class XNXXIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
- # Get webpage content
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
video_url = self._search_regex(r'flv_url=(.*?)&amp;',
- webpage, 'video URL')
+ webpage, 'video URL')
video_url = compat_urllib_parse.unquote(video_url)
video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XNXX.COM',
- webpage, 'title')
+ webpage, 'title')
video_thumbnail = self._search_regex(r'url_bigthumb=(.*?)&amp;',
- webpage, 'thumbnail', fatal=False)
+ webpage, 'thumbnail', fatal=False)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py
index 273d93d9e..e8490b028 100644
--- a/youtube_dl/extractor/xtube.py
+++ b/youtube_dl/extractor/xtube.py
@@ -1,18 +1,20 @@
from __future__ import unicode_literals
import re
-import json
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_request,
+ compat_urllib_parse,
+)
+from ..utils import (
parse_duration,
str_to_int,
)
class XTubeIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?(?P<url>xtube\.com/watch\.php\?v=(?P<videoid>[^/?&]+))'
+ _VALID_URL = r'https?://(?:www\.)?(?P<url>xtube\.com/watch\.php\?v=(?P<id>[^/?&#]+))'
_TEST = {
'url': 'http://www.xtube.com/watch.php?v=kVTUy_G222_',
'md5': '092fbdd3cbe292c920ef6fc6a8a9cdab',
@@ -20,7 +22,7 @@ class XTubeIE(InfoExtractor):
'id': 'kVTUy_G222_',
'ext': 'mp4',
'title': 'strange erotica',
- 'description': 'surreal gay themed erotica...almost an ET kind of thing',
+ 'description': 'http://www.xtube.com an ET kind of thing',
'uploader': 'greenshowers',
'duration': 450,
'age_limit': 18,
@@ -28,41 +30,49 @@ class XTubeIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('videoid')
- url = 'http://www.' + mobj.group('url')
+ video_id = self._match_id(url)
req = compat_urllib_request.Request(url)
req.add_header('Cookie', 'age_verified=1')
webpage = self._download_webpage(req, video_id)
- video_title = self._html_search_regex(r'<p class="title">([^<]+)', webpage, 'title')
+ video_title = self._html_search_regex(
+ r'<p class="title">([^<]+)', webpage, 'title')
video_uploader = self._html_search_regex(
- r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, 'uploader', fatal=False)
+ [r"var\s+contentOwnerId\s*=\s*'([^']+)",
+ r'By:\s*<a href="/community/profile\.php\?user=([^"]+)'],
+ webpage, 'uploader', fatal=False)
video_description = self._html_search_regex(
- r'<p class="fieldsDesc">([^<]+)', webpage, 'description', fatal=False)
+ r'<p class="fieldsDesc">([^<]+)',
+ webpage, 'description', fatal=False)
duration = parse_duration(self._html_search_regex(
- r'<span class="bold">Runtime:</span> ([^<]+)</p>', webpage, 'duration', fatal=False))
- view_count = self._html_search_regex(
- r'<span class="bold">Views:</span> ([\d,\.]+)</p>', webpage, 'view count', fatal=False)
- if view_count:
- view_count = str_to_int(view_count)
- comment_count = self._html_search_regex(
- r'<div id="commentBar">([\d,\.]+) Comments</div>', webpage, 'comment count', fatal=False)
- if comment_count:
- comment_count = str_to_int(comment_count)
-
- player_quality_option = json.loads(self._html_search_regex(
- r'playerQualityOption = ({.+?});', webpage, 'player quality option'))
-
- QUALITIES = ['3gp', 'mp4_normal', 'mp4_high', 'flv', 'mp4_ultra', 'mp4_720', 'mp4_1080']
- formats = [
- {
- 'url': furl,
+ r'<span class="bold">Runtime:</span> ([^<]+)</p>',
+ webpage, 'duration', fatal=False))
+ view_count = str_to_int(self._html_search_regex(
+ r'<span class="bold">Views:</span> ([\d,\.]+)</p>',
+ webpage, 'view count', fatal=False))
+ comment_count = str_to_int(self._html_search_regex(
+ r'<div id="commentBar">([\d,\.]+) Comments</div>',
+ webpage, 'comment count', fatal=False))
+
+ formats = []
+ for format_id, video_url in re.findall(
+ r'flashvars\.quality_(.+?)\s*=\s*"([^"]+)"', webpage):
+ fmt = {
+ 'url': compat_urllib_parse.unquote(video_url),
'format_id': format_id,
- 'preference': QUALITIES.index(format_id) if format_id in QUALITIES else -1,
- } for format_id, furl in player_quality_option.items()
- ]
+ }
+ m = re.search(r'^(?P<height>\d+)[pP]', format_id)
+ if m:
+ fmt['height'] = int(m.group('height'))
+ formats.append(fmt)
+
+ if not formats:
+ video_url = compat_urllib_parse.unquote(self._search_regex(
+ r'flashvars\.video_url\s*=\s*"([^"]+)"',
+ webpage, 'video URL'))
+ formats.append({'url': video_url})
+
self._sort_formats(formats)
return {
@@ -85,6 +95,7 @@ class XTubeUserIE(InfoExtractor):
'url': 'http://www.xtube.com/community/profile.php?user=greenshowers',
'info_dict': {
'id': 'greenshowers',
+ 'age_limit': 18,
},
'playlist_mincount': 155,
}
@@ -97,7 +108,7 @@ class XTubeUserIE(InfoExtractor):
url, username, note='Retrieving profile page')
video_count = int(self._search_regex(
- r'<strong>%s\'s Videos \(([0-9]+)\)</strong>'%username, profile_page,
+ r'<strong>%s\'s Videos \(([0-9]+)\)</strong>' % username, profile_page,
'video count'))
PAGE_SIZE = 25
@@ -114,6 +125,7 @@ class XTubeUserIE(InfoExtractor):
return {
'_type': 'playlist',
'id': username,
+ 'age_limit': 18,
'entries': [{
'_type': 'url',
'url': eurl,
diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py
index 7e0044824..2a45dc574 100644
--- a/youtube_dl/extractor/xvideos.py
+++ b/youtube_dl/extractor/xvideos.py
@@ -3,15 +3,17 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
- ExtractorError,
+)
+from ..utils import (
clean_html,
+ ExtractorError,
)
class XVideosIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
+ _VALID_URL = r'https?://(?:www\.)?xvideos\.com/video(?P<id>[0-9]+)(?:.*)'
_TEST = {
'url': 'http://www.xvideos.com/video4588838/biker_takes_his_girl',
'md5': '4b46ae6ea5e6e9086e714d883313c0c9',
@@ -24,37 +26,25 @@ class XVideosIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group(1)
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- self.report_extraction(video_id)
-
mobj = re.search(r'<h1 class="inlineError">(.+?)</h1>', webpage)
if mobj:
raise ExtractorError('%s said: %s' % (self.IE_NAME, clean_html(mobj.group(1))), expected=True)
- # Extract video URL
video_url = compat_urllib_parse.unquote(
self._search_regex(r'flv_url=(.+?)&', webpage, 'video URL'))
-
- # Extract title
video_title = self._html_search_regex(
r'<title>(.*?)\s+-\s+XVID', webpage, 'title')
-
- # Extract video thumbnail
video_thumbnail = self._search_regex(
r'url_bigthumb=(.+?)&amp', webpage, 'thumbnail', fatal=False)
return {
'id': video_id,
'url': video_url,
- 'uploader': None,
- 'upload_date': None,
'title': video_title,
'ext': 'flv',
'thumbnail': video_thumbnail,
- 'description': None,
'age_limit': 18,
}
diff --git a/youtube_dl/extractor/xxxymovies.py b/youtube_dl/extractor/xxxymovies.py
new file mode 100644
index 000000000..5c8f17eb2
--- /dev/null
+++ b/youtube_dl/extractor/xxxymovies.py
@@ -0,0 +1,81 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_duration,
+ int_or_none,
+)
+
+
+class XXXYMoviesIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?xxxymovies\.com/videos/(?P<id>\d+)/(?P<display_id>[^/]+)'
+ _TEST = {
+ 'url': 'http://xxxymovies.com/videos/138669/ecstatic-orgasm-sofcore/',
+ 'md5': '810b1bdbbffff89dd13bdb369fe7be4b',
+ 'info_dict': {
+ 'id': '138669',
+ 'display_id': 'ecstatic-orgasm-sofcore',
+ 'ext': 'mp4',
+ 'title': 'Ecstatic Orgasm Sofcore',
+ 'duration': 931,
+ 'categories': list,
+ 'view_count': int,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'age_limit': 18,
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ display_id = mobj.group('display_id')
+
+ webpage = self._download_webpage(url, display_id)
+
+ video_url = self._search_regex(
+ r"video_url\s*:\s*'([^']+)'", webpage, 'video URL')
+
+ title = self._html_search_regex(
+ [r'<div class="block_header">\s*<h1>([^<]+)</h1>',
+ r'<title>(.*?)\s*-\s*XXXYMovies\.com</title>'],
+ webpage, 'title')
+
+ thumbnail = self._search_regex(
+ r"preview_url\s*:\s*'([^']+)'",
+ webpage, 'thumbnail', fatal=False)
+
+ categories = self._html_search_meta(
+ 'keywords', webpage, 'categories', default='').split(',')
+
+ duration = parse_duration(self._search_regex(
+ r'<span>Duration:</span>\s*(\d+:\d+)',
+ webpage, 'duration', fatal=False))
+
+ view_count = int_or_none(self._html_search_regex(
+ r'<div class="video_views">\s*(\d+)',
+ webpage, 'view count', fatal=False))
+ like_count = int_or_none(self._search_regex(
+ r'>\s*Likes? <b>\((\d+)\)',
+ webpage, 'like count', fatal=False))
+ dislike_count = int_or_none(self._search_regex(
+ r'>\s*Dislike <b>\((\d+)\)</b>',
+ webpage, 'dislike count', fatal=False))
+
+ age_limit = self._rta_search(webpage)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'url': video_url,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'categories': categories,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'like_count': like_count,
+ 'dislike_count': dislike_count,
+ 'age_limit': age_limit,
+ }
diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py
index 117f0856a..f8e7041a0 100644
--- a/youtube_dl/extractor/yahoo.py
+++ b/youtube_dl/extractor/yahoo.py
@@ -6,11 +6,14 @@ import json
import re
from .common import InfoExtractor, SearchInfoExtractor
-from ..utils import (
- ExtractorError,
+from ..compat import (
compat_urllib_parse,
compat_urlparse,
+)
+from ..utils import (
clean_html,
+ unescapeHTML,
+ ExtractorError,
int_or_none,
)
@@ -53,14 +56,14 @@ class YahooIE(InfoExtractor):
}
},
{
- 'url': 'https://tw.screen.yahoo.com/taipei-opinion-poll/選情站報-街頭民調-台北市篇-102823042.html',
- 'md5': '92a7fdd8a08783c68a174d7aa067dde8',
+ 'url': 'https://tw.screen.yahoo.com/election-2014-askmayor/敢問市長-黃秀霜批賴清德-非常高傲-033009720.html',
+ 'md5': '3a09cf59349cfaddae1797acc3c087fc',
'info_dict': {
- 'id': '7a23b569-7bea-36cb-85b9-bd5301a0a1fb',
+ 'id': 'cac903b3-fcf4-3c14-b632-643ab541712f',
'ext': 'mp4',
- 'title': '選情站報 街頭民調 台北市篇',
- 'description': '選情站報 街頭民調 台北市篇',
- 'duration': 429,
+ 'title': '敢問市長/黃秀霜批賴清德「非常高傲」',
+ 'description': '直言台南沒捷運 交通居五都之末',
+ 'duration': 396,
}
},
{
@@ -85,14 +88,14 @@ class YahooIE(InfoExtractor):
'duration': 121,
}
}, {
- 'url': 'https://ca.finance.yahoo.com/news/20-most-valuable-brands-world-112600775.html',
- 'md5': '3e401e4eed6325aa29d9b96125fd5b4f',
+ 'url': 'https://ca.finance.yahoo.com/news/hackers-sony-more-trouble-well-154609075.html',
+ 'md5': '226a895aae7e21b0129e2a2006fe9690',
'info_dict': {
- 'id': 'c1b4c09c-8ed8-3b65-8b05-169c55358a83',
+ 'id': 'e624c4bc-3389-34de-9dfc-025f74943409',
'ext': 'mp4',
- 'title': "Apple Is The World's Most Valuable Brand",
- 'description': 'md5:73eabc1a11c6f59752593b2ceefa1262',
- 'duration': 21,
+ 'title': '\'The Interview\' TV Spot: War',
+ 'description': 'The Interview',
+ 'duration': 30,
}
}, {
'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
@@ -115,6 +118,16 @@ class YahooIE(InfoExtractor):
'duration': 201,
}
}, {
+ 'url': 'https://www.yahoo.com/movies/v/true-story-trailer-173000497.html',
+ 'md5': '989396ae73d20c6f057746fb226aa215',
+ 'info_dict': {
+ 'id': '071c4013-ce30-3a93-a5b2-e0413cd4a9d1',
+ 'ext': 'mp4',
+ 'title': '\'True Story\' Trailer',
+ 'description': 'True Story',
+ 'duration': 150,
+ },
+ }, {
'url': 'https://gma.yahoo.com/pizza-delivery-man-surprised-huge-tip-college-kids-195200785.html',
'only_matching': True,
}
@@ -123,6 +136,7 @@ class YahooIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
display_id = mobj.group('display_id')
+ page_id = mobj.group('id')
url = mobj.group('url')
host = mobj.group('host')
webpage = self._download_webpage(url, display_id)
@@ -147,6 +161,7 @@ class YahooIE(InfoExtractor):
r'YUI\.namespace\("Media"\)\.CONTENT_ID\s*=\s*"([^"]+)"',
r'root\.App\.Cache\.context\.videoCache\.curVideo = \{"([^"]+)"',
r'"first_videoid"\s*:\s*"([^"]+)"',
+ r'%s[^}]*"ccm_id"\s*:\s*"([^"]+)"' % re.escape(page_id),
]
video_id = self._search_regex(CONTENT_ID_REGEXES, webpage, 'content ID')
else:
@@ -161,17 +176,15 @@ class YahooIE(InfoExtractor):
region = self._search_regex(
r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"',
webpage, 'region', fatal=False, default='US')
- query = ('SELECT * FROM yahoo.media.video.streams WHERE id="%s"'
- ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="%s"'
- ' AND protocol="http"' % (video_id, region))
data = compat_urllib_parse.urlencode({
- 'q': query,
- 'env': 'prod',
- 'format': 'json',
+ 'protocol': 'http',
+ 'region': region,
})
+ query_url = (
+ 'https://video.media.yql.yahoo.com/v1/video/sapi/streams/'
+ '{id}?{data}'.format(id=video_id, data=data))
query_result = self._download_json(
- 'http://video.query.yahoo.com/v1/public/yql?' + data,
- display_id, 'Downloading video info')
+ query_url, display_id, 'Downloading video info')
info = query_result['query']['results']['mediaObj'][0]
meta = info.get('meta')
@@ -209,7 +222,7 @@ class YahooIE(InfoExtractor):
return {
'id': video_id,
'display_id': display_id,
- 'title': meta['title'],
+ 'title': unescapeHTML(meta['title']),
'formats': formats,
'description': clean_html(meta['description']),
'thumbnail': meta['thumbnail'] if meta.get('thumbnail') else self._og_search_thumbnail(webpage),
@@ -229,7 +242,7 @@ class YahooSearchIE(SearchInfoExtractor):
for pagenum in itertools.count(0):
result_url = 'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
info = self._download_json(result_url, query,
- note='Downloading results page '+str(pagenum+1))
+ note='Downloading results page ' + str(pagenum + 1))
m = info['m']
results = info['results']
diff --git a/youtube_dl/extractor/yesjapan.py b/youtube_dl/extractor/yesjapan.py
new file mode 100644
index 000000000..112a6c030
--- /dev/null
+++ b/youtube_dl/extractor/yesjapan.py
@@ -0,0 +1,62 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ HEADRequest,
+ get_element_by_attribute,
+ parse_iso8601,
+)
+
+
+class YesJapanIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?yesjapan\.com/video/(?P<slug>[A-Za-z0-9\-]*)_(?P<id>[A-Za-z0-9]+)\.html'
+ _TEST = {
+ 'url': 'http://www.yesjapan.com/video/japanese-in-5-20-wa-and-ga-particle-usages_726497834.html',
+ 'md5': 'f0be416314e5be21a12b499b330c21cf',
+ 'info_dict': {
+ 'id': '726497834',
+ 'title': 'Japanese in 5! #20 - WA And GA Particle Usages',
+ 'description': 'This should clear up some issues most students of Japanese encounter with WA and GA....',
+ 'ext': 'mp4',
+ 'timestamp': 1416391590,
+ 'upload_date': '20141119',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+ title = self._og_search_title(webpage)
+ video_url = self._og_search_video_url(webpage)
+ description = self._og_search_description(webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
+
+ timestamp = None
+ submit_info = get_element_by_attribute('class', 'pm-submit-data', webpage)
+ if submit_info:
+ timestamp = parse_iso8601(self._search_regex(
+ r'datetime="([^"]+)"', submit_info, 'upload date', fatal=False, default=None))
+
+ # attempt to resolve the final URL in order to get a proper extension
+ redirect_req = HEADRequest(video_url)
+ req = self._request_webpage(
+ redirect_req, video_id, note='Resolving final URL', errnote='Could not resolve final URL', fatal=False)
+ if req:
+ video_url = req.geturl()
+
+ formats = [{
+ 'format_id': 'sd',
+ 'url': video_url,
+ }]
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'description': description,
+ 'timestamp': timestamp,
+ 'thumbnail': thumbnail,
+ }
diff --git a/youtube_dl/extractor/ynet.py b/youtube_dl/extractor/ynet.py
index 944d7da38..894678a23 100644
--- a/youtube_dl/extractor/ynet.py
+++ b/youtube_dl/extractor/ynet.py
@@ -5,7 +5,7 @@ import re
import json
from .common import InfoExtractor
-from ..utils import compat_urllib_parse
+from ..compat import compat_urllib_parse
class YnetIE(InfoExtractor):
@@ -13,7 +13,6 @@ class YnetIE(InfoExtractor):
_TESTS = [
{
'url': 'http://hot.ynet.co.il/home/0,7340,L-11659-99244,00.html',
- 'md5': '4b29cb57c3dddd57642b3f051f535b07',
'info_dict': {
'id': 'L-11659-99244',
'ext': 'flv',
@@ -22,7 +21,6 @@ class YnetIE(InfoExtractor):
}
}, {
'url': 'http://hot.ynet.co.il/home/0,7340,L-8859-84418,00.html',
- 'md5': '8194c2ea221e9a639cac96b6b0753dc5',
'info_dict': {
'id': 'L-8859-84418',
'ext': 'flv',
@@ -49,4 +47,4 @@ class YnetIE(InfoExtractor):
'title': title,
'formats': self._extract_f4m_formats(f4m_url, video_id),
'thumbnail': self._og_search_thumbnail(webpage),
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py
index b86331e3c..c642075dc 100644
--- a/youtube_dl/extractor/youjizz.py
+++ b/youtube_dl/extractor/youjizz.py
@@ -9,40 +9,30 @@ from ..utils import (
class YouJizzIE(InfoExtractor):
- _VALID_URL = r'^https?://(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+)\.html$'
+ _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/[^/#?]+-(?P<id>[0-9]+)\.html(?:$|[?#])'
_TEST = {
'url': 'http://www.youjizz.com/videos/zeichentrick-1-2189178.html',
- 'file': '2189178.flv',
'md5': '07e15fa469ba384c7693fd246905547c',
'info_dict': {
+ 'id': '2189178',
+ 'ext': 'flv',
"title": "Zeichentrick 1",
"age_limit": 18,
}
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
-
- video_id = mobj.group('videoid')
-
- # Get webpage content
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
-
age_limit = self._rta_search(webpage)
-
- # Get the video title
- video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
- webpage, 'title').strip()
-
- # Get the embed page
- result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
- if result is None:
- raise ExtractorError('ERROR: unable to extract embed page')
-
- embed_page_url = result.group(0).strip()
- video_id = result.group('videoid')
-
- webpage = self._download_webpage(embed_page_url, video_id)
+ video_title = self._html_search_regex(
+ r'<title>\s*(.*)\s*</title>', webpage, 'title')
+
+ embed_page_url = self._search_regex(
+ r'(https?://www.youjizz.com/videos/embed/[0-9]+)',
+ webpage, 'embed page')
+ webpage = self._download_webpage(
+ embed_page_url, video_id, note='downloading embed page')
# Get the video URL
m_playlist = re.search(r'so.addVariable\("playlist", ?"(?P<playlist>.+?)"\);', webpage)
diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py
index 48d47a245..97b98bbe8 100644
--- a/youtube_dl/extractor/youku.py
+++ b/youtube_dl/extractor/youku.py
@@ -35,21 +35,21 @@ class YoukuIE(InfoExtractor):
def _gen_sid(self):
nowTime = int(time.time() * 1000)
- random1 = random.randint(1000,1998)
- random2 = random.randint(1000,9999)
+ random1 = random.randint(1000, 1998)
+ random2 = random.randint(1000, 9999)
- return "%d%d%d" %(nowTime,random1,random2)
+ return "%d%d%d" % (nowTime, random1, random2)
def _get_file_ID_mix_string(self, seed):
mixed = []
source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
seed = float(seed)
for i in range(len(source)):
- seed = (seed * 211 + 30031) % 65536
- index = math.floor(seed / 65536 * len(source))
+ seed = (seed * 211 + 30031) % 65536
+ index = math.floor(seed / 65536 * len(source))
mixed.append(source[int(index)])
source.remove(source[int(index)])
- #return ''.join(mixed)
+ # return ''.join(mixed)
return mixed
def _get_file_id(self, fileId, seed):
@@ -74,7 +74,7 @@ class YoukuIE(InfoExtractor):
# -8 means blocked outside China.
error = config['data'][0].get('error') # Chinese and English, separated by newline.
raise ExtractorError(error or 'Server reported error %i' % error_code,
- expected=True)
+ expected=True)
video_title = config['data'][0]['title']
seed = config['data'][0]['seed']
@@ -100,12 +100,12 @@ class YoukuIE(InfoExtractor):
keys = [s['k'] for s in config['data'][0]['segs'][format]]
# segs is usually a dictionary, but an empty *list* if an error occured.
- files_info=[]
+ files_info = []
sid = self._gen_sid()
fileid = self._get_file_id(fileid, seed)
- #column 8,9 of fileid represent the segment number
- #fileid[7:9] should be changed
+ # column 8,9 of fileid represent the segment number
+ # fileid[7:9] should be changed
for index, key in enumerate(keys):
temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
download_url = 'http://k.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py
index 7bfda45e7..107c9ac36 100644
--- a/youtube_dl/extractor/youporn.py
+++ b/youtube_dl/extractor/youporn.py
@@ -6,10 +6,11 @@ import re
import sys
from .common import InfoExtractor
-from ..utils import (
+from ..compat import (
compat_urllib_parse_urlparse,
compat_urllib_request,
-
+)
+from ..utils import (
ExtractorError,
unescapeHTML,
unified_strdate,
@@ -45,11 +46,13 @@ class YouPornIE(InfoExtractor):
age_limit = self._rta_search(webpage)
# Get JSON parameters
- json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, 'JSON parameters')
+ json_params = self._search_regex(
+ r'var currentVideo = new Video\((.*)\)[,;]',
+ webpage, 'JSON parameters')
try:
params = json.loads(json_params)
except:
- raise ExtractorError(u'Invalid JSON')
+ raise ExtractorError('Invalid JSON')
self.report_extraction(video_id)
try:
@@ -64,7 +67,7 @@ class YouPornIE(InfoExtractor):
# Get all of the links from the page
DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
- webpage, 'download list').strip()
+ webpage, 'download list').strip()
LINK_RE = r'<a href="([^"]+)">'
links = re.findall(LINK_RE, download_list_html)
@@ -73,7 +76,7 @@ class YouPornIE(InfoExtractor):
for encrypted_link in encrypted_links:
link = aes_decrypt_text(encrypted_link, video_title, 32).decode('utf-8')
links.append(link)
-
+
formats = []
for link in links:
# A link looks like this:
@@ -103,8 +106,8 @@ class YouPornIE(InfoExtractor):
self._sort_formats(formats)
if not formats:
- raise ExtractorError(u'ERROR: no known formats available for video')
-
+ raise ExtractorError('ERROR: no known formats available for video')
+
return {
'id': video_id,
'uploader': video_uploader,
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index cfae2de89..b7b91f354 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -7,47 +7,48 @@ import itertools
import json
import os.path
import re
+import time
import traceback
from .common import InfoExtractor, SearchInfoExtractor
from .subtitles import SubtitlesInfoExtractor
from ..jsinterp import JSInterpreter
from ..swfinterp import SWFInterpreter
-from ..utils import (
+from ..compat import (
compat_chr,
compat_parse_qs,
compat_urllib_parse,
compat_urllib_request,
compat_urlparse,
compat_str,
-
+)
+from ..utils import (
clean_html,
- get_element_by_id,
- get_element_by_attribute,
ExtractorError,
+ get_element_by_attribute,
+ get_element_by_id,
int_or_none,
OnDemandPagedList,
+ orderedSet,
unescapeHTML,
unified_strdate,
- orderedSet,
uppercase_escape,
)
+
class YoutubeBaseInfoExtractor(InfoExtractor):
"""Provide base functions for Youtube extractors"""
_LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
_TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
- _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
- _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
_NETRC_MACHINE = 'youtube'
# If True it will raise an error if no login info is provided
_LOGIN_REQUIRED = False
def _set_language(self):
- return bool(self._download_webpage(
- self._LANG_URL, None,
- note='Setting language', errnote='unable to set language',
- fatal=False))
+ self._set_cookie(
+ '.youtube.com', 'PREF', 'f1=50000000&hl=en',
+ # YouTube sets the expire time to about two months
+ expire_time=time.time() + 2 * 30 * 24 * 3600)
def _login(self):
"""
@@ -76,30 +77,30 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
# Log in
login_form_strs = {
- 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
- 'Email': username,
- 'GALX': galx,
- 'Passwd': password,
-
- 'PersistentCookie': 'yes',
- '_utf8': '霱',
- 'bgresponse': 'js_disabled',
- 'checkConnection': '',
- 'checkedDomains': 'youtube',
- 'dnConn': '',
- 'pstMsg': '0',
- 'rmShown': '1',
- 'secTok': '',
- 'signIn': 'Sign in',
- 'timeStmp': '',
- 'service': 'youtube',
- 'uilel': '3',
- 'hl': 'en_US',
+ 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
+ 'Email': username,
+ 'GALX': galx,
+ 'Passwd': password,
+
+ 'PersistentCookie': 'yes',
+ '_utf8': '霱',
+ 'bgresponse': 'js_disabled',
+ 'checkConnection': '',
+ 'checkedDomains': 'youtube',
+ 'dnConn': '',
+ 'pstMsg': '0',
+ 'rmShown': '1',
+ 'secTok': '',
+ 'signIn': 'Sign in',
+ 'timeStmp': '',
+ 'service': 'youtube',
+ 'uilel': '3',
+ 'hl': 'en_US',
}
# Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
# chokes on unicode
- login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
+ login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
@@ -149,7 +150,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
'service': 'youtube',
'hl': 'en_US',
}
- tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in tfa_form_strs.items())
+ tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')
tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
@@ -175,27 +176,12 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
return False
return True
- def _confirm_age(self):
- age_form = {
- 'next_url': '/',
- 'action_confirm': 'Confirm',
- }
- req = compat_urllib_request.Request(self._AGE_URL,
- compat_urllib_parse.urlencode(age_form).encode('ascii'))
-
- self._download_webpage(
- req, None,
- note='Confirming age', errnote='Unable to confirm age')
- return True
-
def _real_initialize(self):
if self._downloader is None:
return
- if not self._set_language():
- return
+ self._set_language()
if not self._login():
return
- self._confirm_age()
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
@@ -270,14 +256,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
- '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+ '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
'160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+ '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
+ '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
+ '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
# Dash mp4 audio
- '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
- '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
- '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
+ '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
+ '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
+ '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
# Dash webm
'167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
@@ -296,11 +285,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+ '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
+ '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
+ '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
+ '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
+ '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
# Dash webm audio
'171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
'172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
+ # Dash webm audio with opus inside
+ '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
+ '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
+ '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
+
# RTMP (unnamed)
'_rtmp': {'protocol': 'rtmp'},
}
@@ -384,8 +383,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'info_dict': {
'id': 'IB3lcPjvWLA',
'ext': 'm4a',
- 'title': 'Afrojack - The Spark ft. Spree Wilson',
- 'description': 'md5:9717375db5a9a3992be4668bbf3bc0a8',
+ 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
+ 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d',
'uploader': 'AfrojackVEVO',
'uploader_id': 'AfrojackVEVO',
'upload_date': '20131011',
@@ -395,6 +394,108 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'format': '141',
},
},
+ # JS player signature function name containing $
+ {
+ 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
+ 'info_dict': {
+ 'id': 'nfWlot6h_JM',
+ 'ext': 'm4a',
+ 'title': 'Taylor Swift - Shake It Off',
+ 'description': 'md5:2acfda1b285bdd478ccec22f9918199d',
+ 'uploader': 'TaylorSwiftVEVO',
+ 'uploader_id': 'TaylorSwiftVEVO',
+ 'upload_date': '20140818',
+ },
+ 'params': {
+ 'youtube_include_dash_manifest': True,
+ 'format': '141',
+ },
+ },
+ # Controversy video
+ {
+ 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
+ 'info_dict': {
+ 'id': 'T4XJQO3qol8',
+ 'ext': 'mp4',
+ 'upload_date': '20100909',
+ 'uploader': 'The Amazing Atheist',
+ 'uploader_id': 'TheAmazingAtheist',
+ 'title': 'Burning Everyone\'s Koran',
+ 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
+ }
+ },
+ # Normal age-gate video (No vevo, embed allowed)
+ {
+ 'url': 'http://youtube.com/watch?v=HtVdAasjOgU',
+ 'info_dict': {
+ 'id': 'HtVdAasjOgU',
+ 'ext': 'mp4',
+ 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
+ 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
+ 'uploader': 'The Witcher',
+ 'uploader_id': 'WitcherGame',
+ 'upload_date': '20140605',
+ },
+ },
+ # Age-gate video with encrypted signature
+ {
+ 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU',
+ 'info_dict': {
+ 'id': '6kLq3WMV1nU',
+ 'ext': 'mp4',
+ 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
+ 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
+ 'uploader': 'LloydVEVO',
+ 'uploader_id': 'LloydVEVO',
+ 'upload_date': '20110629',
+ },
+ },
+ # video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
+ {
+ 'url': '__2ABJjxzNo',
+ 'info_dict': {
+ 'id': '__2ABJjxzNo',
+ 'ext': 'mp4',
+ 'upload_date': '20100430',
+ 'uploader_id': 'deadmau5',
+ 'description': 'md5:12c56784b8032162bb936a5f76d55360',
+ 'uploader': 'deadmau5',
+ 'title': 'Deadmau5 - Some Chords (HD)',
+ },
+ 'expected_warnings': [
+ 'DASH manifest missing',
+ ]
+ },
+ # Olympics (https://github.com/rg3/youtube-dl/issues/4431)
+ {
+ 'url': 'lqQg6PlCWgI',
+ 'info_dict': {
+ 'id': 'lqQg6PlCWgI',
+ 'ext': 'mp4',
+ 'upload_date': '20120731',
+ 'uploader_id': 'olympic',
+ 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
+ 'uploader': 'Olympics',
+ 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
+ },
+ 'params': {
+ 'skip_download': 'requires avconv',
+ }
+ },
+ # Non-square pixels
+ {
+ 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
+ 'info_dict': {
+ 'id': '_b-2C3KPAM0',
+ 'ext': 'mp4',
+ 'stretched_ratio': 16 / 9.,
+ 'upload_date': '20110310',
+ 'uploader_id': 'AllenMeow',
+ 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
+ 'uploader': '孫艾倫',
+ 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
+ },
+ }
]
def __init__(self, *args, **kwargs):
@@ -423,7 +524,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
def _extract_signature_function(self, video_id, player_url, example_sig):
id_m = re.match(
- r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
+ r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
player_url)
if not id_m:
raise ExtractorError('Cannot identify player %r' % player_url)
@@ -467,13 +568,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
def gen_sig_code(idxs):
def _genslice(start, end, step):
starts = '' if start == 0 else str(start)
- ends = (':%d' % (end+step)) if end + step >= 0 else ':'
+ ends = (':%d' % (end + step)) if end + step >= 0 else ':'
steps = '' if step == 1 else (':%d' % step)
return 's[%s%s%s]' % (starts, ends, steps)
step = None
- start = '(Never used)' # Quelch pyflakes warnings - start will be
- # set as soon as step is set
+ # Quell pyflakes warnings - start will be set when step is set
+ start = '(Never used)'
for i, prev in zip(idxs[1:], idxs[:-1]):
if step is not None:
if i - prev == step:
@@ -504,8 +605,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
def _parse_sig_js(self, jscode):
funcname = self._search_regex(
- r'signature=([$a-zA-Z]+)', jscode,
- 'Initial JS player signature function name')
+ r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode,
+ 'Initial JS player signature function name')
jsi = JSInterpreter(jscode)
initial_function = jsi.extract_function(funcname)
@@ -544,24 +645,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
def _get_available_subtitles(self, video_id, webpage):
try:
- sub_list = self._download_webpage(
+ subs_doc = self._download_xml(
'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
video_id, note=False)
except ExtractorError as err:
self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
return {}
- lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
sub_lang_list = {}
- for l in lang_list:
- lang = l[1]
+ for track in subs_doc.findall('track'):
+ lang = track.attrib['lang_code']
if lang in sub_lang_list:
continue
params = compat_urllib_parse.urlencode({
'lang': lang,
'v': video_id,
'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
- 'name': unescapeHTML(l[0]).encode('utf-8'),
+ 'name': track.attrib['name'].encode('utf-8'),
})
url = 'https://www.youtube.com/api/timedtext?' + params
sub_lang_list[lang] = url
@@ -582,9 +682,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
return {}
player_config = json.loads(mobj.group(1))
try:
- args = player_config[u'args']
- caption_url = args[u'ttsurl']
- timestamp = args[u'timestamp']
+ args = player_config['args']
+ caption_url = args['ttsurl']
+ timestamp = args['timestamp']
# We get the available subtitles
list_params = compat_urllib_parse.urlencode({
'type': 'list',
@@ -594,10 +694,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
list_url = caption_url + '&' + list_params
caption_list = self._download_xml(list_url, video_id)
original_lang_node = caption_list.find('track')
- if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
+ if original_lang_node is None:
self._downloader.report_warning('Video doesn\'t have automatic captions')
return {}
original_lang = original_lang_node.attrib['lang_code']
+ caption_kind = original_lang_node.attrib.get('kind', '')
sub_lang_list = {}
for lang_node in caption_list.findall('target'):
@@ -607,7 +708,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'tlang': sub_lang,
'fmt': sub_format,
'ts': timestamp,
- 'kind': 'asr',
+ 'kind': caption_kind,
})
sub_lang_list[sub_lang] = caption_url + '&' + params
return sub_lang_list
@@ -627,10 +728,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
def _extract_from_m3u8(self, manifest_url, video_id):
url_map = {}
+
def _get_urls(_manifest):
lines = _manifest.split('\n')
urls = filter(lambda l: l and not l.startswith('#'),
- lines)
+ lines)
return urls
manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
formats_urls = _get_urls(manifest)
@@ -643,6 +745,47 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
+ def _parse_dash_manifest(
+ self, video_id, dash_manifest_url, player_url, age_gate):
+ def decrypt_sig(mobj):
+ s = mobj.group(1)
+ dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
+ return '/signature/%s' % dec_s
+ dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
+ dash_doc = self._download_xml(
+ dash_manifest_url, video_id,
+ note='Downloading DASH manifest',
+ errnote='Could not download DASH manifest')
+
+ formats = []
+ for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
+ url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
+ if url_el is None:
+ continue
+ format_id = r.attrib['id']
+ video_url = url_el.text
+ filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
+ f = {
+ 'format_id': format_id,
+ 'url': video_url,
+ 'width': int_or_none(r.attrib.get('width')),
+ 'height': int_or_none(r.attrib.get('height')),
+ 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
+ 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
+ 'filesize': filesize,
+ 'fps': int_or_none(r.attrib.get('frameRate')),
+ }
+ try:
+ existing_format = next(
+ fo for fo in formats
+ if fo['format_id'] == format_id)
+ except StopIteration:
+ f.update(self._formats.get(format_id, {}).items())
+ formats.append(f)
+ else:
+ existing_format.update(f)
+ return formats
+
def _real_extract(self, url):
proto = (
'http' if self._downloader.params.get('prefer_insecure', False)
@@ -655,17 +798,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
video_id = self.extract_id(url)
# Get video webpage
- url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
- pref_cookies = [
- c for c in self._downloader.cookiejar
- if c.domain == '.youtube.com' and c.name == 'PREF']
- for pc in pref_cookies:
- if 'hl=' in pc.value:
- pc.value = re.sub(r'hl=[^&]+', 'hl=en', pc.value)
- else:
- if pc.value:
- pc.value += '&'
- pc.value += 'hl=en'
+ url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
video_webpage = self._download_webpage(url, video_id)
# Attempt to extract SWF player URL
@@ -676,34 +809,52 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
player_url = None
# Get video info
- self.report_video_info_webpage_download(video_id)
if re.search(r'player-age-gate-content">', video_webpage) is not None:
- self.report_age_confirmation()
age_gate = True
# We simulate the access to the video from www.youtube.com/v/{video_id}
# this can be viewed without logging into YouTube
+ url = proto + '://www.youtube.com/embed/%s' % video_id
+ embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
data = compat_urllib_parse.urlencode({
'video_id': video_id,
'eurl': 'https://youtube.googleapis.com/v/' + video_id,
'sts': self._search_regex(
- r'"sts"\s*:\s*(\d+)', video_webpage, 'sts'),
+ r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
})
video_info_url = proto + '://www.youtube.com/get_video_info?' + data
- video_info_webpage = self._download_webpage(video_info_url, video_id,
- note=False,
- errnote='unable to download video info webpage')
+ video_info_webpage = self._download_webpage(
+ video_info_url, video_id,
+ note='Refetching age-gated info webpage',
+ errnote='unable to download video info webpage')
video_info = compat_parse_qs(video_info_webpage)
else:
age_gate = False
- for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
- video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
- % (video_id, el_type))
- video_info_webpage = self._download_webpage(video_info_url, video_id,
- note=False,
- errnote='unable to download video info webpage')
- video_info = compat_parse_qs(video_info_webpage)
- if 'token' in video_info:
- break
+ try:
+ # Try looking directly into the video webpage
+ mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
+ if not mobj:
+ raise ValueError('Could not find ytplayer.config') # caught below
+ json_code = uppercase_escape(mobj.group(1))
+ ytplayer_config = json.loads(json_code)
+ args = ytplayer_config['args']
+ # Convert to the same format returned by compat_parse_qs
+ video_info = dict((k, [v]) for k, v in args.items())
+ if 'url_encoded_fmt_stream_map' not in args:
+ raise ValueError('No stream_map present') # caught below
+ except ValueError:
+ # We fall back to the get_video_info pages (used by the embed page)
+ self.report_video_info_webpage_download(video_id)
+ for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
+ video_info_url = (
+ '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
+ % (proto, video_id, el_type))
+ video_info_webpage = self._download_webpage(
+ video_info_url,
+ video_id, note=False,
+ errnote='unable to download video info webpage')
+ video_info = compat_parse_qs(video_info_webpage)
+ if 'token' in video_info:
+ break
if 'token' not in video_info:
if 'reason' in video_info:
raise ExtractorError(
@@ -771,7 +922,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
m_cat_container = self._search_regex(
r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
- video_webpage, 'categories', fatal=False)
+ video_webpage, 'categories', default=None)
if m_cat_container:
category = self._html_search_regex(
r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
@@ -826,33 +977,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
# annotations
video_annotations = None
if self._downloader.params.get('writeannotations', False):
- video_annotations = self._extract_annotations(video_id)
-
- # Decide which formats to download
- try:
- mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
- if not mobj:
- raise ValueError('Could not find vevo ID')
- json_code = uppercase_escape(mobj.group(1))
- ytplayer_config = json.loads(json_code)
- args = ytplayer_config['args']
- # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
- # this signatures are encrypted
- if 'url_encoded_fmt_stream_map' not in args:
- raise ValueError('No stream_map present') # caught below
- re_signature = re.compile(r'[&,]s=')
- m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
- if m_s is not None:
- self.to_screen('%s: Encrypted signatures detected.' % video_id)
- video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
- m_s = re_signature.search(args.get('adaptive_fmts', ''))
- if m_s is not None:
- if 'adaptive_fmts' in video_info:
- video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
- else:
- video_info['adaptive_fmts'] = [args['adaptive_fmts']]
- except ValueError:
- pass
+ video_annotations = self._extract_annotations(video_id)
def _map_to_format_list(urlmap):
formats = []
@@ -875,8 +1000,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'url': video_info['conn'][0],
'player_url': player_url,
}]
- elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
- encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
+ elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:
+ encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
if 'rtmpe%3Dyes' in encoded_url_map:
raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
url_map = {}
@@ -892,11 +1017,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
elif 's' in url_data:
encrypted_sig = url_data['s'][0]
- if not age_gate:
- jsplayer_url_json = self._search_regex(
- r'"assets":.+?"js":\s*("[^"]+")',
- video_webpage, 'JS player URL')
- player_url = json.loads(jsplayer_url_json)
+ jsplayer_url_json = self._search_regex(
+ r'"assets":.+?"js":\s*("[^"]+")',
+ embed_webpage if age_gate else video_webpage, 'JS player URL')
+ player_url = json.loads(jsplayer_url_json)
if player_url is None:
player_url_json = self._search_regex(
r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
@@ -922,7 +1046,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
parts_sizes = self._signature_cache_id(encrypted_sig)
self.to_screen('{%s} signature length %s, %s' %
- (format_id, parts_sizes, player_desc))
+ (format_id, parts_sizes, player_desc))
signature = self._decrypt_signature(
encrypted_sig, video_id, player_url, age_gate)
@@ -940,75 +1064,57 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
# Look for the DASH manifest
if self._downloader.params.get('youtube_include_dash_manifest', True):
- try:
- # The DASH manifest used needs to be the one from the original video_webpage.
- # The one found in get_video_info seems to be using different signatures.
- # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
- # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
- # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
- if age_gate:
- dash_manifest_url = video_info.get('dashmpd')[0]
+ dash_mpd = video_info.get('dashmpd')
+ if dash_mpd:
+ dash_manifest_url = dash_mpd[0]
+ try:
+ dash_formats = self._parse_dash_manifest(
+ video_id, dash_manifest_url, player_url, age_gate)
+ except (ExtractorError, KeyError) as e:
+ self.report_warning(
+ 'Skipping DASH manifest: %r' % e, video_id)
else:
- dash_manifest_url = ytplayer_config['args']['dashmpd']
- def decrypt_sig(mobj):
- s = mobj.group(1)
- dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
- return '/signature/%s' % dec_s
- dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
- dash_doc = self._download_xml(
- dash_manifest_url, video_id,
- note='Downloading DASH manifest',
- errnote='Could not download DASH manifest')
- for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
- url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
- if url_el is None:
- continue
- format_id = r.attrib['id']
- video_url = url_el.text
- filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
- f = {
- 'format_id': format_id,
- 'url': video_url,
- 'width': int_or_none(r.attrib.get('width')),
- 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
- 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
- 'filesize': filesize,
- }
- try:
- existing_format = next(
- fo for fo in formats
- if fo['format_id'] == format_id)
- except StopIteration:
- f.update(self._formats.get(format_id, {}))
- formats.append(f)
- else:
- existing_format.update(f)
-
- except (ExtractorError, KeyError) as e:
- self.report_warning('Skipping DASH manifest: %s' % e, video_id)
+ # Hide the formats we found through non-DASH
+ dash_keys = set(df['format_id'] for df in dash_formats)
+ for f in formats:
+ if f['format_id'] in dash_keys:
+ f['format_id'] = 'nondash-%s' % f['format_id']
+ f['preference'] = f.get('preference', 0) - 10000
+ formats.extend(dash_formats)
+
+ # Check for malformed aspect ratio
+ stretched_m = re.search(
+ r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
+ video_webpage)
+ if stretched_m:
+ ratio = float(stretched_m.group('w')) / float(stretched_m.group('h'))
+ for f in formats:
+ if f.get('vcodec') != 'none':
+ f['stretched_ratio'] = ratio
self._sort_formats(formats)
return {
- 'id': video_id,
- 'uploader': video_uploader,
- 'uploader_id': video_uploader_id,
- 'upload_date': upload_date,
- 'title': video_title,
- 'thumbnail': video_thumbnail,
- 'description': video_description,
- 'categories': video_categories,
- 'subtitles': video_subtitles,
- 'duration': video_duration,
- 'age_limit': 18 if age_gate else 0,
- 'annotations': video_annotations,
+ 'id': video_id,
+ 'uploader': video_uploader,
+ 'uploader_id': video_uploader_id,
+ 'upload_date': upload_date,
+ 'title': video_title,
+ 'thumbnail': video_thumbnail,
+ 'description': video_description,
+ 'categories': video_categories,
+ 'subtitles': video_subtitles,
+ 'duration': video_duration,
+ 'age_limit': 18 if age_gate else 0,
+ 'annotations': video_annotations,
'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
- 'view_count': view_count,
+ 'view_count': view_count,
'like_count': like_count,
'dislike_count': dislike_count,
- 'formats': formats,
+ 'formats': formats,
}
+
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
IE_DESC = 'YouTube.com playlists'
_VALID_URL = r"""(?x)(?:
@@ -1022,7 +1128,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
)
(
(?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
- # Top tracks, they can also include dots
+ # Top tracks, they can also include dots
|(?:MC)[\w\.]*
)
.*
@@ -1030,13 +1136,13 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
)"""
_TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
- _MORE_PAGES_INDICATOR = r'data-link-type="next"'
_VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
IE_NAME = 'youtube:playlist'
_TESTS = [{
'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
'info_dict': {
'title': 'ytdl test PL',
+ 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
},
'playlist_count': 3,
}, {
@@ -1056,7 +1162,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
'note': 'issue #673',
'url': 'PLBB231211A4F62143',
'info_dict': {
- 'title': 'Team Fortress 2 (Class-based LP)',
+ 'title': '[OLD]Team Fortress 2 (Class-based LP)',
},
'playlist_mincount': 26,
}, {
@@ -1086,6 +1192,13 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
'info_dict': {
'title': 'JODA7',
}
+ }, {
+ 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
+ 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
+ 'info_dict': {
+ 'title': 'Uploads from Interstellar Movie',
+ },
+ 'playlist_mincout': 21,
}]
def _real_initialize(self):
@@ -1136,9 +1249,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
if playlist_id.startswith('RD'):
# Mixes require a custom extraction process
return self._extract_mix(playlist_id)
- if playlist_id.startswith('TL'):
- raise ExtractorError('For downloading YouTube.com top lists, use '
- 'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
url = self._TEMPLATE_URL % playlist_id
page = self._download_webpage(url, playlist_id)
@@ -1170,6 +1280,10 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
'Downloading page #%s' % page_num,
transform_source=uppercase_escape)
content_html = more['content_html']
+ if not content_html.strip():
+ # Some webpages show a "Load more" button but they don't
+ # have more videos
+ break
more_widget_html = more['load_more_widget_html']
playlist_title = self._html_search_regex(
@@ -1180,54 +1294,9 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
return self.playlist_result(url_results, playlist_id, playlist_title)
-class YoutubeTopListIE(YoutubePlaylistIE):
- IE_NAME = 'youtube:toplist'
- IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"'
- ' (Example: "yttoplist:music:Top Tracks")')
- _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
- _TESTS = [{
- 'url': 'yttoplist:music:Trending',
- 'playlist_mincount': 5,
- 'skip': 'Only works for logged-in users',
- }]
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- channel = mobj.group('chann')
- title = mobj.group('title')
- query = compat_urllib_parse.urlencode({'title': title})
- channel_page = self._download_webpage(
- 'https://www.youtube.com/%s' % channel, title)
- link = self._html_search_regex(
- r'''(?x)
- <a\s+href="([^"]+)".*?>\s*
- <span\s+class="branded-page-module-title-text">\s*
- <span[^>]*>.*?%s.*?</span>''' % re.escape(query),
- channel_page, 'list')
- url = compat_urlparse.urljoin('https://www.youtube.com/', link)
-
- video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
- ids = []
- # sometimes the webpage doesn't contain the videos
- # retry until we get them
- for i in itertools.count(0):
- msg = 'Downloading Youtube mix'
- if i > 0:
- msg += ', retry #%d' % i
-
- webpage = self._download_webpage(url, title, msg)
- ids = orderedSet(re.findall(video_re, webpage))
- if ids:
- break
- url_results = self._ids_to_results(ids)
- return self.playlist_result(url_results, playlist_title=title)
-
-
class YoutubeChannelIE(InfoExtractor):
IE_DESC = 'YouTube.com channels'
- _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
- _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
- _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
+ _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
IE_NAME = 'youtube:channel'
_TESTS = [{
'note': 'paginated channel',
@@ -1243,13 +1312,8 @@ class YoutubeChannelIE(InfoExtractor):
return ids_in_page
def _real_extract(self, url):
- # Extract channel id
- mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError('Invalid URL: %s' % url)
+ channel_id = self._match_id(url)
- # Download channel page
- channel_id = mobj.group(1)
video_ids = []
url = 'https://www.youtube.com/channel/%s/videos' % channel_id
channel_page = self._download_webpage(url, channel_id)
@@ -1263,30 +1327,39 @@ class YoutubeChannelIE(InfoExtractor):
# The videos are contained in a single page
# the ajax pages can't be used, they are empty
video_ids = self.extract_videos_from_page(channel_page)
- else:
- # Download all channel pages using the json-based channel_ajax query
+ entries = [
+ self.url_result(video_id, 'Youtube', video_id=video_id)
+ for video_id in video_ids]
+ return self.playlist_result(entries, channel_id)
+
+ def _entries():
+ more_widget_html = content_html = channel_page
for pagenum in itertools.count(1):
- url = self._MORE_PAGES_URL % (pagenum, channel_id)
- page = self._download_json(
- url, channel_id, note='Downloading page #%s' % pagenum,
- transform_source=uppercase_escape)
- ids_in_page = self.extract_videos_from_page(page['content_html'])
- video_ids.extend(ids_in_page)
-
- if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
+ ids_in_page = self.extract_videos_from_page(content_html)
+ for video_id in ids_in_page:
+ yield self.url_result(
+ video_id, 'Youtube', video_id=video_id)
+
+ mobj = re.search(
+ r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
+ more_widget_html)
+ if not mobj:
break
- self._downloader.to_screen('[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
+ more = self._download_json(
+ 'https://youtube.com/%s' % mobj.group('more'), channel_id,
+ 'Downloading page #%s' % (pagenum + 1),
+ transform_source=uppercase_escape)
+ content_html = more['content_html']
+ more_widget_html = more['load_more_widget_html']
- url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
- for video_id in video_ids]
- return self.playlist_result(url_entries, channel_id)
+ return self.playlist_result(_entries(), channel_id)
class YoutubeUserIE(InfoExtractor):
IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
- _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
+ _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
_TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
_GDATA_PAGE_SIZE = 50
_GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
@@ -1308,16 +1381,13 @@ class YoutubeUserIE(InfoExtractor):
# Don't return True if the url can be extracted with another youtube
# extractor: the regex is too permissive and would match those URLs too.
other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
- if any(ie.suitable(url) for ie in other_ies): return False
- else: return super(YoutubeUserIE, cls).suitable(url)
+ if any(ie.suitable(url) for ie in other_ies):
+ return False
+ else:
+ return super(YoutubeUserIE, cls).suitable(url)
def _real_extract(self, url):
- # Extract username
- mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError('Invalid URL: %s' % url)
-
- username = mobj.group(1)
+ username = self._match_id(url)
# Download video ids using YouTube Data API. Result size per
# query is limited (currently to 50 videos) so we need to query
@@ -1514,9 +1584,11 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
feed_entries = []
paging = 0
for i in itertools.count(1):
- info = self._download_json(self._FEED_TEMPLATE % paging,
- '%s feed' % self._FEED_NAME,
- 'Downloading page %s' % i)
+ info = self._download_json(
+ self._FEED_TEMPLATE % paging,
+ '%s feed' % self._FEED_NAME,
+ 'Downloading page %s' % i,
+ transform_source=uppercase_escape)
feed_html = info.get('feed_html') or info.get('content_html')
load_more_widget_html = info.get('load_more_widget_html') or feed_html
m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
@@ -1532,29 +1604,33 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
paging = mobj.group('paging')
return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
+
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
- IE_DESC = 'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
+ IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
_VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
_FEED_NAME = 'recommended'
_PLAYLIST_TITLE = 'Youtube Recommended videos'
+
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
- IE_DESC = 'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
+ IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
_VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
_FEED_NAME = 'watch_later'
_PLAYLIST_TITLE = 'Youtube Watch Later'
_PERSONAL_FEED = True
+
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
- IE_DESC = 'Youtube watch history, "ythistory" keyword (requires authentication)'
+ IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
_VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory'
_FEED_NAME = 'history'
_PERSONAL_FEED = True
_PLAYLIST_TITLE = 'Youtube Watch History'
+
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
IE_NAME = 'youtube:favorites'
- IE_DESC = 'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
+ IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
_VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
_LOGIN_REQUIRED = True
@@ -1606,11 +1682,17 @@ class YoutubeTruncatedURLIE(InfoExtractor):
IE_NAME = 'youtube:truncated_url'
IE_DESC = False # Do not list
_VALID_URL = r'''(?x)
- (?:https?://)?[^/]+/watch\?(?:
+ (?:https?://)?
+ (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
+ (?:watch\?(?:
feature=[a-z_]+|
- annotation_id=annotation_[^&]+
- )?$|
- (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
+ annotation_id=annotation_[^&]+|
+ x-yt-cl=[0-9]+|
+ )?
+ |
+ attribution_link\?a=[^&]+
+ )
+ $
'''
_TESTS = [{
@@ -1619,6 +1701,12 @@ class YoutubeTruncatedURLIE(InfoExtractor):
}, {
'url': 'http://www.youtube.com/watch?',
'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/watch?feature=foo',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -1629,3 +1717,20 @@ class YoutubeTruncatedURLIE(InfoExtractor):
'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
' or simply youtube-dl BaW_jenozKc .',
expected=True)
+
+
+class YoutubeTruncatedIDIE(InfoExtractor):
+ IE_NAME = 'youtube:truncated_id'
+ IE_DESC = False # Do not list
+ _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'
+
+ _TESTS = [{
+ 'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ raise ExtractorError(
+ 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url),
+ expected=True)
diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py
index 3b1ac4e9f..98f15177b 100644
--- a/youtube_dl/extractor/zdf.py
+++ b/youtube_dl/extractor/zdf.py
@@ -1,17 +1,95 @@
# coding: utf-8
from __future__ import unicode_literals
+import functools
import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
unified_strdate,
+ OnDemandPagedList,
)
+def extract_from_xml_url(ie, video_id, xml_url):
+ doc = ie._download_xml(
+ xml_url, video_id,
+ note='Downloading video info',
+ errnote='Failed to download video info')
+
+ title = doc.find('.//information/title').text
+ description = doc.find('.//information/detail').text
+ duration = int(doc.find('.//details/lengthSec').text)
+ uploader_node = doc.find('.//details/originChannelTitle')
+ uploader = None if uploader_node is None else uploader_node.text
+ uploader_id_node = doc.find('.//details/originChannelId')
+ uploader_id = None if uploader_id_node is None else uploader_id_node.text
+ upload_date = unified_strdate(doc.find('.//details/airtime').text)
+
+ def xml_to_format(fnode):
+ video_url = fnode.find('url').text
+ is_available = 'http://www.metafilegenerator' not in video_url
+
+ format_id = fnode.attrib['basetype']
+ format_m = re.match(r'''(?x)
+ (?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_
+ (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+)
+ ''', format_id)
+
+ ext = format_m.group('container')
+ proto = format_m.group('proto').lower()
+
+ quality = fnode.find('./quality').text
+ abr = int(fnode.find('./audioBitrate').text) // 1000
+ vbr_node = fnode.find('./videoBitrate')
+ vbr = None if vbr_node is None else int(vbr_node.text) // 1000
+
+ width_node = fnode.find('./width')
+ width = None if width_node is None else int_or_none(width_node.text)
+ height_node = fnode.find('./height')
+ height = None if height_node is None else int_or_none(height_node.text)
+
+ format_note = ''
+ if not format_note:
+ format_note = None
+
+ return {
+ 'format_id': format_id + '-' + quality,
+ 'url': video_url,
+ 'ext': ext,
+ 'acodec': format_m.group('acodec'),
+ 'vcodec': format_m.group('vcodec'),
+ 'abr': abr,
+ 'vbr': vbr,
+ 'width': width,
+ 'height': height,
+ 'filesize': int_or_none(fnode.find('./filesize').text),
+ 'format_note': format_note,
+ 'protocol': proto,
+ '_available': is_available,
+ }
+
+ format_nodes = doc.findall('.//formitaeten/formitaet')
+ formats = list(filter(
+ lambda f: f['_available'],
+ map(xml_to_format, format_nodes)))
+ ie._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'upload_date': upload_date,
+ 'formats': formats,
+ }
+
+
class ZDFIE(InfoExtractor):
- _VALID_URL = r'^https?://www\.zdf\.de/ZDFmediathek(?P<hash>#)?/(.*beitrag/(?:video/)?)(?P<video_id>[0-9]+)(?:/[^/?]+)?(?:\?.*)?'
+ _VALID_URL = r'(?:zdf:|zdf:video:|https?://www\.zdf\.de/ZDFmediathek(?:#)?/(.*beitrag/(?:video/)?))(?P<id>[0-9]+)(?:/[^/?]+)?(?:\?.*)?'
_TEST = {
'url': 'http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt',
@@ -29,81 +107,53 @@ class ZDFIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('video_id')
-
+ video_id = self._match_id(url)
xml_url = 'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id
+ return extract_from_xml_url(self, video_id, xml_url)
+
+
+class ZDFChannelIE(InfoExtractor):
+ _VALID_URL = r'(?:zdf:topic:|https?://www\.zdf\.de/ZDFmediathek(?:#)?/.*kanaluebersicht/)(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://www.zdf.de/ZDFmediathek#/kanaluebersicht/1586442/sendung/Titanic',
+ 'info_dict': {
+ 'id': '1586442',
+ },
+ 'playlist_count': 3,
+ }
+ _PAGE_SIZE = 50
+
+ def _fetch_page(self, channel_id, page):
+ offset = page * self._PAGE_SIZE
+ xml_url = (
+ 'http://www.zdf.de/ZDFmediathek/xmlservice/web/aktuellste?ak=web&offset=%d&maxLength=%d&id=%s'
+ % (offset, self._PAGE_SIZE, channel_id))
doc = self._download_xml(
- xml_url, video_id,
- note='Downloading video info',
- errnote='Failed to download video info')
+ xml_url, channel_id,
+ note='Downloading channel info',
+ errnote='Failed to download channel info')
title = doc.find('.//information/title').text
description = doc.find('.//information/detail').text
- duration = int(doc.find('.//details/lengthSec').text)
- uploader_node = doc.find('.//details/originChannelTitle')
- uploader = None if uploader_node is None else uploader_node.text
- uploader_id_node = doc.find('.//details/originChannelId')
- uploader_id = None if uploader_id_node is None else uploader_id_node.text
- upload_date = unified_strdate(doc.find('.//details/airtime').text)
-
- def xml_to_format(fnode):
- video_url = fnode.find('url').text
- is_available = 'http://www.metafilegenerator' not in video_url
-
- format_id = fnode.attrib['basetype']
- format_m = re.match(r'''(?x)
- (?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_
- (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+)
- ''', format_id)
-
- ext = format_m.group('container')
- proto = format_m.group('proto').lower()
-
- quality = fnode.find('./quality').text
- abr = int(fnode.find('./audioBitrate').text) // 1000
- vbr_node = fnode.find('./videoBitrate')
- vbr = None if vbr_node is None else int(vbr_node.text) // 1000
-
- width_node = fnode.find('./width')
- width = None if width_node is None else int_or_none(width_node.text)
- height_node = fnode.find('./height')
- height = None if height_node is None else int_or_none(height_node.text)
-
- format_note = ''
- if not format_note:
- format_note = None
-
- return {
- 'format_id': format_id + '-' + quality,
- 'url': video_url,
- 'ext': ext,
- 'acodec': format_m.group('acodec'),
- 'vcodec': format_m.group('vcodec'),
- 'abr': abr,
- 'vbr': vbr,
- 'width': width,
- 'height': height,
- 'filesize': int_or_none(fnode.find('./filesize').text),
- 'format_note': format_note,
- 'protocol': proto,
- '_available': is_available,
+ for asset in doc.findall('.//teasers/teaser'):
+ a_type = asset.find('./type').text
+ a_id = asset.find('./details/assetId').text
+ if a_type not in ('video', 'topic'):
+ continue
+ yield {
+ '_type': 'url',
+ 'playlist_title': title,
+ 'playlist_description': description,
+ 'url': 'zdf:%s:%s' % (a_type, a_id),
}
- format_nodes = doc.findall('.//formitaeten/formitaet')
- formats = list(filter(
- lambda f: f['_available'],
- map(xml_to_format, format_nodes)))
-
- self._sort_formats(formats)
+ def _real_extract(self, url):
+ channel_id = self._match_id(url)
+ entries = OnDemandPagedList(
+ functools.partial(self._fetch_page, channel_id), self._PAGE_SIZE)
return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'duration': duration,
- 'uploader': uploader,
- 'uploader_id': uploader_id,
- 'upload_date': upload_date,
- 'formats': formats,
- } \ No newline at end of file
+ '_type': 'playlist',
+ 'id': channel_id,
+ 'entries': entries,
+ }
diff --git a/youtube_dl/extractor/zingmp3.py b/youtube_dl/extractor/zingmp3.py
new file mode 100644
index 000000000..1afbe68ed
--- /dev/null
+++ b/youtube_dl/extractor/zingmp3.py
@@ -0,0 +1,107 @@
+# coding=utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class ZingMp3BaseInfoExtractor(InfoExtractor):
+
+ @staticmethod
+ def _extract_item(item):
+ title = item.find('./title').text.strip()
+ source = item.find('./source').text
+ extension = item.attrib['type']
+ thumbnail = item.find('./backimage').text
+
+ return {
+ 'title': title,
+ 'url': source,
+ 'ext': extension,
+ 'thumbnail': thumbnail,
+ }
+
+ def _extract_player_xml(self, player_xml_url, id, playlist_title=None):
+ player_xml = self._download_xml(player_xml_url, id, 'Downloading Player XML')
+ items = player_xml.findall('./item')
+
+ if len(items) == 1:
+ # one single song
+ data = self._extract_item(items[0])
+ data['id'] = id
+
+ return data
+ else:
+ # playlist of songs
+ entries = []
+
+ for i, item in enumerate(items, 1):
+ entry = self._extract_item(item)
+ entry['id'] = '%s-%d' % (id, i)
+ entries.append(entry)
+
+ return {
+ '_type': 'playlist',
+ 'id': id,
+ 'title': playlist_title,
+ 'entries': entries,
+ }
+
+
+class ZingMp3SongIE(ZingMp3BaseInfoExtractor):
+ _VALID_URL = r'https?://mp3\.zing\.vn/bai-hat/(?P<slug>[^/]+)/(?P<song_id>\w+)\.html'
+ _TESTS = [{
+ 'url': 'http://mp3.zing.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html',
+ 'md5': 'ead7ae13693b3205cbc89536a077daed',
+ 'info_dict': {
+ 'id': 'ZWZB9WAB',
+ 'title': 'Xa Mãi Xa',
+ 'ext': 'mp3',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ }]
+ IE_NAME = 'zingmp3:song'
+ IE_DESC = 'mp3.zing.vn songs'
+
+ def _real_extract(self, url):
+ matched = re.match(self._VALID_URL, url)
+ slug = matched.group('slug')
+ song_id = matched.group('song_id')
+
+ webpage = self._download_webpage(
+ 'http://mp3.zing.vn/bai-hat/%s/%s.html' % (slug, song_id), song_id)
+
+ player_xml_url = self._search_regex(
+ r'&amp;xmlURL=(?P<xml_url>[^&]+)&', webpage, 'player xml url')
+
+ return self._extract_player_xml(player_xml_url, song_id)
+
+
+class ZingMp3AlbumIE(ZingMp3BaseInfoExtractor):
+ _VALID_URL = r'https?://mp3\.zing\.vn/album/(?P<slug>[^/]+)/(?P<album_id>\w+)\.html'
+ _TESTS = [{
+ 'url': 'http://mp3.zing.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html',
+ 'info_dict': {
+ '_type': 'playlist',
+ 'id': 'ZWZBWDAF',
+ 'title': 'Lâu Đài Tình Ái - Bằng Kiều ft. Minh Tuyết | Album 320 lossless',
+ },
+ 'playlist_count': 10,
+ }]
+ IE_NAME = 'zingmp3:album'
+ IE_DESC = 'mp3.zing.vn albums'
+
+ def _real_extract(self, url):
+ matched = re.match(self._VALID_URL, url)
+ slug = matched.group('slug')
+ album_id = matched.group('album_id')
+
+ webpage = self._download_webpage(
+ 'http://mp3.zing.vn/album/%s/%s.html' % (slug, album_id), album_id)
+ player_xml_url = self._search_regex(
+ r'&amp;xmlURL=(?P<xml_url>[^&]+)&', webpage, 'player xml url')
+
+ return self._extract_player_xml(
+ player_xml_url, album_id,
+ playlist_title=self._og_search_title(webpage))
diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py
index c40cd376d..b4617fbad 100644
--- a/youtube_dl/jsinterp.py
+++ b/youtube_dl/jsinterp.py
@@ -61,7 +61,7 @@ class JSInterpreter(object):
pass
m = re.match(
- r'^(?P<var>[a-zA-Z0-9_]+)\.(?P<member>[^(]+)(?:\(+(?P<args>[^()]*)\))?$',
+ r'^(?P<var>[$a-zA-Z0-9_]+)\.(?P<member>[^(]+)(?:\(+(?P<args>[^()]*)\))?$',
expr)
if m:
variable = m.group('var')
diff --git a/youtube_dl/options.py b/youtube_dl/options.py
index 649361bde..a3b012ddb 100644
--- a/youtube_dl/options.py
+++ b/youtube_dl/options.py
@@ -5,6 +5,12 @@ import optparse
import shlex
import sys
+from .downloader.external import list_external_downloaders
+from .compat import (
+ compat_expanduser,
+ compat_getenv,
+ compat_kwargs,
+)
from .utils import (
get_term_width,
write_string,
@@ -27,19 +33,19 @@ def parseOpts(overrideArguments=None):
return res
def _readUserConf():
- xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
+ xdg_config_home = compat_getenv('XDG_CONFIG_HOME')
if xdg_config_home:
userConfFile = os.path.join(xdg_config_home, 'youtube-dl', 'config')
if not os.path.isfile(userConfFile):
userConfFile = os.path.join(xdg_config_home, 'youtube-dl.conf')
else:
- userConfFile = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl', 'config')
+ userConfFile = os.path.join(compat_expanduser('~'), '.config', 'youtube-dl', 'config')
if not os.path.isfile(userConfFile):
- userConfFile = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
+ userConfFile = os.path.join(compat_expanduser('~'), '.config', 'youtube-dl.conf')
userConf = _readOptions(userConfFile, None)
if userConf is None:
- appdata_dir = os.environ.get('appdata')
+ appdata_dir = compat_getenv('appdata')
if appdata_dir:
userConf = _readOptions(
os.path.join(appdata_dir, 'youtube-dl', 'config'),
@@ -51,11 +57,11 @@ def parseOpts(overrideArguments=None):
if userConf is None:
userConf = _readOptions(
- os.path.join(os.path.expanduser('~'), 'youtube-dl.conf'),
+ os.path.join(compat_expanduser('~'), 'youtube-dl.conf'),
default=None)
if userConf is None:
userConf = _readOptions(
- os.path.join(os.path.expanduser('~'), 'youtube-dl.conf.txt'),
+ os.path.join(compat_expanduser('~'), 'youtube-dl.conf.txt'),
default=None)
if userConf is None:
@@ -104,11 +110,11 @@ def parseOpts(overrideArguments=None):
kw = {
'version': __version__,
'formatter': fmt,
- 'usage': '%prog [options] url [url...]',
+ 'usage': '%prog [OPTIONS] URL [URL...]',
'conflict_handler': 'resolve',
}
- parser = optparse.OptionParser(**kw)
+ parser = optparse.OptionParser(**compat_kwargs(kw))
general = optparse.OptionGroup(parser, 'General Options')
general.add_option(
@@ -144,21 +150,46 @@ def parseOpts(overrideArguments=None):
action='store_true', dest='list_extractor_descriptions', default=False,
help='Output descriptions of all supported extractors')
general.add_option(
- '--proxy', dest='proxy',
- default=None, metavar='URL',
- help='Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection')
- general.add_option(
- '--socket-timeout',
- dest='socket_timeout', type=float, default=None,
- help='Time to wait before giving up, in seconds')
- general.add_option(
'--default-search',
dest='default_search', metavar='PREFIX',
help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple". Use the value "auto" to let youtube-dl guess ("auto_warning" to emit a warning when guessing). "error" just throws an error. The default value "fixup_error" repairs broken URLs, but emits an error if this is not possible instead of searching.')
general.add_option(
'--ignore-config',
action='store_true',
- help='Do not read configuration files. When given in the global configuration file /etc/youtube-dl.conf: do not read the user configuration in ~/.config/youtube-dl.conf (%APPDATA%/youtube-dl/config.txt on Windows)')
+ help='Do not read configuration files. '
+ 'When given in the global configuration file /etc/youtube-dl.conf: '
+ 'Do not read the user configuration in ~/.config/youtube-dl/config '
+ '(%APPDATA%/youtube-dl/config.txt on Windows)')
+ general.add_option(
+ '--flat-playlist',
+ action='store_const', dest='extract_flat', const='in_playlist',
+ default=False,
+ help='Do not extract the videos of a playlist, only list them.')
+
+ network = optparse.OptionGroup(parser, 'Network Options')
+ network.add_option(
+ '--proxy', dest='proxy',
+ default=None, metavar='URL',
+ help='Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection')
+ network.add_option(
+ '--socket-timeout',
+ dest='socket_timeout', type=float, default=None, metavar='SECONDS',
+ help='Time to wait before giving up, in seconds')
+ network.add_option(
+ '--source-address',
+ metavar='IP', dest='source_address', default=None,
+ help='Client-side IP address to bind to (experimental)',
+ )
+ network.add_option(
+ '-4', '--force-ipv4',
+ action='store_const', const='0.0.0.0', dest='source_address',
+ help='Make all connections via IPv4 (experimental)',
+ )
+ network.add_option(
+ '-6', '--force-ipv6',
+ action='store_const', const='::', dest='source_address',
+ help='Make all connections via IPv6 (experimental)',
+ )
selection = optparse.OptionGroup(parser, 'Video Selection')
selection.add_option(
@@ -212,7 +243,7 @@ def parseOpts(overrideArguments=None):
selection.add_option(
'--no-playlist',
action='store_true', dest='noplaylist', default=False,
- help='download only the currently playing video')
+ help='If the URL refers to a video and a playlist, download only the video.')
selection.add_option(
'--age-limit',
metavar='YEARS', dest='age_limit', default=None, type=int,
@@ -234,7 +265,7 @@ def parseOpts(overrideArguments=None):
authentication.add_option(
'-p', '--password',
dest='password', metavar='PASSWORD',
- help='account password')
+ help='account password. If this option is left out, youtube-dl will ask interactively.')
authentication.add_option(
'-2', '--twofactor',
dest='twofactor', metavar='TWOFACTOR',
@@ -252,7 +283,30 @@ def parseOpts(overrideArguments=None):
video_format.add_option(
'-f', '--format',
action='store', dest='format', metavar='FORMAT', default=None,
- help='video format code, specify the order of preference using slashes: -f 22/17/18 . -f mp4 , -f m4a and -f flv are also supported. You can also use the special names "best", "bestvideo", "bestaudio", "worst", "worstvideo" and "worstaudio". By default, youtube-dl will pick the best quality. Use commas to download multiple audio formats, such as -f 136/137/mp4/bestvideo,140/m4a/bestaudio')
+ help=(
+ 'video format code, specify the order of preference using'
+ ' slashes, as in -f 22/17/18 . '
+ ' Instead of format codes, you can select by extension for the '
+ 'extensions aac, m4a, mp3, mp4, ogg, wav, webm. '
+ 'You can also use the special names "best",'
+ ' "bestvideo", "bestaudio", "worst". '
+ ' You can filter the video results by putting a condition in'
+ ' brackets, as in -f "best[height=720]"'
+ ' (or -f "[filesize>10M]"). '
+ ' This works for filesize, height, width, tbr, abr, and vbr'
+ ' and the comparisons <, <=, >, >=, =, != .'
+ ' Formats for which the value is not known are excluded unless you'
+ ' put a question mark (?) after the operator.'
+ ' You can combine format filters, so '
+ '-f "[height <=? 720][tbr>500]" '
+ 'selects up to 720p videos (or videos where the height is not '
+ 'known) with a bitrate of at least 500 KBit/s.'
+ ' By default, youtube-dl will pick the best quality.'
+ ' Use commas to download multiple audio formats, such as'
+ ' -f 136/137/mp4/bestvideo,140/m4a/bestaudio.'
+ ' You can merge the video and audio of two formats into a single'
+ ' file using -f <video-format>+<audio-format> (requires ffmpeg or'
+ ' avconv), for example -f bestvideo+bestaudio.'))
video_format.add_option(
'--all-formats',
action='store_const', dest='format', const='all',
@@ -277,6 +331,12 @@ def parseOpts(overrideArguments=None):
'--youtube-skip-dash-manifest',
action='store_false', dest='youtube_include_dash_manifest',
help='Do not download the DASH manifest on YouTube videos')
+ video_format.add_option(
+ '--merge-output-format',
+ action='store', dest='merge_output_format', metavar='FORMAT', default=None,
+ help=(
+ 'If a merge is required (e.g. bestvideo+bestaudio), output to given container format. One of mkv, mp4, ogg, webm, flv.'
+ 'Ignored if no merge is required'))
subtitles = optparse.OptionGroup(parser, 'Subtitle Options')
subtitles.add_option(
@@ -326,6 +386,15 @@ def parseOpts(overrideArguments=None):
'--test',
action='store_true', dest='test', default=False,
help=optparse.SUPPRESS_HELP)
+ downloader.add_option(
+ '--playlist-reverse',
+ action='store_true',
+ help='Download playlist videos in reverse order')
+ downloader.add_option(
+ '--external-downloader',
+ dest='external_downloader', metavar='COMMAND',
+ help='(experimental) Use the specified external downloader. '
+ 'Currently supports %s' % ','.join(list_external_downloaders()))
workarounds = optparse.OptionGroup(parser, 'Workarounds')
workarounds.add_option(
@@ -358,6 +427,10 @@ def parseOpts(overrideArguments=None):
'--bidi-workaround',
dest='bidi_workaround', action='store_true',
help='Work around terminals that lack bidirectional text support. Requires bidiv or fribidi executable in PATH')
+ workarounds.add_option(
+ '--sleep-interval', metavar='SECONDS',
+ dest='sleep_interval', type=float,
+ help='Number of seconds to sleep before each download.')
verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
verbosity.add_option(
@@ -413,6 +486,15 @@ def parseOpts(overrideArguments=None):
action='store_true', dest='dumpjson', default=False,
help='simulate, quiet but print JSON information. See --output for a description of available keys.')
verbosity.add_option(
+ '-J', '--dump-single-json',
+ action='store_true', dest='dump_single_json', default=False,
+ help='simulate, quiet but print JSON information for each command-line argument. If the URL refers to a playlist, dump the whole playlist information in a single line.')
+ verbosity.add_option(
+ '--print-json',
+ action='store_true', dest='print_json', default=False,
+ help='Be quiet and print the video information as JSON (video is still being downloaded).',
+ )
+ verbosity.add_option(
'--newline',
action='store_true', dest='progress_with_newline', default=False,
help='output progress bar as new lines')
@@ -444,6 +526,14 @@ def parseOpts(overrideArguments=None):
'--print-traffic',
dest='debug_printtraffic', action='store_true', default=False,
help='Display sent and read HTTP traffic')
+ verbosity.add_option(
+ '-C', '--call-home',
+ dest='call_home', action='store_true', default=False,
+ help='Contact the youtube-dl server for debugging.')
+ verbosity.add_option(
+ '--no-call-home',
+ dest='call_home', action='store_false', default=False,
+ help='Do NOT contact the youtube-dl server for debugging.')
filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
filesystem.add_option(
@@ -454,10 +544,6 @@ def parseOpts(overrideArguments=None):
'--id', default=False,
action='store_true', dest='useid', help='use only video ID in file name')
filesystem.add_option(
- '-A', '--auto-number',
- action='store_true', dest='autonumber', default=False,
- help='number downloaded files starting from 00000')
- filesystem.add_option(
'-o', '--output',
dest='outtmpl', metavar='TEMPLATE',
help=('output filename template. Use %(title)s to get the title, '
@@ -468,10 +554,12 @@ def parseOpts(overrideArguments=None):
'%(format_id)s for the unique id of the format (like Youtube\'s itags: "137"), '
'%(upload_date)s for the upload date (YYYYMMDD), '
'%(extractor)s for the provider (youtube, metacafe, etc), '
- '%(id)s for the video id, %(playlist)s for the playlist the video is in, '
- '%(playlist_index)s for the position in the playlist and %% for a literal percent. '
+ '%(id)s for the video id, '
+ '%(playlist_title)s, %(playlist_id)s, or %(playlist)s (=title if present, ID otherwise) for the playlist the video is in, '
+ '%(playlist_index)s for the position in the playlist. '
'%(height)s and %(width)s for the width and height of the video format. '
'%(resolution)s for a textual description of the resolution of the video format. '
+ '%% for a literal percent. '
'Use - to output to stdout. Can also be used to download to a different directory, '
'for example with -o \'/my/downloads/%(uploader)s/%(title)s-%(id)s.%(ext)s\' .'))
filesystem.add_option(
@@ -483,6 +571,10 @@ def parseOpts(overrideArguments=None):
action='store_true', dest='restrictfilenames', default=False,
help='Restrict filenames to only ASCII characters, and avoid "&" and spaces in filenames')
filesystem.add_option(
+ '-A', '--auto-number',
+ action='store_true', dest='autonumber', default=False,
+ help='[deprecated; use -o "%(autonumber)s-%(title)s.%(ext)s" ] number downloaded files starting from 00000')
+ filesystem.add_option(
'-t', '--title',
action='store_true', dest='usetitle', default=False,
help='[deprecated] use title in file name (default)')
@@ -523,10 +615,6 @@ def parseOpts(overrideArguments=None):
action='store_true', dest='writeannotations', default=False,
help='write video annotations to a .annotation file')
filesystem.add_option(
- '--write-thumbnail',
- action='store_true', dest='writethumbnail', default=False,
- help='write thumbnail image to disk')
- filesystem.add_option(
'--load-info',
dest='load_info_filename', metavar='FILE',
help='json file containing the video information (created with the "--write-json" option)')
@@ -545,6 +633,20 @@ def parseOpts(overrideArguments=None):
action='store_true', dest='rm_cachedir',
help='Delete all filesystem cache files')
+ thumbnail = optparse.OptionGroup(parser, 'Thumbnail images')
+ thumbnail.add_option(
+ '--write-thumbnail',
+ action='store_true', dest='writethumbnail', default=False,
+ help='write thumbnail image to disk')
+ thumbnail.add_option(
+ '--write-all-thumbnails',
+ action='store_true', dest='write_all_thumbnails', default=False,
+ help='write all thumbnail image formats to disk')
+ thumbnail.add_option(
+ '--list-thumbnails',
+ action='store_true', dest='list_thumbnails', default=False,
+ help='Simulate and list all available thumbnail formats')
+
postproc = optparse.OptionGroup(parser, 'Post-processing Options')
postproc.add_option(
'-x', '--extract-audio',
@@ -586,6 +688,13 @@ def parseOpts(overrideArguments=None):
action='store_true', dest='xattrs', default=False,
help='write metadata to the video file\'s xattrs (using dublin core and xdg standards)')
postproc.add_option(
+ '--fixup',
+ metavar='POLICY', dest='fixup', default='detect_or_warn',
+ help='(experimental) Automatically correct known faults of the file. '
+ 'One of never (do nothing), warn (only emit a warning), '
+ 'detect_or_warn (check whether we can do anything about it, warn '
+ 'otherwise)')
+ postproc.add_option(
'--prefer-avconv',
action='store_false', dest='prefer_ffmpeg',
help='Prefer avconv over ffmpeg for running the postprocessors (default)')
@@ -596,12 +705,14 @@ def parseOpts(overrideArguments=None):
postproc.add_option(
'--exec',
metavar='CMD', dest='exec_cmd',
- help='Execute a command on the file after downloading, similar to find\'s -exec syntax. Example: --exec \'adb push {} /sdcard/Music/ && rm {}\'' )
+ help='Execute a command on the file after downloading, similar to find\'s -exec syntax. Example: --exec \'adb push {} /sdcard/Music/ && rm {}\'')
parser.add_option_group(general)
+ parser.add_option_group(network)
parser.add_option_group(selection)
parser.add_option_group(downloader)
parser.add_option_group(filesystem)
+ parser.add_option_group(thumbnail)
parser.add_option_group(verbosity)
parser.add_option_group(workarounds)
parser.add_option_group(video_format)
diff --git a/youtube_dl/postprocessor/__init__.py b/youtube_dl/postprocessor/__init__.py
index 15aa0daa9..0ffbca258 100644
--- a/youtube_dl/postprocessor/__init__.py
+++ b/youtube_dl/postprocessor/__init__.py
@@ -1,24 +1,36 @@
+from __future__ import unicode_literals
from .atomicparsley import AtomicParsleyPP
from .ffmpeg import (
+ FFmpegPostProcessor,
FFmpegAudioFixPP,
+ FFmpegEmbedSubtitlePP,
+ FFmpegExtractAudioPP,
+ FFmpegFixupStretchedPP,
+ FFmpegFixupM4aPP,
FFmpegMergerPP,
FFmpegMetadataPP,
- FFmpegVideoConvertor,
- FFmpegExtractAudioPP,
- FFmpegEmbedSubtitlePP,
+ FFmpegVideoConvertorPP,
)
from .xattrpp import XAttrMetadataPP
from .execafterdownload import ExecAfterDownloadPP
+
+def get_postprocessor(key):
+ return globals()[key + 'PP']
+
+
__all__ = [
'AtomicParsleyPP',
+ 'ExecAfterDownloadPP',
'FFmpegAudioFixPP',
+ 'FFmpegEmbedSubtitlePP',
+ 'FFmpegExtractAudioPP',
+ 'FFmpegFixupM4aPP',
+ 'FFmpegFixupStretchedPP',
'FFmpegMergerPP',
'FFmpegMetadataPP',
- 'FFmpegVideoConvertor',
- 'FFmpegExtractAudioPP',
- 'FFmpegEmbedSubtitlePP',
+ 'FFmpegPostProcessor',
+ 'FFmpegVideoConvertorPP',
'XAttrMetadataPP',
- 'ExecAfterDownloadPP',
]
diff --git a/youtube_dl/postprocessor/atomicparsley.py b/youtube_dl/postprocessor/atomicparsley.py
index 765b2d9ee..448ccc5f3 100644
--- a/youtube_dl/postprocessor/atomicparsley.py
+++ b/youtube_dl/postprocessor/atomicparsley.py
@@ -6,10 +6,11 @@ import os
import subprocess
from .common import PostProcessor
-
+from ..compat import (
+ compat_urlretrieve,
+)
from ..utils import (
check_executable,
- compat_urlretrieve,
encodeFilename,
PostProcessingError,
prepend_extension,
diff --git a/youtube_dl/postprocessor/common.py b/youtube_dl/postprocessor/common.py
index 788f94d02..e54ae678d 100644
--- a/youtube_dl/postprocessor/common.py
+++ b/youtube_dl/postprocessor/common.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
from ..utils import PostProcessingError
diff --git a/youtube_dl/postprocessor/execafterdownload.py b/youtube_dl/postprocessor/execafterdownload.py
index 08419a3d4..75c0f7bbe 100644
--- a/youtube_dl/postprocessor/execafterdownload.py
+++ b/youtube_dl/postprocessor/execafterdownload.py
@@ -3,10 +3,8 @@ from __future__ import unicode_literals
import subprocess
from .common import PostProcessor
-from ..utils import (
- shlex_quote,
- PostProcessingError,
-)
+from ..compat import shlex_quote
+from ..utils import PostProcessingError
class ExecAfterDownloadPP(PostProcessor):
@@ -16,7 +14,7 @@ class ExecAfterDownloadPP(PostProcessor):
def run(self, information):
cmd = self.exec_cmd
- if not '{}' in cmd:
+ if '{}' not in cmd:
cmd += ' {}'
cmd = cmd.replace('{}', shlex_quote(information['filepath']))
@@ -28,4 +26,3 @@ class ExecAfterDownloadPP(PostProcessor):
'Command returned error code %d' % retCode)
return None, information # by default, keep file and do nothing
-
diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py
index 6f010a9c7..855d1e6db 100644
--- a/youtube_dl/postprocessor/ffmpeg.py
+++ b/youtube_dl/postprocessor/ffmpeg.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
import os
import subprocess
import sys
@@ -6,11 +8,14 @@ import time
from .common import AudioConversionError, PostProcessor
-from ..utils import (
- check_executable,
+from ..compat import (
compat_subprocess_get_DEVNULL,
+)
+from ..utils import (
encodeArgument,
encodeFilename,
+ get_exe_version,
+ is_outdated_version,
PostProcessingError,
prepend_extension,
shell_quote,
@@ -25,42 +30,78 @@ class FFmpegPostProcessorError(PostProcessingError):
class FFmpegPostProcessor(PostProcessor):
def __init__(self, downloader=None, deletetempfiles=False):
PostProcessor.__init__(self, downloader)
- self._exes = self.detect_executables()
+ self._versions = self.get_versions()
self._deletetempfiles = deletetempfiles
+ def check_version(self):
+ if not self._executable:
+ raise FFmpegPostProcessorError('ffmpeg or avconv not found. Please install one.')
+
+ required_version = '10-0' if self._uses_avconv() else '1.0'
+ if is_outdated_version(
+ self._versions[self._executable], required_version):
+ warning = 'Your copy of %s is outdated, update %s to version %s or newer if you encounter any errors.' % (
+ self._executable, self._executable, required_version)
+ if self._downloader:
+ self._downloader.report_warning(warning)
+
@staticmethod
- def detect_executables():
+ def get_versions():
programs = ['avprobe', 'avconv', 'ffmpeg', 'ffprobe']
- return dict((program, check_executable(program, ['-version'])) for program in programs)
+ return dict((p, get_exe_version(p, args=['-version'])) for p in programs)
+
+ @property
+ def available(self):
+ return self._executable is not None
+
+ @property
+ def _executable(self):
+ if self._downloader.params.get('prefer_ffmpeg', False):
+ prefs = ('ffmpeg', 'avconv')
+ else:
+ prefs = ('avconv', 'ffmpeg')
+ for p in prefs:
+ if self._versions[p]:
+ return p
+ return None
- def _get_executable(self):
+ @property
+ def _probe_executable(self):
if self._downloader.params.get('prefer_ffmpeg', False):
- return self._exes['ffmpeg'] or self._exes['avconv']
+ prefs = ('ffprobe', 'avprobe')
else:
- return self._exes['avconv'] or self._exes['ffmpeg']
+ prefs = ('avprobe', 'ffprobe')
+ for p in prefs:
+ if self._versions[p]:
+ return p
+ return None
def _uses_avconv(self):
- return self._get_executable() == self._exes['avconv']
+ return self._executable == 'avconv'
def run_ffmpeg_multiple_files(self, input_paths, out_path, opts):
- if not self._get_executable():
- raise FFmpegPostProcessorError(u'ffmpeg or avconv not found. Please install one.')
+ self.check_version()
+
+ oldest_mtime = min(
+ os.stat(encodeFilename(path)).st_mtime for path in input_paths)
files_cmd = []
for path in input_paths:
- files_cmd.extend(['-i', encodeFilename(path, True)])
- cmd = ([self._get_executable(), '-y'] + files_cmd
- + [encodeArgument(o) for o in opts] +
+ files_cmd.extend([encodeArgument('-i'), encodeFilename(path, True)])
+ cmd = ([encodeFilename(self._executable, True), encodeArgument('-y')] +
+ files_cmd +
+ [encodeArgument(o) for o in opts] +
[encodeFilename(self._ffmpeg_filename_argument(out_path), True)])
if self._downloader.params.get('verbose', False):
- self._downloader.to_screen(u'[debug] ffmpeg command line: %s' % shell_quote(cmd))
+ self._downloader.to_screen('[debug] ffmpeg command line: %s' % shell_quote(cmd))
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = p.communicate()
if p.returncode != 0:
stderr = stderr.decode('utf-8', 'replace')
msg = stderr.strip().split('\n')[-1]
raise FFmpegPostProcessorError(msg)
+ os.utime(encodeFilename(out_path), (oldest_mtime, oldest_mtime))
if self._deletetempfiles:
for ipath in input_paths:
os.remove(ipath)
@@ -70,8 +111,8 @@ class FFmpegPostProcessor(PostProcessor):
def _ffmpeg_filename_argument(self, fn):
# ffmpeg broke --, see https://ffmpeg.org/trac/ffmpeg/ticket/2127 for details
- if fn.startswith(u'-'):
- return u'./' + fn
+ if fn.startswith('-'):
+ return './' + fn
return fn
@@ -85,12 +126,13 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):
self._nopostoverwrites = nopostoverwrites
def get_audio_codec(self, path):
- if not self._exes['ffprobe'] and not self._exes['avprobe']:
- raise PostProcessingError(u'ffprobe or avprobe not found. Please install one.')
+
+ if not self._probe_executable:
+ raise PostProcessingError('ffprobe or avprobe not found. Please install one.')
try:
cmd = [
- self._exes['avprobe'] or self._exes['ffprobe'],
- '-show_streams',
+ encodeFilename(self._probe_executable, True),
+ encodeArgument('-show_streams'),
encodeFilename(self._ffmpeg_filename_argument(path), True)]
handle = subprocess.Popen(cmd, stderr=compat_subprocess_get_DEVNULL(), stdout=subprocess.PIPE)
output = handle.communicate()[0]
@@ -122,7 +164,7 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):
filecodec = self.get_audio_codec(path)
if filecodec is None:
- raise PostProcessingError(u'WARNING: unable to obtain file audio codec with ffprobe')
+ raise PostProcessingError('WARNING: unable to obtain file audio codec with ffprobe')
uses_avconv = self._uses_avconv()
more_opts = []
@@ -171,7 +213,7 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):
extension = 'wav'
more_opts += ['-f', 'wav']
- prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
+ prefix, sep, ext = path.rpartition('.') # not os.path.splitext, since the latter does not work on unicode in all setups
new_path = prefix + sep + extension
# If we download foo.mp3 and convert it to... foo.mp3, then don't delete foo.mp3, silly.
@@ -180,16 +222,16 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):
try:
if self._nopostoverwrites and os.path.exists(encodeFilename(new_path)):
- self._downloader.to_screen(u'[youtube] Post-process file %s exists, skipping' % new_path)
+ self._downloader.to_screen('[youtube] Post-process file %s exists, skipping' % new_path)
else:
- self._downloader.to_screen(u'[' + self._get_executable() + '] Destination: ' + new_path)
+ self._downloader.to_screen('[' + self._executable + '] Destination: ' + new_path)
self.run_ffmpeg(path, new_path, acodec, more_opts)
except:
- etype,e,tb = sys.exc_info()
+ etype, e, tb = sys.exc_info()
if isinstance(e, AudioConversionError):
- msg = u'audio conversion failed: ' + e.msg
+ msg = 'audio conversion failed: ' + e.msg
else:
- msg = u'error running ' + self._get_executable()
+ msg = 'error running ' + self._executable
raise PostProcessingError(msg)
# Try to update the date time for extracted audio file.
@@ -197,30 +239,30 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):
try:
os.utime(encodeFilename(new_path), (time.time(), information['filetime']))
except:
- self._downloader.report_warning(u'Cannot update utime of audio file')
+ self._downloader.report_warning('Cannot update utime of audio file')
information['filepath'] = new_path
- return self._nopostoverwrites,information
+ return self._nopostoverwrites, information
-class FFmpegVideoConvertor(FFmpegPostProcessor):
- def __init__(self, downloader=None,preferedformat=None):
- super(FFmpegVideoConvertor, self).__init__(downloader)
- self._preferedformat=preferedformat
+class FFmpegVideoConvertorPP(FFmpegPostProcessor):
+ def __init__(self, downloader=None, preferedformat=None):
+ super(FFmpegVideoConvertorPP, self).__init__(downloader)
+ self._preferedformat = preferedformat
def run(self, information):
path = information['filepath']
- prefix, sep, ext = path.rpartition(u'.')
+ prefix, sep, ext = path.rpartition('.')
outpath = prefix + sep + self._preferedformat
if information['ext'] == self._preferedformat:
- self._downloader.to_screen(u'[ffmpeg] Not converting video file %s - already is in target format %s' % (path, self._preferedformat))
- return True,information
- self._downloader.to_screen(u'['+'ffmpeg'+'] Converting video from %s to %s, Destination: ' % (information['ext'], self._preferedformat) +outpath)
+ self._downloader.to_screen('[ffmpeg] Not converting video file %s - already is in target format %s' % (path, self._preferedformat))
+ return True, information
+ self._downloader.to_screen('[' + 'ffmpeg' + '] Converting video from %s to %s, Destination: ' % (information['ext'], self._preferedformat) + outpath)
self.run_ffmpeg(path, outpath, [])
information['filepath'] = outpath
information['format'] = self._preferedformat
information['ext'] = self._preferedformat
- return False,information
+ return False, information
class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
@@ -422,27 +464,33 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
return cls._lang_map.get(code[:2])
def run(self, information):
- if information['ext'] != u'mp4':
- self._downloader.to_screen(u'[ffmpeg] Subtitles can only be embedded in mp4 files')
+ if information['ext'] != 'mp4':
+ self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4 files')
return True, information
if not information.get('subtitles'):
- self._downloader.to_screen(u'[ffmpeg] There aren\'t any subtitles to embed')
+ self._downloader.to_screen('[ffmpeg] There aren\'t any subtitles to embed')
return True, information
sub_langs = [key for key in information['subtitles']]
filename = information['filepath']
input_files = [filename] + [subtitles_filename(filename, lang, self._subformat) for lang in sub_langs]
- opts = ['-map', '0:0', '-map', '0:1', '-c:v', 'copy', '-c:a', 'copy']
+ opts = [
+ '-map', '0',
+ '-c', 'copy',
+ # Don't copy the existing subtitles, we may be running the
+ # postprocessor a second time
+ '-map', '-0:s',
+ '-c:s', 'mov_text',
+ ]
for (i, lang) in enumerate(sub_langs):
- opts.extend(['-map', '%d:0' % (i+1), '-c:s:%d' % i, 'mov_text'])
+ opts.extend(['-map', '%d:0' % (i + 1)])
lang_code = self._conver_lang_code(lang)
if lang_code is not None:
opts.extend(['-metadata:s:s:%d' % i, 'language=%s' % lang_code])
- opts.extend(['-f', 'mp4'])
- temp_filename = filename + u'.temp'
- self._downloader.to_screen(u'[ffmpeg] Embedding subtitles in \'%s\'' % filename)
+ temp_filename = prepend_extension(filename, 'temp')
+ self._downloader.to_screen('[ffmpeg] Embedding subtitles in \'%s\'' % filename)
self.run_ffmpeg_multiple_files(input_files, temp_filename, opts)
os.remove(encodeFilename(filename))
os.rename(encodeFilename(temp_filename), encodeFilename(filename))
@@ -461,15 +509,19 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
metadata['artist'] = info['uploader']
elif info.get('uploader_id') is not None:
metadata['artist'] = info['uploader_id']
+ if info.get('description') is not None:
+ metadata['description'] = info['description']
+ if info.get('webpage_url') is not None:
+ metadata['comment'] = info['webpage_url']
if not metadata:
- self._downloader.to_screen(u'[ffmpeg] There isn\'t any metadata to add')
+ self._downloader.to_screen('[ffmpeg] There isn\'t any metadata to add')
return True, info
filename = info['filepath']
temp_filename = prepend_extension(filename, 'temp')
- if info['ext'] == u'm4a':
+ if info['ext'] == 'm4a':
options = ['-vn', '-acodec', 'copy']
else:
options = ['-c', 'copy']
@@ -477,7 +529,7 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
for (name, value) in metadata.items():
options.extend(['-metadata', '%s=%s' % (name, value)])
- self._downloader.to_screen(u'[ffmpeg] Adding metadata to \'%s\'' % filename)
+ self._downloader.to_screen('[ffmpeg] Adding metadata to \'%s\'' % filename)
self.run_ffmpeg(filename, temp_filename, options)
os.remove(encodeFilename(filename))
os.rename(encodeFilename(temp_filename), encodeFilename(filename))
@@ -487,8 +539,8 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
class FFmpegMergerPP(FFmpegPostProcessor):
def run(self, info):
filename = info['filepath']
- args = ['-c', 'copy', '-map', '0:v:0', '-map', '1:a:0', '-shortest']
- self._downloader.to_screen(u'[ffmpeg] Merging formats into "%s"' % filename)
+ args = ['-c', 'copy', '-map', '0:v:0', '-map', '1:a:0']
+ self._downloader.to_screen('[ffmpeg] Merging formats into "%s"' % filename)
self.run_ffmpeg_multiple_files(info['__files_to_merge'], filename, args)
return True, info
@@ -499,7 +551,44 @@ class FFmpegAudioFixPP(FFmpegPostProcessor):
temp_filename = prepend_extension(filename, 'temp')
options = ['-vn', '-acodec', 'copy']
- self._downloader.to_screen(u'[ffmpeg] Fixing audio file "%s"' % filename)
+ self._downloader.to_screen('[ffmpeg] Fixing audio file "%s"' % filename)
+ self.run_ffmpeg(filename, temp_filename, options)
+
+ os.remove(encodeFilename(filename))
+ os.rename(encodeFilename(temp_filename), encodeFilename(filename))
+
+ return True, info
+
+
+class FFmpegFixupStretchedPP(FFmpegPostProcessor):
+ def run(self, info):
+ stretched_ratio = info.get('stretched_ratio')
+ if stretched_ratio is None or stretched_ratio == 1:
+ return True, info
+
+ filename = info['filepath']
+ temp_filename = prepend_extension(filename, 'temp')
+
+ options = ['-c', 'copy', '-aspect', '%f' % stretched_ratio]
+ self._downloader.to_screen('[ffmpeg] Fixing aspect ratio in "%s"' % filename)
+ self.run_ffmpeg(filename, temp_filename, options)
+
+ os.remove(encodeFilename(filename))
+ os.rename(encodeFilename(temp_filename), encodeFilename(filename))
+
+ return True, info
+
+
+class FFmpegFixupM4aPP(FFmpegPostProcessor):
+ def run(self, info):
+ if info.get('container') != 'm4a_dash':
+ return True, info
+
+ filename = info['filepath']
+ temp_filename = prepend_extension(filename, 'temp')
+
+ options = ['-c', 'copy', '-f', 'mp4']
+ self._downloader.to_screen('[ffmpeg] Correcting container in "%s"' % filename)
self.run_ffmpeg(filename, temp_filename, options)
os.remove(encodeFilename(filename))
diff --git a/youtube_dl/postprocessor/xattrpp.py b/youtube_dl/postprocessor/xattrpp.py
index f6940940b..f6c63fe97 100644
--- a/youtube_dl/postprocessor/xattrpp.py
+++ b/youtube_dl/postprocessor/xattrpp.py
@@ -1,12 +1,16 @@
+from __future__ import unicode_literals
+
import os
import subprocess
import sys
from .common import PostProcessor
+from ..compat import (
+ subprocess_check_output
+)
from ..utils import (
check_executable,
hyphenate_date,
- subprocess_check_output
)
@@ -106,4 +110,3 @@ class XAttrMetadataPP(PostProcessor):
except (subprocess.CalledProcessError, OSError):
self._downloader.report_error("This filesystem doesn't support extended attributes. (You may have to enable them in your /etc/fstab)")
return False, info
-
diff --git a/youtube_dl/swfinterp.py b/youtube_dl/swfinterp.py
index b63c65b20..e60505ace 100644
--- a/youtube_dl/swfinterp.py
+++ b/youtube_dl/swfinterp.py
@@ -4,8 +4,8 @@ import collections
import io
import zlib
+from .compat import compat_str
from .utils import (
- compat_str,
ExtractorError,
struct_unpack,
)
@@ -62,15 +62,17 @@ class _ScopeDict(dict):
class _AVMClass(object):
- def __init__(self, name_idx, name):
+ def __init__(self, name_idx, name, static_properties=None):
self.name_idx = name_idx
self.name = name
self.method_names = {}
self.method_idxs = {}
self.methods = {}
self.method_pyfunctions = {}
+ self.static_properties = static_properties if static_properties else {}
self.variables = _ScopeDict(self)
+ self.constants = {}
def make_object(self):
return _AVMClass_Object(self)
@@ -148,8 +150,38 @@ def _read_byte(reader):
return res
+StringClass = _AVMClass('(no name idx)', 'String')
+ByteArrayClass = _AVMClass('(no name idx)', 'ByteArray')
+TimerClass = _AVMClass('(no name idx)', 'Timer')
+TimerEventClass = _AVMClass('(no name idx)', 'TimerEvent', {'TIMER': 'timer'})
+_builtin_classes = {
+ StringClass.name: StringClass,
+ ByteArrayClass.name: ByteArrayClass,
+ TimerClass.name: TimerClass,
+ TimerEventClass.name: TimerEventClass,
+}
+
+
+class _Undefined(object):
+ def __bool__(self):
+ return False
+ __nonzero__ = __bool__
+
+ def __hash__(self):
+ return 0
+
+ def __str__(self):
+ return 'undefined'
+ __repr__ = __str__
+
+undefined = _Undefined()
+
+
class SWFInterpreter(object):
def __init__(self, file_contents):
+ self._patched_functions = {
+ (TimerClass, 'addEventListener'): lambda params: undefined,
+ }
code_tag = next(tag
for tag_code, tag in _extract_tags(file_contents)
if tag_code == 82)
@@ -170,11 +202,13 @@ class SWFInterpreter(object):
# Constant pool
int_count = u30()
+ self.constant_ints = [0]
for _c in range(1, int_count):
- s32()
+ self.constant_ints.append(s32())
+ self.constant_uints = [0]
uint_count = u30()
for _c in range(1, uint_count):
- u32()
+ self.constant_uints.append(u32())
double_count = u30()
read_bytes(max(0, (double_count - 1)) * 8)
string_count = u30()
@@ -212,6 +246,10 @@ class SWFInterpreter(object):
u30() # namespace_idx
name_idx = u30()
self.multinames.append(self.constant_strings[name_idx])
+ elif kind == 0x09:
+ name_idx = u30()
+ u30()
+ self.multinames.append(self.constant_strings[name_idx])
else:
self.multinames.append(_Multiname(kind))
for _c2 in range(MULTINAME_SIZES[kind]):
@@ -258,13 +296,28 @@ class SWFInterpreter(object):
kind = kind_full & 0x0f
attrs = kind_full >> 4
methods = {}
- if kind in [0x00, 0x06]: # Slot or Const
+ constants = None
+ if kind == 0x00: # Slot
u30() # Slot id
u30() # type_name_idx
vindex = u30()
if vindex != 0:
read_byte() # vkind
- elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
+ elif kind == 0x06: # Const
+ u30() # Slot id
+ u30() # type_name_idx
+ vindex = u30()
+ vkind = 'any'
+ if vindex != 0:
+ vkind = read_byte()
+ if vkind == 0x03: # Constant_Int
+ value = self.constant_ints[vindex]
+ elif vkind == 0x04: # Constant_UInt
+ value = self.constant_uints[vindex]
+ else:
+ return {}, None # Ignore silently for now
+ constants = {self.multinames[trait_name_idx]: value}
+ elif kind in (0x01, 0x02, 0x03): # Method / Getter / Setter
u30() # disp_id
method_idx = u30()
methods[self.multinames[trait_name_idx]] = method_idx
@@ -283,7 +336,7 @@ class SWFInterpreter(object):
for _c3 in range(metadata_count):
u30() # metadata index
- return methods
+ return methods, constants
# Classes
class_count = u30()
@@ -305,18 +358,22 @@ class SWFInterpreter(object):
u30() # iinit
trait_count = u30()
for _c2 in range(trait_count):
- trait_methods = parse_traits_info()
+ trait_methods, trait_constants = parse_traits_info()
avm_class.register_methods(trait_methods)
+ if trait_constants:
+ avm_class.constants.update(trait_constants)
assert len(classes) == class_count
self._classes_by_name = dict((c.name, c) for c in classes)
for avm_class in classes:
- u30() # cinit
+ avm_class.cinit_idx = u30()
trait_count = u30()
for _c2 in range(trait_count):
- trait_methods = parse_traits_info()
+ trait_methods, trait_constants = parse_traits_info()
avm_class.register_methods(trait_methods)
+ if trait_constants:
+ avm_class.constants.update(trait_constants)
# Scripts
script_count = u30()
@@ -329,6 +386,7 @@ class SWFInterpreter(object):
# Method bodies
method_body_count = u30()
Method = collections.namedtuple('Method', ['code', 'local_count'])
+ self._all_methods = []
for _c in range(method_body_count):
method_idx = u30()
u30() # max_stack
@@ -337,9 +395,10 @@ class SWFInterpreter(object):
u30() # max_scope_depth
code_length = u30()
code = read_bytes(code_length)
+ m = Method(code, local_count)
+ self._all_methods.append(m)
for avm_class in classes:
if method_idx in avm_class.method_idxs:
- m = Method(code, local_count)
avm_class.methods[avm_class.method_idxs[method_idx]] = m
exception_count = u30()
for _c2 in range(exception_count):
@@ -354,13 +413,27 @@ class SWFInterpreter(object):
assert p + code_reader.tell() == len(code_tag)
- def extract_class(self, class_name):
+ def patch_function(self, avm_class, func_name, f):
+ self._patched_functions[(avm_class, func_name)] = f
+
+ def extract_class(self, class_name, call_cinit=True):
try:
- return self._classes_by_name[class_name]
+ res = self._classes_by_name[class_name]
except KeyError:
raise ExtractorError('Class %r not found' % class_name)
+ if call_cinit and hasattr(res, 'cinit_idx'):
+ res.register_methods({'$cinit': res.cinit_idx})
+ res.methods['$cinit'] = self._all_methods[res.cinit_idx]
+ cinit = self.extract_function(res, '$cinit')
+ cinit([])
+
+ return res
+
def extract_function(self, avm_class, func_name):
+ p = self._patched_functions.get((avm_class, func_name))
+ if p:
+ return p
if func_name in avm_class.method_pyfunctions:
return avm_class.method_pyfunctions[func_name]
if func_name in self._classes_by_name:
@@ -379,10 +452,15 @@ class SWFInterpreter(object):
registers = [avm_class.variables] + list(args) + [None] * m.local_count
stack = []
scopes = collections.deque([
- self._classes_by_name, avm_class.variables])
+ self._classes_by_name, avm_class.constants, avm_class.variables])
while True:
opcode = _read_byte(coder)
- if opcode == 17: # iftrue
+ if opcode == 9: # label
+ pass # Spec says: "Do nothing."
+ elif opcode == 16: # jump
+ offset = s24()
+ coder.seek(coder.tell() + offset)
+ elif opcode == 17: # iftrue
offset = s24()
value = stack.pop()
if value:
@@ -392,9 +470,40 @@ class SWFInterpreter(object):
value = stack.pop()
if not value:
coder.seek(coder.tell() + offset)
+ elif opcode == 19: # ifeq
+ offset = s24()
+ value2 = stack.pop()
+ value1 = stack.pop()
+ if value2 == value1:
+ coder.seek(coder.tell() + offset)
+ elif opcode == 20: # ifne
+ offset = s24()
+ value2 = stack.pop()
+ value1 = stack.pop()
+ if value2 != value1:
+ coder.seek(coder.tell() + offset)
+ elif opcode == 21: # iflt
+ offset = s24()
+ value2 = stack.pop()
+ value1 = stack.pop()
+ if value1 < value2:
+ coder.seek(coder.tell() + offset)
+ elif opcode == 32: # pushnull
+ stack.append(None)
+ elif opcode == 33: # pushundefined
+ stack.append(undefined)
elif opcode == 36: # pushbyte
v = _read_byte(coder)
stack.append(v)
+ elif opcode == 37: # pushshort
+ v = u30()
+ stack.append(v)
+ elif opcode == 38: # pushtrue
+ stack.append(True)
+ elif opcode == 39: # pushfalse
+ stack.append(False)
+ elif opcode == 40: # pushnan
+ stack.append(float('NaN'))
elif opcode == 42: # dup
value = stack[-1]
stack.append(value)
@@ -419,11 +528,31 @@ class SWFInterpreter(object):
[stack.pop() for _ in range(arg_count)]))
obj = stack.pop()
- if isinstance(obj, _AVMClass_Object):
+ if obj == StringClass:
+ if mname == 'String':
+ assert len(args) == 1
+ assert isinstance(args[0], (
+ int, compat_str, _Undefined))
+ if args[0] == undefined:
+ res = 'undefined'
+ else:
+ res = compat_str(args[0])
+ stack.append(res)
+ continue
+ else:
+ raise NotImplementedError(
+ 'Function String.%s is not yet implemented'
+ % mname)
+ elif isinstance(obj, _AVMClass_Object):
func = self.extract_function(obj.avm_class, mname)
res = func(args)
stack.append(res)
continue
+ elif isinstance(obj, _AVMClass):
+ func = self.extract_function(obj, mname)
+ res = func(args)
+ stack.append(res)
+ continue
elif isinstance(obj, _ScopeDict):
if mname in obj.avm_class.method_names:
func = self.extract_function(obj.avm_class, mname)
@@ -442,6 +571,13 @@ class SWFInterpreter(object):
res = obj.split(args[0])
stack.append(res)
continue
+ elif mname == 'charCodeAt':
+ assert len(args) <= 1
+ idx = 0 if len(args) == 0 else args[0]
+ assert isinstance(idx, int)
+ res = ord(obj[idx])
+ stack.append(res)
+ continue
elif isinstance(obj, list):
if mname == 'slice':
assert len(args) == 1
@@ -458,9 +594,18 @@ class SWFInterpreter(object):
raise NotImplementedError(
'Unsupported property %r on %r'
% (mname, obj))
+ elif opcode == 71: # returnvoid
+ res = undefined
+ return res
elif opcode == 72: # returnvalue
res = stack.pop()
return res
+ elif opcode == 73: # constructsuper
+ # Not yet implemented, just hope it works without it
+ arg_count = u30()
+ args = list(reversed(
+ [stack.pop() for _ in range(arg_count)]))
+ obj = stack.pop()
elif opcode == 74: # constructproperty
index = u30()
arg_count = u30()
@@ -481,6 +626,17 @@ class SWFInterpreter(object):
args = list(reversed(
[stack.pop() for _ in range(arg_count)]))
obj = stack.pop()
+ if isinstance(obj, _AVMClass_Object):
+ func = self.extract_function(obj.avm_class, mname)
+ res = func(args)
+ assert res is undefined
+ continue
+ if isinstance(obj, _ScopeDict):
+ assert mname in obj.avm_class.method_names
+ func = self.extract_function(obj.avm_class, mname)
+ res = func(args)
+ assert res is undefined
+ continue
if mname == 'reverse':
assert isinstance(obj, list)
obj.reverse()
@@ -504,7 +660,10 @@ class SWFInterpreter(object):
break
else:
res = scopes[0]
- stack.append(res[mname])
+ if mname not in res and mname in _builtin_classes:
+ stack.append(_builtin_classes[mname])
+ else:
+ stack.append(res[mname])
elif opcode == 94: # findproperty
index = u30()
mname = self.multinames[index]
@@ -524,9 +683,15 @@ class SWFInterpreter(object):
break
else:
scope = avm_class.variables
- # I cannot find where static variables are initialized
- # so let's just return None
- res = scope.get(mname)
+
+ if mname in scope:
+ res = scope[mname]
+ elif mname in _builtin_classes:
+ res = _builtin_classes[mname]
+ else:
+ # Assume uninitialized
+ # TODO warn here
+ res = undefined
stack.append(res)
elif opcode == 97: # setproperty
index = u30()
@@ -548,22 +713,57 @@ class SWFInterpreter(object):
pname = self.multinames[index]
if pname == 'length':
obj = stack.pop()
- assert isinstance(obj, list)
+ assert isinstance(obj, (compat_str, list))
stack.append(len(obj))
+ elif isinstance(pname, compat_str): # Member access
+ obj = stack.pop()
+ if isinstance(obj, _AVMClass):
+ res = obj.static_properties[pname]
+ stack.append(res)
+ continue
+
+ assert isinstance(obj, (dict, _ScopeDict)),\
+ 'Accessing member %r on %r' % (pname, obj)
+ res = obj.get(pname, undefined)
+ stack.append(res)
else: # Assume attribute access
idx = stack.pop()
assert isinstance(idx, int)
obj = stack.pop()
assert isinstance(obj, list)
stack.append(obj[idx])
+ elif opcode == 104: # initproperty
+ index = u30()
+ value = stack.pop()
+ idx = self.multinames[index]
+ if isinstance(idx, _Multiname):
+ idx = stack.pop()
+ obj = stack.pop()
+ obj[idx] = value
elif opcode == 115: # convert_
value = stack.pop()
intvalue = int(value)
stack.append(intvalue)
elif opcode == 128: # coerce
u30()
+ elif opcode == 130: # coerce_a
+ value = stack.pop()
+ # um, yes, it's any value
+ stack.append(value)
elif opcode == 133: # coerce_s
assert isinstance(stack[-1], (type(None), compat_str))
+ elif opcode == 147: # decrement
+ value = stack.pop()
+ assert isinstance(value, int)
+ stack.append(value - 1)
+ elif opcode == 149: # typeof
+ value = stack.pop()
+ return {
+ _Undefined: 'undefined',
+ compat_str: 'String',
+ int: 'Number',
+ float: 'Number',
+ }[type(value)]
elif opcode == 160: # add
value2 = stack.pop()
value1 = stack.pop()
@@ -574,16 +774,37 @@ class SWFInterpreter(object):
value1 = stack.pop()
res = value1 - value2
stack.append(res)
+ elif opcode == 162: # multiply
+ value2 = stack.pop()
+ value1 = stack.pop()
+ res = value1 * value2
+ stack.append(res)
elif opcode == 164: # modulo
value2 = stack.pop()
value1 = stack.pop()
res = value1 % value2
stack.append(res)
+ elif opcode == 168: # bitand
+ value2 = stack.pop()
+ value1 = stack.pop()
+ assert isinstance(value1, int)
+ assert isinstance(value2, int)
+ res = value1 & value2
+ stack.append(res)
+ elif opcode == 171: # equals
+ value2 = stack.pop()
+ value1 = stack.pop()
+ result = value1 == value2
+ stack.append(result)
elif opcode == 175: # greaterequals
value2 = stack.pop()
value1 = stack.pop()
result = value1 >= value2
stack.append(result)
+ elif opcode == 192: # increment_i
+ value = stack.pop()
+ assert isinstance(value, int)
+ stack.append(value + 1)
elif opcode == 208: # getlocal_0
stack.append(registers[0])
elif opcode == 209: # getlocal_1
@@ -606,4 +827,3 @@ class SWFInterpreter(object):
avm_class.method_pyfunctions[func_name] = resfunc
return resfunc
-
diff --git a/youtube_dl/update.py b/youtube_dl/update.py
index 273083761..d8be4049f 100644
--- a/youtube_dl/update.py
+++ b/youtube_dl/update.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
import io
import json
import traceback
@@ -7,20 +9,19 @@ import subprocess
import sys
from zipimport import zipimporter
-from .utils import (
+from .compat import (
compat_str,
compat_urllib_request,
)
+from .utils import make_HTTPS_handler
from .version import __version__
+
def rsa_verify(message, signature, key):
from struct import pack
from hashlib import sha256
- from sys import version_info
- def b(x):
- if version_info[0] == 2: return x
- else: return x.encode('latin1')
- assert(type(message) == type(b('')))
+
+ assert isinstance(message, bytes)
block_size = 0
n = key[0]
while n:
@@ -31,14 +32,18 @@ def rsa_verify(message, signature, key):
while signature:
raw_bytes.insert(0, pack("B", signature & 0xFF))
signature >>= 8
- signature = (block_size - len(raw_bytes)) * b('\x00') + b('').join(raw_bytes)
- if signature[0:2] != b('\x00\x01'): return False
+ signature = (block_size - len(raw_bytes)) * b'\x00' + b''.join(raw_bytes)
+ if signature[0:2] != b'\x00\x01':
+ return False
signature = signature[2:]
- if not b('\x00') in signature: return False
- signature = signature[signature.index(b('\x00'))+1:]
- if not signature.startswith(b('\x30\x31\x30\x0D\x06\x09\x60\x86\x48\x01\x65\x03\x04\x02\x01\x05\x00\x04\x20')): return False
+ if b'\x00' not in signature:
+ return False
+ signature = signature[signature.index(b'\x00') + 1:]
+ if not signature.startswith(b'\x30\x31\x30\x0D\x06\x09\x60\x86\x48\x01\x65\x03\x04\x02\x01\x05\x00\x04\x20'):
+ return False
signature = signature[19:]
- if signature != sha256(message).digest(): return False
+ if signature != sha256(message).digest():
+ return False
return True
@@ -51,35 +56,40 @@ def update_self(to_screen, verbose):
UPDATES_RSA_KEY = (0x9d60ee4d8f805312fdb15a62f87b95bd66177b91df176765d13514a0f1754bcd2057295c5b6f1d35daa6742c3ffc9a82d3e118861c207995a8031e151d863c9927e304576bc80692bc8e094896fcf11b66f3e29e04e3a71e9a11558558acea1840aec37fc396fb6b65dc81a1c4144e03bd1c011de62e3f1357b327d08426fe93, 65537)
if not isinstance(globals().get('__loader__'), zipimporter) and not hasattr(sys, "frozen"):
- to_screen(u'It looks like you installed youtube-dl with a package manager, pip, setup.py or a tarball. Please use that to update.')
+ to_screen('It looks like you installed youtube-dl with a package manager, pip, setup.py or a tarball. Please use that to update.')
return
+ https_handler = make_HTTPS_handler({})
+ opener = compat_urllib_request.build_opener(https_handler)
+
# Check if there is a new version
try:
- newversion = compat_urllib_request.urlopen(VERSION_URL).read().decode('utf-8').strip()
+ newversion = opener.open(VERSION_URL).read().decode('utf-8').strip()
except:
- if verbose: to_screen(compat_str(traceback.format_exc()))
- to_screen(u'ERROR: can\'t find the current version. Please try again later.')
+ if verbose:
+ to_screen(compat_str(traceback.format_exc()))
+ to_screen('ERROR: can\'t find the current version. Please try again later.')
return
if newversion == __version__:
- to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
+ to_screen('youtube-dl is up-to-date (' + __version__ + ')')
return
# Download and check versions info
try:
- versions_info = compat_urllib_request.urlopen(JSON_URL).read().decode('utf-8')
+ versions_info = opener.open(JSON_URL).read().decode('utf-8')
versions_info = json.loads(versions_info)
except:
- if verbose: to_screen(compat_str(traceback.format_exc()))
- to_screen(u'ERROR: can\'t obtain versions info. Please try again later.')
+ if verbose:
+ to_screen(compat_str(traceback.format_exc()))
+ to_screen('ERROR: can\'t obtain versions info. Please try again later.')
return
- if not 'signature' in versions_info:
- to_screen(u'ERROR: the versions file is not signed or corrupted. Aborting.')
+ if 'signature' not in versions_info:
+ to_screen('ERROR: the versions file is not signed or corrupted. Aborting.')
return
signature = versions_info['signature']
del versions_info['signature']
if not rsa_verify(json.dumps(versions_info, sort_keys=True).encode('utf-8'), signature, UPDATES_RSA_KEY):
- to_screen(u'ERROR: the versions file signature is invalid. Aborting.')
+ to_screen('ERROR: the versions file signature is invalid. Aborting.')
return
version_id = versions_info['latest']
@@ -87,10 +97,10 @@ def update_self(to_screen, verbose):
def version_tuple(version_str):
return tuple(map(int, version_str.split('.')))
if version_tuple(__version__) >= version_tuple(version_id):
- to_screen(u'youtube-dl is up to date (%s)' % __version__)
+ to_screen('youtube-dl is up to date (%s)' % __version__)
return
- to_screen(u'Updating to version ' + version_id + ' ...')
+ to_screen('Updating to version ' + version_id + ' ...')
version = versions_info['versions'][version_id]
print_notes(to_screen, versions_info['versions'])
@@ -98,11 +108,11 @@ def update_self(to_screen, verbose):
filename = sys.argv[0]
# Py2EXE: Filename could be different
if hasattr(sys, "frozen") and not os.path.isfile(filename):
- if os.path.isfile(filename + u'.exe'):
- filename += u'.exe'
+ if os.path.isfile(filename + '.exe'):
+ filename += '.exe'
if not os.access(filename, os.W_OK):
- to_screen(u'ERROR: no write permissions on %s' % filename)
+ to_screen('ERROR: no write permissions on %s' % filename)
return
# Py2EXE
@@ -110,86 +120,93 @@ def update_self(to_screen, verbose):
exe = os.path.abspath(filename)
directory = os.path.dirname(exe)
if not os.access(directory, os.W_OK):
- to_screen(u'ERROR: no write permissions on %s' % directory)
+ to_screen('ERROR: no write permissions on %s' % directory)
return
try:
- urlh = compat_urllib_request.urlopen(version['exe'][0])
+ urlh = opener.open(version['exe'][0])
newcontent = urlh.read()
urlh.close()
except (IOError, OSError):
- if verbose: to_screen(compat_str(traceback.format_exc()))
- to_screen(u'ERROR: unable to download latest version')
+ if verbose:
+ to_screen(compat_str(traceback.format_exc()))
+ to_screen('ERROR: unable to download latest version')
return
newcontent_hash = hashlib.sha256(newcontent).hexdigest()
if newcontent_hash != version['exe'][1]:
- to_screen(u'ERROR: the downloaded file hash does not match. Aborting.')
+ to_screen('ERROR: the downloaded file hash does not match. Aborting.')
return
try:
with open(exe + '.new', 'wb') as outf:
outf.write(newcontent)
except (IOError, OSError):
- if verbose: to_screen(compat_str(traceback.format_exc()))
- to_screen(u'ERROR: unable to write the new version')
+ if verbose:
+ to_screen(compat_str(traceback.format_exc()))
+ to_screen('ERROR: unable to write the new version')
return
try:
bat = os.path.join(directory, 'youtube-dl-updater.bat')
with io.open(bat, 'w') as batfile:
- batfile.write(u"""
+ batfile.write('''
@echo off
echo Waiting for file handle to be closed ...
ping 127.0.0.1 -n 5 -w 1000 > NUL
move /Y "%s.new" "%s" > NUL
echo Updated youtube-dl to version %s.
start /b "" cmd /c del "%%~f0"&exit /b"
- \n""" % (exe, exe, version_id))
+ \n''' % (exe, exe, version_id))
subprocess.Popen([bat]) # Continues to run in the background
return # Do not show premature success messages
except (IOError, OSError):
- if verbose: to_screen(compat_str(traceback.format_exc()))
- to_screen(u'ERROR: unable to overwrite current version')
+ if verbose:
+ to_screen(compat_str(traceback.format_exc()))
+ to_screen('ERROR: unable to overwrite current version')
return
# Zip unix package
elif isinstance(globals().get('__loader__'), zipimporter):
try:
- urlh = compat_urllib_request.urlopen(version['bin'][0])
+ urlh = opener.open(version['bin'][0])
newcontent = urlh.read()
urlh.close()
except (IOError, OSError):
- if verbose: to_screen(compat_str(traceback.format_exc()))
- to_screen(u'ERROR: unable to download latest version')
+ if verbose:
+ to_screen(compat_str(traceback.format_exc()))
+ to_screen('ERROR: unable to download latest version')
return
newcontent_hash = hashlib.sha256(newcontent).hexdigest()
if newcontent_hash != version['bin'][1]:
- to_screen(u'ERROR: the downloaded file hash does not match. Aborting.')
+ to_screen('ERROR: the downloaded file hash does not match. Aborting.')
return
try:
with open(filename, 'wb') as outf:
outf.write(newcontent)
except (IOError, OSError):
- if verbose: to_screen(compat_str(traceback.format_exc()))
- to_screen(u'ERROR: unable to overwrite current version')
+ if verbose:
+ to_screen(compat_str(traceback.format_exc()))
+ to_screen('ERROR: unable to overwrite current version')
return
- to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
+ to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
+
def get_notes(versions, fromVersion):
notes = []
- for v,vdata in sorted(versions.items()):
+ for v, vdata in sorted(versions.items()):
if v > fromVersion:
notes.extend(vdata.get('notes', []))
return notes
+
def print_notes(to_screen, versions, fromVersion=__version__):
notes = get_notes(versions, fromVersion)
if notes:
- to_screen(u'PLEASE NOTE:')
+ to_screen('PLEASE NOTE:')
for note in notes:
to_screen(note)
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index d7ae5a90a..b8c52af74 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1,6 +1,8 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
import calendar
import codecs
import contextlib
@@ -8,7 +10,7 @@ import ctypes
import datetime
import email.utils
import errno
-import getpass
+import functools
import gzip
import itertools
import io
@@ -29,179 +31,22 @@ import traceback
import xml.etree.ElementTree
import zlib
-try:
- import urllib.request as compat_urllib_request
-except ImportError: # Python 2
- import urllib2 as compat_urllib_request
-
-try:
- import urllib.error as compat_urllib_error
-except ImportError: # Python 2
- import urllib2 as compat_urllib_error
-
-try:
- import urllib.parse as compat_urllib_parse
-except ImportError: # Python 2
- import urllib as compat_urllib_parse
-
-try:
- from urllib.parse import urlparse as compat_urllib_parse_urlparse
-except ImportError: # Python 2
- from urlparse import urlparse as compat_urllib_parse_urlparse
-
-try:
- import urllib.parse as compat_urlparse
-except ImportError: # Python 2
- import urlparse as compat_urlparse
+from .compat import (
+ compat_chr,
+ compat_getenv,
+ compat_html_entities,
+ compat_http_client,
+ compat_parse_qs,
+ compat_socket_create_connection,
+ compat_str,
+ compat_urllib_error,
+ compat_urllib_parse,
+ compat_urllib_parse_urlparse,
+ compat_urllib_request,
+ compat_urlparse,
+ shlex_quote,
+)
-try:
- import http.cookiejar as compat_cookiejar
-except ImportError: # Python 2
- import cookielib as compat_cookiejar
-
-try:
- import html.entities as compat_html_entities
-except ImportError: # Python 2
- import htmlentitydefs as compat_html_entities
-
-try:
- import html.parser as compat_html_parser
-except ImportError: # Python 2
- import HTMLParser as compat_html_parser
-
-try:
- import http.client as compat_http_client
-except ImportError: # Python 2
- import httplib as compat_http_client
-
-try:
- from urllib.error import HTTPError as compat_HTTPError
-except ImportError: # Python 2
- from urllib2 import HTTPError as compat_HTTPError
-
-try:
- from urllib.request import urlretrieve as compat_urlretrieve
-except ImportError: # Python 2
- from urllib import urlretrieve as compat_urlretrieve
-
-
-try:
- from subprocess import DEVNULL
- compat_subprocess_get_DEVNULL = lambda: DEVNULL
-except ImportError:
- compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
-
-try:
- from urllib.parse import unquote as compat_urllib_parse_unquote
-except ImportError:
- def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
- if string == '':
- return string
- res = string.split('%')
- if len(res) == 1:
- return string
- if encoding is None:
- encoding = 'utf-8'
- if errors is None:
- errors = 'replace'
- # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
- pct_sequence = b''
- string = res[0]
- for item in res[1:]:
- try:
- if not item:
- raise ValueError
- pct_sequence += item[:2].decode('hex')
- rest = item[2:]
- if not rest:
- # This segment was just a single percent-encoded character.
- # May be part of a sequence of code units, so delay decoding.
- # (Stored in pct_sequence).
- continue
- except ValueError:
- rest = '%' + item
- # Encountered non-percent-encoded characters. Flush the current
- # pct_sequence.
- string += pct_sequence.decode(encoding, errors) + rest
- pct_sequence = b''
- if pct_sequence:
- # Flush the final pct_sequence
- string += pct_sequence.decode(encoding, errors)
- return string
-
-
-try:
- from urllib.parse import parse_qs as compat_parse_qs
-except ImportError: # Python 2
- # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
- # Python 2's version is apparently totally broken
-
- def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
- encoding='utf-8', errors='replace'):
- qs, _coerce_result = qs, unicode
- pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
- r = []
- for name_value in pairs:
- if not name_value and not strict_parsing:
- continue
- nv = name_value.split('=', 1)
- if len(nv) != 2:
- if strict_parsing:
- raise ValueError("bad query field: %r" % (name_value,))
- # Handle case of a control-name with no equal sign
- if keep_blank_values:
- nv.append('')
- else:
- continue
- if len(nv[1]) or keep_blank_values:
- name = nv[0].replace('+', ' ')
- name = compat_urllib_parse_unquote(
- name, encoding=encoding, errors=errors)
- name = _coerce_result(name)
- value = nv[1].replace('+', ' ')
- value = compat_urllib_parse_unquote(
- value, encoding=encoding, errors=errors)
- value = _coerce_result(value)
- r.append((name, value))
- return r
-
- def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
- encoding='utf-8', errors='replace'):
- parsed_result = {}
- pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
- encoding=encoding, errors=errors)
- for name, value in pairs:
- if name in parsed_result:
- parsed_result[name].append(value)
- else:
- parsed_result[name] = [value]
- return parsed_result
-
-try:
- compat_str = unicode # Python 2
-except NameError:
- compat_str = str
-
-try:
- compat_chr = unichr # Python 2
-except NameError:
- compat_chr = chr
-
-try:
- from xml.etree.ElementTree import ParseError as compat_xml_parse_error
-except ImportError: # Python 2.6
- from xml.parsers.expat import ExpatError as compat_xml_parse_error
-
-try:
- from shlex import quote as shlex_quote
-except ImportError: # Python < 3.3
- def shlex_quote(s):
- return "'" + s.replace("'", "'\"'\"'") + "'"
-
-
-def compat_ord(c):
- if type(c) is int: return c
- else: return ord(c)
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))
@@ -214,6 +59,7 @@ std_headers = {
'Accept-Language': 'en-us,en;q=0.5',
}
+
def preferredencoding():
"""Get preferred encoding.
@@ -222,28 +68,33 @@ def preferredencoding():
"""
try:
pref = locale.getpreferredencoding()
- u'TEST'.encode(pref)
+ 'TEST'.encode(pref)
except:
pref = 'UTF-8'
return pref
-if sys.version_info < (3,0):
- def compat_print(s):
- print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
-else:
- def compat_print(s):
- assert type(s) == type(u'')
- print(s)
-
def write_json_file(obj, fn):
- """ Encode obj as JSON and write it to fn, atomically """
+ """ Encode obj as JSON and write it to fn, atomically if possible """
+
+ fn = encodeFilename(fn)
+ if sys.version_info < (3, 0) and sys.platform != 'win32':
+ encoding = get_filesystem_encoding()
+ # os.path.basename returns a bytes object, but NamedTemporaryFile
+ # will fail if the filename contains non ascii characters unless we
+ # use a unicode object
+ path_basename = lambda f: os.path.basename(fn).decode(encoding)
+ # the same for os.path.dirname
+ path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
+ else:
+ path_basename = os.path.basename
+ path_dirname = os.path.dirname
args = {
'suffix': '.tmp',
- 'prefix': os.path.basename(fn) + '.',
- 'dir': os.path.dirname(fn),
+ 'prefix': path_basename(fn) + '.',
+ 'dir': path_dirname(fn),
'delete': False,
}
@@ -262,6 +113,13 @@ def write_json_file(obj, fn):
try:
with tf:
json.dump(obj, tf)
+ if sys.platform == 'win32':
+ # Need to remove existing file on Windows, else os.rename raises
+ # WindowsError or FileExistsError.
+ try:
+ os.unlink(fn)
+ except OSError:
+ pass
os.rename(tf.name, fn)
except:
try:
@@ -276,7 +134,7 @@ if sys.version_info >= (2, 7):
""" Find the xpath xpath[@key=val] """
assert re.match(r'^[a-zA-Z-]+$', key)
assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
- expr = xpath + u"[@%s='%s']" % (key, val)
+ expr = xpath + "[@%s='%s']" % (key, val)
return node.find(expr)
else:
def find_xpath_attr(node, xpath, key, val):
@@ -292,6 +150,8 @@ else:
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
+
+
def xpath_with_ns(path, ns_map):
components = [c.split(':') for c in path.split('/')]
replaced = []
@@ -309,7 +169,7 @@ def xpath_text(node, xpath, name=None, fatal=False):
xpath = xpath.encode('ascii')
n = node.find(xpath)
- if n is None:
+ if n is None or n.text is None:
if fatal:
name = xpath if name is None else name
raise ExtractorError('Could not find XML element %s' % name)
@@ -318,131 +178,40 @@ def xpath_text(node, xpath, name=None, fatal=False):
return n.text
-compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
-class BaseHTMLParser(compat_html_parser.HTMLParser):
- def __init(self):
- compat_html_parser.HTMLParser.__init__(self)
- self.html = None
-
- def loads(self, html):
- self.html = html
- self.feed(html)
- self.close()
-
-class AttrParser(BaseHTMLParser):
- """Modified HTMLParser that isolates a tag with the specified attribute"""
- def __init__(self, attribute, value):
- self.attribute = attribute
- self.value = value
- self.result = None
- self.started = False
- self.depth = {}
- self.watch_startpos = False
- self.error_count = 0
- BaseHTMLParser.__init__(self)
-
- def error(self, message):
- if self.error_count > 10 or self.started:
- raise compat_html_parser.HTMLParseError(message, self.getpos())
- self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
- self.error_count += 1
- self.goahead(1)
-
- def handle_starttag(self, tag, attrs):
- attrs = dict(attrs)
- if self.started:
- self.find_startpos(None)
- if self.attribute in attrs and attrs[self.attribute] == self.value:
- self.result = [tag]
- self.started = True
- self.watch_startpos = True
- if self.started:
- if not tag in self.depth: self.depth[tag] = 0
- self.depth[tag] += 1
-
- def handle_endtag(self, tag):
- if self.started:
- if tag in self.depth: self.depth[tag] -= 1
- if self.depth[self.result[0]] == 0:
- self.started = False
- self.result.append(self.getpos())
-
- def find_startpos(self, x):
- """Needed to put the start position of the result (self.result[1])
- after the opening tag with the requested id"""
- if self.watch_startpos:
- self.watch_startpos = False
- self.result.append(self.getpos())
- handle_entityref = handle_charref = handle_data = handle_comment = \
- handle_decl = handle_pi = unknown_decl = find_startpos
-
- def get_result(self):
- if self.result is None:
- return None
- if len(self.result) != 3:
- return None
- lines = self.html.split('\n')
- lines = lines[self.result[1][0]-1:self.result[2][0]]
- lines[0] = lines[0][self.result[1][1]:]
- if len(lines) == 1:
- lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
- lines[-1] = lines[-1][:self.result[2][1]]
- return '\n'.join(lines).strip()
-# Hack for https://github.com/rg3/youtube-dl/issues/662
-if sys.version_info < (2, 7, 3):
- AttrParser.parse_endtag = (lambda self, i:
- i + len("</scr'+'ipt>")
- if self.rawdata[i:].startswith("</scr'+'ipt>")
- else compat_html_parser.HTMLParser.parse_endtag(self, i))
-
def get_element_by_id(id, html):
"""Return the content of the tag with the specified ID in the passed HTML document"""
return get_element_by_attribute("id", id, html)
+
def get_element_by_attribute(attribute, value, html):
"""Return the content of the tag with the specified attribute in the passed HTML document"""
- parser = AttrParser(attribute, value)
- try:
- parser.loads(html)
- except compat_html_parser.HTMLParseError:
- pass
- return parser.get_result()
-class MetaParser(BaseHTMLParser):
- """
- Modified HTMLParser that isolates a meta tag with the specified name
- attribute.
- """
- def __init__(self, name):
- BaseHTMLParser.__init__(self)
- self.name = name
- self.content = None
- self.result = None
-
- def handle_starttag(self, tag, attrs):
- if tag != 'meta':
- return
- attrs = dict(attrs)
- if attrs.get('name') == self.name:
- self.result = attrs.get('content')
+ m = re.search(r'''(?xs)
+ <([a-zA-Z0-9:._-]+)
+ (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
+ \s+%s=['"]?%s['"]?
+ (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
+ \s*>
+ (?P<content>.*?)
+ </\1>
+ ''' % (re.escape(attribute), re.escape(value)), html)
- def get_result(self):
- return self.result
+ if not m:
+ return None
+ res = m.group('content')
-def get_meta_content(name, html):
- """
- Return the content attribute from the meta tag with the given name attribute.
- """
- parser = MetaParser(name)
- try:
- parser.loads(html)
- except compat_html_parser.HTMLParseError:
- pass
- return parser.get_result()
+ if res.startswith('"') or res.startswith("'"):
+ res = res[1:-1]
+
+ return unescapeHTML(res)
def clean_html(html):
"""Clean an HTML snippet into a readable string"""
+
+ if html is None: # Convenience for sanitizing descriptions etc.
+ return html
+
# Newline vs <br />
html = html.replace('\n', ' ')
html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
@@ -465,7 +234,7 @@ def sanitize_open(filename, open_mode):
It returns the tuple (stream, definitive_file_name).
"""
try:
- if filename == u'-':
+ if filename == '-':
if sys.platform == 'win32':
import msvcrt
msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
@@ -478,9 +247,9 @@ def sanitize_open(filename, open_mode):
# In case of error, try to remove win32 forbidden chars
alt_filename = os.path.join(
- re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
- for path_part in os.path.split(filename)
- )
+ re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
+ for path_part in os.path.split(filename)
+ )
if alt_filename == filename:
raise
else:
@@ -497,6 +266,7 @@ def timeconvert(timestr):
timestamp = email.utils.mktime_tz(timetuple)
return timestamp
+
def sanitize_filename(s, restricted=False, is_id=False):
"""Sanitizes a string so it could be used as part of a filename.
If restricted is set, use a stricter subset of allowed characters.
@@ -517,7 +287,9 @@ def sanitize_filename(s, restricted=False, is_id=False):
return '_'
return char
- result = u''.join(map(replace_insane, s))
+ # Handle timestamps
+ s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
+ result = ''.join(map(replace_insane, s))
if not is_id:
while '__' in result:
result = result.replace('__', '_')
@@ -529,6 +301,7 @@ def sanitize_filename(s, restricted=False, is_id=False):
result = '_'
return result
+
def orderedSet(iterable):
""" Remove all duplicates from the input iterable """
res = []
@@ -547,15 +320,15 @@ def _htmlentity_transform(entity):
mobj = re.match(r'#(x?[0-9]+)', entity)
if mobj is not None:
numstr = mobj.group(1)
- if numstr.startswith(u'x'):
+ if numstr.startswith('x'):
base = 16
- numstr = u'0%s' % numstr
+ numstr = '0%s' % numstr
else:
base = 10
return compat_chr(int(numstr, base))
# Unknown entity in name, return its literal representation
- return (u'&%s;' % entity)
+ return ('&%s;' % entity)
def unescapeHTML(s):
@@ -579,7 +352,7 @@ def encodeFilename(s, for_subprocess=False):
return s
if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
- # Pass u'' directly to use Unicode APIs on Windows 2000 and up
+ # Pass '' directly to use Unicode APIs on Windows 2000 and up
# (Detecting Windows NT 4 is tricky because 'major >= 4' would
# match Windows 9x series as well. Besides, NT 4 is obsolete.)
if not for_subprocess:
@@ -599,7 +372,7 @@ def encodeArgument(s):
if not isinstance(s, compat_str):
# Legacy code that uses byte strings
# Uncomment the following line after fixing all post processors
- #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
+ # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
s = s.decode('ascii')
return encodeFilename(s, True)
@@ -613,6 +386,7 @@ def decodeOption(optval):
assert isinstance(optval, compat_str)
return optval
+
def formatSeconds(secs):
if secs > 3600:
return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
@@ -622,48 +396,34 @@ def formatSeconds(secs):
return '%d' % secs
-def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
- if sys.version_info < (3, 2):
- import httplib
-
- class HTTPSConnectionV3(httplib.HTTPSConnection):
- def __init__(self, *args, **kwargs):
- httplib.HTTPSConnection.__init__(self, *args, **kwargs)
-
- def connect(self):
- sock = socket.create_connection((self.host, self.port), self.timeout)
- if getattr(self, '_tunnel_host', False):
- self.sock = sock
- self._tunnel()
- try:
- self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
- except ssl.SSLError:
- self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
-
- class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
- def https_open(self, req):
- return self.do_open(HTTPSConnectionV3, req)
- return HTTPSHandlerV3(**kwargs)
- elif hasattr(ssl, 'create_default_context'): # Python >= 3.4
- context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
- context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3
+def make_HTTPS_handler(params, **kwargs):
+ opts_no_check_certificate = params.get('nocheckcertificate', False)
+ if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
+ context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
if opts_no_check_certificate:
+ context.check_hostname = False
context.verify_mode = ssl.CERT_NONE
- return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
+ try:
+ return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
+ except TypeError:
+ # Python 2.7.8
+ # (create_default_context present but HTTPSHandler has no context=)
+ pass
+
+ if sys.version_info < (3, 2):
+ return YoutubeDLHTTPSHandler(params, **kwargs)
else: # Python < 3.4
- context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
+ context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
context.verify_mode = (ssl.CERT_NONE
if opts_no_check_certificate
else ssl.CERT_REQUIRED)
context.set_default_verify_paths()
- try:
- context.load_default_certs()
- except AttributeError:
- pass # Python < 3.4
- return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
+ return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
+
class ExtractorError(Exception):
"""Error during info extraction."""
+
def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
""" tb, if given, is the original traceback (so that it can be printed out).
If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
@@ -674,9 +434,15 @@ class ExtractorError(Exception):
if video_id is not None:
msg = video_id + ': ' + msg
if cause:
- msg += u' (caused by %r)' % cause
+ msg += ' (caused by %r)' % cause
if not expected:
- msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
+ if ytdl_is_updateable():
+ update_cmd = 'type youtube-dl -U to update'
+ else:
+ update_cmd = 'see https://yt-dl.org/update on how to update'
+ msg += '; please report this issue on https://yt-dl.org/bug .'
+ msg += ' Make sure you are using the latest version; %s.' % update_cmd
+ msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
super(ExtractorError, self).__init__(msg)
self.traceback = tb
@@ -687,7 +453,14 @@ class ExtractorError(Exception):
def format_traceback(self):
if self.traceback is None:
return None
- return u''.join(traceback.format_tb(self.traceback))
+ return ''.join(traceback.format_tb(self.traceback))
+
+
+class UnsupportedError(ExtractorError):
+ def __init__(self, url):
+ super(UnsupportedError, self).__init__(
+ 'Unsupported URL: %s' % url, expected=True)
+ self.url = url
class RegexNotFoundError(ExtractorError):
@@ -702,6 +475,7 @@ class DownloadError(Exception):
configured to continue on errors. They will contain the appropriate
error message.
"""
+
def __init__(self, msg, exc_info=None):
""" exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
super(DownloadError, self).__init__(msg)
@@ -723,9 +497,11 @@ class PostProcessingError(Exception):
This exception may be raised by PostProcessor's .run() method to
indicate an error in the postprocessing task.
"""
+
def __init__(self, msg):
self.msg = msg
+
class MaxDownloadsReached(Exception):
""" --max-downloads limit has been reached. """
pass
@@ -755,6 +531,29 @@ class ContentTooShortError(Exception):
self.downloaded = downloaded
self.expected = expected
+
+def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
+ hc = http_class(*args, **kwargs)
+ source_address = ydl_handler._params.get('source_address')
+ if source_address is not None:
+ sa = (source_address, 0)
+ if hasattr(hc, 'source_address'): # Python 2.7+
+ hc.source_address = sa
+ else: # Python 2.6
+ def _hc_connect(self, *args, **kwargs):
+ sock = compat_socket_create_connection(
+ (self.host, self.port), self.timeout, sa)
+ if is_https:
+ self.sock = ssl.wrap_socket(
+ sock, self.key_file, self.cert_file,
+ ssl_version=ssl.PROTOCOL_TLSv1)
+ else:
+ self.sock = sock
+ hc.connect = functools.partial(_hc_connect, hc)
+
+ return hc
+
+
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
"""Handler for HTTP requests and responses.
@@ -773,6 +572,15 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
public domain.
"""
+ def __init__(self, params, *args, **kwargs):
+ compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
+ self._params = params
+
+ def http_open(self, req):
+ return self.do_open(functools.partial(
+ _create_http_connection, self, compat_http_client.HTTPConnection, False),
+ req)
+
@staticmethod
def deflate(data):
try:
@@ -790,17 +598,14 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
def http_request(self, req):
for h, v in std_headers.items():
- if h not in req.headers:
+ # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
+ # The dict keys are capitalized because of this bug by urllib
+ if h.capitalize() not in req.headers:
req.add_header(h, v)
if 'Youtubedl-no-compression' in req.headers:
if 'Accept-encoding' in req.headers:
del req.headers['Accept-encoding']
del req.headers['Youtubedl-no-compression']
- if 'Youtubedl-user-agent' in req.headers:
- if 'User-agent' in req.headers:
- del req.headers['User-agent']
- req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
- del req.headers['Youtubedl-user-agent']
if sys.version_info < (2, 7) and '#' in req.get_full_url():
# Python 2.6 is brain-dead when it comes to fragments
@@ -842,6 +647,18 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
https_response = http_response
+class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
+ def __init__(self, params, https_conn_class=None, *args, **kwargs):
+ compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
+ self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
+ self._params = params
+
+ def https_open(self, req):
+ return self.do_open(functools.partial(
+ _create_http_connection, self, self._https_conn_class, True),
+ req)
+
+
def parse_iso8601(date_str, delimiter='T'):
""" Return a UNIX timestamp from the given date """
@@ -849,7 +666,7 @@ def parse_iso8601(date_str, delimiter='T'):
return None
m = re.search(
- r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
+ r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
date_str)
if not m:
timezone = datetime.timedelta()
@@ -862,22 +679,24 @@ def parse_iso8601(date_str, delimiter='T'):
timezone = datetime.timedelta(
hours=sign * int(m.group('hours')),
minutes=sign * int(m.group('minutes')))
- date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
+ date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
dt = datetime.datetime.strptime(date_str, date_format) - timezone
return calendar.timegm(dt.timetuple())
-def unified_strdate(date_str):
+def unified_strdate(date_str, day_first=True):
"""Return a string with the date in the format YYYYMMDD"""
if date_str is None:
return None
-
upload_date = None
- #Replace commas
+ # Replace commas
date_str = date_str.replace(',', ' ')
# %z (UTC offset) is only supported in python>=3.2
date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
+ # Remove AM/PM + timezone
+ date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
+
format_expressions = [
'%d %B %Y',
'%d %b %Y',
@@ -886,13 +705,10 @@ def unified_strdate(date_str):
'%b %dst %Y %I:%M%p',
'%b %dnd %Y %I:%M%p',
'%b %dth %Y %I:%M%p',
+ '%Y %m %d',
'%Y-%m-%d',
'%Y/%m/%d',
- '%d.%m.%Y',
- '%d/%m/%Y',
- '%d/%m/%y',
'%Y/%m/%d %H:%M:%S',
- '%d/%m/%Y %H:%M:%S',
'%Y-%m-%d %H:%M:%S',
'%Y-%m-%d %H:%M:%S.%f',
'%d.%m.%Y %H:%M',
@@ -904,6 +720,20 @@ def unified_strdate(date_str):
'%Y-%m-%dT%H:%M:%S.%f',
'%Y-%m-%dT%H:%M',
]
+ if day_first:
+ format_expressions.extend([
+ '%d.%m.%Y',
+ '%d/%m/%Y',
+ '%d/%m/%y',
+ '%d/%m/%Y %H:%M:%S',
+ ])
+ else:
+ format_expressions.extend([
+ '%m.%d.%Y',
+ '%m/%d/%Y',
+ '%m/%d/%y',
+ '%m/%d/%Y %H:%M:%S',
+ ])
for expression in format_expressions:
try:
upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
@@ -915,25 +745,30 @@ def unified_strdate(date_str):
upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
return upload_date
-def determine_ext(url, default_ext=u'unknown_video'):
+
+def determine_ext(url, default_ext='unknown_video'):
if url is None:
return default_ext
- guess = url.partition(u'?')[0].rpartition(u'.')[2]
+ guess = url.partition('?')[0].rpartition('.')[2]
if re.match(r'^[A-Za-z0-9]+$', guess):
return guess
else:
return default_ext
+
def subtitles_filename(filename, sub_lang, sub_format):
- return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
+ return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
+
def date_from_str(date_str):
"""
Return a datetime object from a string in the format YYYYMMDD or
(now|today)[+-][0-9](day|week|month|year)(s)?"""
today = datetime.date.today()
- if date_str == 'now'or date_str == 'today':
+ if date_str in ('now', 'today'):
return today
+ if date_str == 'yesterday':
+ return today - datetime.timedelta(days=1)
match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
if match is not None:
sign = match.group('sign')
@@ -941,7 +776,7 @@ def date_from_str(date_str):
if sign == '-':
time = -time
unit = match.group('unit')
- #A bad aproximation?
+ # A bad aproximation?
if unit == 'month':
unit = 'day'
time *= 30
@@ -952,7 +787,8 @@ def date_from_str(date_str):
delta = datetime.timedelta(**{unit: time})
return today + delta
return datetime.datetime.strptime(date_str, "%Y%m%d").date()
-
+
+
def hyphenate_date(date_str):
"""
Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
@@ -962,8 +798,10 @@ def hyphenate_date(date_str):
else:
return date_str
+
class DateRange(object):
"""Represents a time interval between two dates"""
+
def __init__(self, start=None, end=None):
"""start and end must be strings in the format accepted by date"""
if start is not None:
@@ -976,17 +814,20 @@ class DateRange(object):
self.end = datetime.datetime.max.date()
if self.start > self.end:
raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
+
@classmethod
def day(cls, day):
"""Returns a range that only contains the given day"""
- return cls(day,day)
+ return cls(day, day)
+
def __contains__(self, date):
"""Check if the date is in the range"""
if not isinstance(date, datetime.date):
date = date_from_str(date)
return self.start <= date <= self.end
+
def __str__(self):
- return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
+ return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
def platform_name():
@@ -1017,27 +858,30 @@ def _windows_write_string(s, out):
except AttributeError:
# If the output stream doesn't have a fileno, it's virtual
return False
+ except io.UnsupportedOperation:
+ # Some strange Windows pseudo files?
+ return False
if fileno not in WIN_OUTPUT_IDS:
return False
GetStdHandle = ctypes.WINFUNCTYPE(
ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
- ("GetStdHandle", ctypes.windll.kernel32))
+ (b"GetStdHandle", ctypes.windll.kernel32))
h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
WriteConsoleW = ctypes.WINFUNCTYPE(
ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
- ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
+ ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
written = ctypes.wintypes.DWORD(0)
- GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
+ GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
FILE_TYPE_CHAR = 0x0002
FILE_TYPE_REMOTE = 0x8000
GetConsoleMode = ctypes.WINFUNCTYPE(
ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
ctypes.POINTER(ctypes.wintypes.DWORD))(
- ("GetConsoleMode", ctypes.windll.kernel32))
+ (b"GetConsoleMode", ctypes.windll.kernel32))
INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
def not_a_console(handle):
@@ -1105,10 +949,7 @@ def bytes_to_intlist(bs):
def intlist_to_bytes(xs):
if not xs:
return b''
- if isinstance(chr(0), bytes): # Python 2
- return ''.join([chr(x) for x in xs])
- else:
- return bytes(xs)
+ return struct_pack('%dB' % len(xs), *xs)
# Cross-platform file locking
@@ -1207,17 +1048,20 @@ class locked_file(object):
return self.f.read(*args)
+def get_filesystem_encoding():
+ encoding = sys.getfilesystemencoding()
+ return encoding if encoding is not None else 'utf-8'
+
+
def shell_quote(args):
quoted_args = []
- encoding = sys.getfilesystemencoding()
- if encoding is None:
- encoding = 'utf-8'
+ encoding = get_filesystem_encoding()
for a in args:
if isinstance(a, bytes):
# We may get a filename encoded with 'encodeFilename'
a = a.decode(encoding)
quoted_args.append(pipes.quote(a))
- return u' '.join(quoted_args)
+ return ' '.join(quoted_args)
def takewhile_inclusive(pred, seq):
@@ -1233,35 +1077,89 @@ def smuggle_url(url, data):
""" Pass additional data in a URL for internal use. """
sdata = compat_urllib_parse.urlencode(
- {u'__youtubedl_smuggle': json.dumps(data)})
- return url + u'#' + sdata
+ {'__youtubedl_smuggle': json.dumps(data)})
+ return url + '#' + sdata
def unsmuggle_url(smug_url, default=None):
- if not '#__youtubedl_smuggle' in smug_url:
+ if '#__youtubedl_smuggle' not in smug_url:
return smug_url, default
- url, _, sdata = smug_url.rpartition(u'#')
- jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
+ url, _, sdata = smug_url.rpartition('#')
+ jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
data = json.loads(jsond)
return url, data
def format_bytes(bytes):
if bytes is None:
- return u'N/A'
+ return 'N/A'
if type(bytes) is str:
bytes = float(bytes)
if bytes == 0.0:
exponent = 0
else:
exponent = int(math.log(bytes, 1024.0))
- suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
+ suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
converted = float(bytes) / float(1024 ** exponent)
- return u'%.2f%s' % (converted, suffix)
+ return '%.2f%s' % (converted, suffix)
+
+
+def parse_filesize(s):
+ if s is None:
+ return None
+
+ # The lower-case forms are of course incorrect and inofficial,
+ # but we support those too
+ _UNIT_TABLE = {
+ 'B': 1,
+ 'b': 1,
+ 'KiB': 1024,
+ 'KB': 1000,
+ 'kB': 1024,
+ 'Kb': 1000,
+ 'MiB': 1024 ** 2,
+ 'MB': 1000 ** 2,
+ 'mB': 1024 ** 2,
+ 'Mb': 1000 ** 2,
+ 'GiB': 1024 ** 3,
+ 'GB': 1000 ** 3,
+ 'gB': 1024 ** 3,
+ 'Gb': 1000 ** 3,
+ 'TiB': 1024 ** 4,
+ 'TB': 1000 ** 4,
+ 'tB': 1024 ** 4,
+ 'Tb': 1000 ** 4,
+ 'PiB': 1024 ** 5,
+ 'PB': 1000 ** 5,
+ 'pB': 1024 ** 5,
+ 'Pb': 1000 ** 5,
+ 'EiB': 1024 ** 6,
+ 'EB': 1000 ** 6,
+ 'eB': 1024 ** 6,
+ 'Eb': 1000 ** 6,
+ 'ZiB': 1024 ** 7,
+ 'ZB': 1000 ** 7,
+ 'zB': 1024 ** 7,
+ 'Zb': 1000 ** 7,
+ 'YiB': 1024 ** 8,
+ 'YB': 1000 ** 8,
+ 'yB': 1024 ** 8,
+ 'Yb': 1000 ** 8,
+ }
+
+ units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
+ m = re.match(
+ r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
+ if not m:
+ return None
+
+ num_str = m.group('num').replace(',', '.')
+ mult = _UNIT_TABLE[m.group('unit')]
+ return int(float(num_str) * mult)
def get_term_width():
- columns = os.environ.get('COLUMNS', None)
+ columns = compat_getenv('COLUMNS', None)
if columns:
return int(columns)
@@ -1280,8 +1178,8 @@ def month_by_name(name):
""" Return the number of a month by (locale-independently) English name """
ENGLISH_NAMES = [
- u'January', u'February', u'March', u'April', u'May', u'June',
- u'July', u'August', u'September', u'October', u'November', u'December']
+ 'January', 'February', 'March', 'April', 'May', 'June',
+ 'July', 'August', 'September', 'October', 'November', 'December']
try:
return ENGLISH_NAMES.index(name) + 1
except ValueError:
@@ -1292,7 +1190,7 @@ def fix_xml_ampersands(xml_str):
"""Replace all the '&' by '&amp;' in XML"""
return re.sub(
r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
- u'&amp;',
+ '&amp;',
xml_str)
@@ -1325,7 +1223,7 @@ def remove_end(s, end):
def url_basename(url):
path = compat_urlparse.urlparse(url).path
- return path.strip(u'/').split(u'/')[-1]
+ return path.strip('/').split('/')[-1]
class HEADRequest(compat_urllib_request.Request):
@@ -1350,7 +1248,7 @@ def str_to_int(int_str):
""" A more relaxed version of int_or_none """
if int_str is None:
return None
- int_str = re.sub(r'[,\.\+]', u'', int_str)
+ int_str = re.sub(r'[,\.\+]', '', int_str)
return int(int_str)
@@ -1359,28 +1257,44 @@ def float_or_none(v, scale=1, invscale=1, default=None):
def parse_duration(s):
- if s is None:
+ if not isinstance(s, basestring if sys.version_info < (3, 0) else compat_str):
return None
s = s.strip()
m = re.match(
- r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s)
+ r'''(?ix)(?:P?T)?
+ (?:
+ (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
+ (?P<only_hours>[0-9.]+)\s*(?:hours?)|
+
+ (?:
+ (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
+ (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
+ )?
+ (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
+ )$''', s)
if not m:
return None
- res = int(m.group('secs'))
+ res = 0
+ if m.group('only_mins'):
+ return float_or_none(m.group('only_mins'), invscale=60)
+ if m.group('only_hours'):
+ return float_or_none(m.group('only_hours'), invscale=60 * 60)
+ if m.group('secs'):
+ res += int(m.group('secs'))
if m.group('mins'):
res += int(m.group('mins')) * 60
- if m.group('hours'):
- res += int(m.group('hours')) * 60 * 60
+ if m.group('hours'):
+ res += int(m.group('hours')) * 60 * 60
if m.group('ms'):
res += float(m.group('ms'))
return res
def prepend_extension(filename, ext):
- name, real_ext = os.path.splitext(filename)
- return u'{0}.{1}{2}'.format(name, ext, real_ext)
+ name, real_ext = os.path.splitext(filename)
+ return '{0}.{1}{2}'.format(name, ext, real_ext)
def check_executable(exe, args=[]):
@@ -1393,6 +1307,32 @@ def check_executable(exe, args=[]):
return exe
+def get_exe_version(exe, args=['--version'],
+ version_re=None, unrecognized='present'):
+ """ Returns the version of the specified executable,
+ or False if the executable is not present """
+ try:
+ out, _ = subprocess.Popen(
+ [exe] + args,
+ stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
+ except OSError:
+ return False
+ if isinstance(out, bytes): # Python 2.x
+ out = out.decode('ascii', 'ignore')
+ return detect_exe_version(out, version_re, unrecognized)
+
+
+def detect_exe_version(output, version_re=None, unrecognized='present'):
+ assert isinstance(output, compat_str)
+ if version_re is None:
+ version_re = r'version\s+([-0-9._a-zA-Z]+)'
+ m = re.search(version_re, output)
+ if m:
+ return m.group(1)
+ else:
+ return unrecognized
+
+
class PagedList(object):
def __len__(self):
# This is only useful for tests
@@ -1483,7 +1423,7 @@ def escape_rfc3986(s):
"""Escape non-ASCII characters as suggested by RFC 3986"""
if sys.version_info < (3, 0) and isinstance(s, unicode):
s = s.encode('utf-8')
- return compat_urllib_parse.quote(s, "%/;:@&=+$,!~*'()?#[]")
+ return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
@@ -1497,7 +1437,7 @@ def escape_url(url):
).geturl()
try:
- struct.pack(u'!I', 0)
+ struct.pack('!I', 0)
except TypeError:
# In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
def struct_pack(spec, *args):
@@ -1518,7 +1458,7 @@ def read_batch_urls(batch_fd):
def fixup(url):
if not isinstance(url, compat_str):
url = url.decode('utf-8', 'replace')
- BOM_UTF8 = u'\xef\xbb\xbf'
+ BOM_UTF8 = '\xef\xbb\xbf'
if url.startswith(BOM_UTF8):
url = url[len(BOM_UTF8):]
url = url.strip()
@@ -1557,15 +1497,6 @@ def parse_xml(s):
return tree
-if sys.version_info < (3, 0) and sys.platform == 'win32':
- def compat_getpass(prompt, *args, **kwargs):
- if isinstance(prompt, compat_str):
- prompt = prompt.encode(preferredencoding())
- return getpass.getpass(prompt, *args, **kwargs)
-else:
- compat_getpass = getpass.getpass
-
-
US_RATINGS = {
'G': 0,
'PG': 10,
@@ -1583,7 +1514,8 @@ def parse_age_limit(s):
def strip_jsonp(code):
- return re.sub(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$', r'\1', code)
+ return re.sub(
+ r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
@@ -1623,18 +1555,6 @@ def qualities(quality_ids):
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
-try:
- subprocess_check_output = subprocess.check_output
-except AttributeError:
- def subprocess_check_output(*args, **kwargs):
- assert 'input' not in kwargs
- p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
- output, _ = p.communicate()
- ret = p.poll()
- if ret:
- raise subprocess.CalledProcessError(ret, p.args, output=output)
- return output
-
def limit_length(s, length):
""" Add ellipses to overly long strings """
@@ -1644,3 +1564,106 @@ def limit_length(s, length):
if len(s) > length:
return s[:length - len(ELLIPSES)] + ELLIPSES
return s
+
+
+def version_tuple(v):
+ return tuple(int(e) for e in re.split(r'[-.]', v))
+
+
+def is_outdated_version(version, limit, assume_new=True):
+ if not version:
+ return not assume_new
+ try:
+ return version_tuple(version) < version_tuple(limit)
+ except ValueError:
+ return not assume_new
+
+
+def ytdl_is_updateable():
+ """ Returns if youtube-dl can be updated with -U """
+ from zipimport import zipimporter
+
+ return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
+
+
+def args_to_str(args):
+ # Get a short string representation for a subprocess command
+ return ' '.join(shlex_quote(a) for a in args)
+
+
+def urlhandle_detect_ext(url_handle):
+ try:
+ url_handle.headers
+ getheader = lambda h: url_handle.headers[h]
+ except AttributeError: # Python < 3
+ getheader = url_handle.info().getheader
+
+ cd = getheader('Content-Disposition')
+ if cd:
+ m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
+ if m:
+ e = determine_ext(m.group('filename'), default_ext=None)
+ if e:
+ return e
+
+ return getheader('Content-Type').split("/")[1]
+
+
+def age_restricted(content_limit, age_limit):
+ """ Returns True iff the content should be blocked """
+
+ if age_limit is None: # No limit set
+ return False
+ if content_limit is None:
+ return False # Content available for everyone
+ return age_limit < content_limit
+
+
+def is_html(first_bytes):
+ """ Detect whether a file contains HTML by examining its first bytes. """
+
+ BOMS = [
+ (b'\xef\xbb\xbf', 'utf-8'),
+ (b'\x00\x00\xfe\xff', 'utf-32-be'),
+ (b'\xff\xfe\x00\x00', 'utf-32-le'),
+ (b'\xff\xfe', 'utf-16-le'),
+ (b'\xfe\xff', 'utf-16-be'),
+ ]
+ for bom, enc in BOMS:
+ if first_bytes.startswith(bom):
+ s = first_bytes[len(bom):].decode(enc, 'replace')
+ break
+ else:
+ s = first_bytes.decode('utf-8', 'replace')
+
+ return re.match(r'^\s*<', s)
+
+
+def determine_protocol(info_dict):
+ protocol = info_dict.get('protocol')
+ if protocol is not None:
+ return protocol
+
+ url = info_dict['url']
+ if url.startswith('rtmp'):
+ return 'rtmp'
+ elif url.startswith('mms'):
+ return 'mms'
+ elif url.startswith('rtsp'):
+ return 'rtsp'
+
+ ext = determine_ext(url)
+ if ext == 'm3u8':
+ return 'm3u8'
+ elif ext == 'f4m':
+ return 'f4m'
+
+ return compat_urllib_parse_urlparse(url).scheme
+
+
+def render_table(header_row, data):
+ """ Render a list of rows, each as a list of values """
+ table = [header_row] + data
+ max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
+ format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
+ return '\n'.join(format_str % tuple(row) for row in table)
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index e7f6adef1..35f3e1b6b 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,2 +1,3 @@
+from __future__ import unicode_literals
-__version__ = '2014.10.18'
+__version__ = '2015.01.23.4'