aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r--youtube_dl/extractor/__init__.py329
-rw-r--r--youtube_dl/extractor/abc.py67
-rw-r--r--youtube_dl/extractor/academicearth.py2
-rw-r--r--youtube_dl/extractor/addanime.py15
-rw-r--r--youtube_dl/extractor/adobetv.py67
-rw-r--r--youtube_dl/extractor/adultswim.py94
-rw-r--r--youtube_dl/extractor/aftenposten.py86
-rw-r--r--youtube_dl/extractor/aftonbladet.py28
-rw-r--r--youtube_dl/extractor/airmozilla.py74
-rw-r--r--youtube_dl/extractor/aljazeera.py5
-rw-r--r--youtube_dl/extractor/anitube.py4
-rw-r--r--youtube_dl/extractor/appleconnect.py50
-rw-r--r--youtube_dl/extractor/appletrailers.py80
-rw-r--r--youtube_dl/extractor/archiveorg.py2
-rw-r--r--youtube_dl/extractor/ard.py210
-rw-r--r--youtube_dl/extractor/arte.py18
-rw-r--r--youtube_dl/extractor/atresplayer.py26
-rw-r--r--youtube_dl/extractor/baidu.py69
-rw-r--r--youtube_dl/extractor/bambuser.py80
-rw-r--r--youtube_dl/extractor/bandcamp.py34
-rw-r--r--youtube_dl/extractor/bbc.py937
-rw-r--r--youtube_dl/extractor/bbccouk.py340
-rw-r--r--youtube_dl/extractor/beatportpro.py103
-rw-r--r--youtube_dl/extractor/beeg.py70
-rw-r--r--youtube_dl/extractor/bet.py17
-rw-r--r--youtube_dl/extractor/bild.py27
-rw-r--r--youtube_dl/extractor/bilibili.py98
-rw-r--r--youtube_dl/extractor/blinkx.py49
-rw-r--r--youtube_dl/extractor/bliptv.py116
-rw-r--r--youtube_dl/extractor/bloomberg.py48
-rw-r--r--youtube_dl/extractor/br.py39
-rw-r--r--youtube_dl/extractor/breakcom.py3
-rw-r--r--youtube_dl/extractor/brightcove.py224
-rw-r--r--youtube_dl/extractor/buzzfeed.py5
-rw-r--r--youtube_dl/extractor/byutv.py2
-rw-r--r--youtube_dl/extractor/canalc2.py43
-rw-r--r--youtube_dl/extractor/canalplus.py29
-rw-r--r--youtube_dl/extractor/cbs.py36
-rw-r--r--youtube_dl/extractor/cbsnews.py7
-rw-r--r--youtube_dl/extractor/cbssports.py30
-rw-r--r--youtube_dl/extractor/ccc.py4
-rw-r--r--youtube_dl/extractor/ceskatelevize.py183
-rw-r--r--youtube_dl/extractor/channel9.py66
-rw-r--r--youtube_dl/extractor/chaturbate.py50
-rw-r--r--youtube_dl/extractor/chilloutzone.py2
-rw-r--r--youtube_dl/extractor/chirbit.py84
-rw-r--r--youtube_dl/extractor/cinemassacre.py111
-rw-r--r--youtube_dl/extractor/clipfish.py67
-rw-r--r--youtube_dl/extractor/clipsyndicate.py14
-rw-r--r--youtube_dl/extractor/cloudy.py3
-rw-r--r--youtube_dl/extractor/clubic.py9
-rw-r--r--youtube_dl/extractor/clyp.py57
-rw-r--r--youtube_dl/extractor/cmt.py5
-rw-r--r--youtube_dl/extractor/cnet.py19
-rw-r--r--youtube_dl/extractor/cnn.py8
-rw-r--r--youtube_dl/extractor/collegerama.py4
-rw-r--r--youtube_dl/extractor/comcarcoff.py2
-rw-r--r--youtube_dl/extractor/comedycentral.py12
-rw-r--r--youtube_dl/extractor/common.py534
-rw-r--r--youtube_dl/extractor/condenast.py50
-rw-r--r--youtube_dl/extractor/cracked.py58
-rw-r--r--youtube_dl/extractor/criterion.py4
-rw-r--r--youtube_dl/extractor/crooksandliars.py60
-rw-r--r--youtube_dl/extractor/crunchyroll.py262
-rw-r--r--youtube_dl/extractor/cspan.py124
-rw-r--r--youtube_dl/extractor/ctsnews.py1
-rw-r--r--youtube_dl/extractor/dailymotion.py294
-rw-r--r--youtube_dl/extractor/dbtv.py14
-rw-r--r--youtube_dl/extractor/dcn.py82
-rw-r--r--youtube_dl/extractor/defense.py5
-rw-r--r--youtube_dl/extractor/democracynow.py88
-rw-r--r--youtube_dl/extractor/dfb.py27
-rw-r--r--youtube_dl/extractor/dhm.py59
-rw-r--r--youtube_dl/extractor/discovery.py52
-rw-r--r--youtube_dl/extractor/divxstage.py27
-rw-r--r--youtube_dl/extractor/dotsub.py3
-rw-r--r--youtube_dl/extractor/douyutv.py113
-rw-r--r--youtube_dl/extractor/dplay.py51
-rw-r--r--youtube_dl/extractor/dramafever.py216
-rw-r--r--youtube_dl/extractor/drbonanza.py12
-rw-r--r--youtube_dl/extractor/dreisat.py46
-rw-r--r--youtube_dl/extractor/drtuber.py24
-rw-r--r--youtube_dl/extractor/drtv.py69
-rw-r--r--youtube_dl/extractor/dump.py6
-rw-r--r--youtube_dl/extractor/dumpert.py69
-rw-r--r--youtube_dl/extractor/eagleplatform.py110
-rw-r--r--youtube_dl/extractor/ehow.py6
-rw-r--r--youtube_dl/extractor/eighttracks.py14
-rw-r--r--youtube_dl/extractor/eitb.py95
-rw-r--r--youtube_dl/extractor/ellentv.py68
-rw-r--r--youtube_dl/extractor/embedly.py16
-rw-r--r--youtube_dl/extractor/empflix.py25
-rw-r--r--youtube_dl/extractor/engadget.py2
-rw-r--r--youtube_dl/extractor/eporner.py7
-rw-r--r--youtube_dl/extractor/eroprofile.py58
-rw-r--r--youtube_dl/extractor/escapist.py131
-rw-r--r--youtube_dl/extractor/espn.py55
-rw-r--r--youtube_dl/extractor/esri.py74
-rw-r--r--youtube_dl/extractor/europa.py93
-rw-r--r--youtube_dl/extractor/everyonesmixtape.py8
-rw-r--r--youtube_dl/extractor/expotv.py31
-rw-r--r--youtube_dl/extractor/extremetube.py65
-rw-r--r--youtube_dl/extractor/facebook.py70
-rw-r--r--youtube_dl/extractor/faz.py21
-rw-r--r--youtube_dl/extractor/fc2.py20
-rw-r--r--youtube_dl/extractor/fczenit.py41
-rw-r--r--youtube_dl/extractor/firedrive.py80
-rw-r--r--youtube_dl/extractor/firsttv.py2
-rw-r--r--youtube_dl/extractor/fivemin.py85
-rw-r--r--youtube_dl/extractor/fivetv.py88
-rw-r--r--youtube_dl/extractor/fktv.py89
-rw-r--r--youtube_dl/extractor/flickr.py27
-rw-r--r--youtube_dl/extractor/folketinget.py4
-rw-r--r--youtube_dl/extractor/footyroom.py49
-rw-r--r--youtube_dl/extractor/fourtube.py36
-rw-r--r--youtube_dl/extractor/foxnews.py15
-rw-r--r--youtube_dl/extractor/foxsports.py32
-rw-r--r--youtube_dl/extractor/francetv.py170
-rw-r--r--youtube_dl/extractor/funnyordie.py24
-rw-r--r--youtube_dl/extractor/gamersyde.py70
-rw-r--r--youtube_dl/extractor/gamespot.py66
-rw-r--r--youtube_dl/extractor/gamestar.py4
-rw-r--r--youtube_dl/extractor/gazeta.py38
-rw-r--r--youtube_dl/extractor/gdcvault.py74
-rw-r--r--youtube_dl/extractor/generic.py1020
-rw-r--r--youtube_dl/extractor/gfycat.py110
-rw-r--r--youtube_dl/extractor/giga.py3
-rw-r--r--youtube_dl/extractor/globo.py170
-rw-r--r--youtube_dl/extractor/googleplus.py2
-rw-r--r--youtube_dl/extractor/grooveshark.py191
-rw-r--r--youtube_dl/extractor/hearthisat.py8
-rw-r--r--youtube_dl/extractor/hentaistigma.py11
-rw-r--r--youtube_dl/extractor/historicfilms.py3
-rw-r--r--youtube_dl/extractor/history.py31
-rw-r--r--youtube_dl/extractor/hitbox.py79
-rw-r--r--youtube_dl/extractor/hostingbulk.py84
-rw-r--r--youtube_dl/extractor/hotnewhiphop.py8
-rw-r--r--youtube_dl/extractor/howcast.py35
-rw-r--r--youtube_dl/extractor/howstuffworks.py6
-rw-r--r--youtube_dl/extractor/hypem.py10
-rw-r--r--youtube_dl/extractor/iconosquare.py71
-rw-r--r--youtube_dl/extractor/ign.py7
-rw-r--r--youtube_dl/extractor/imdb.py31
-rw-r--r--youtube_dl/extractor/imgur.py124
-rw-r--r--youtube_dl/extractor/ina.py2
-rw-r--r--youtube_dl/extractor/indavideo.py142
-rw-r--r--youtube_dl/extractor/infoq.py18
-rw-r--r--youtube_dl/extractor/instagram.py26
-rw-r--r--youtube_dl/extractor/iprima.py22
-rw-r--r--youtube_dl/extractor/iqiyi.py279
-rw-r--r--youtube_dl/extractor/ir90tv.py42
-rw-r--r--youtube_dl/extractor/ivi.py6
-rw-r--r--youtube_dl/extractor/izlesene.py18
-rw-r--r--youtube_dl/extractor/jeuxvideo.py45
-rw-r--r--youtube_dl/extractor/kaltura.py176
-rw-r--r--youtube_dl/extractor/kanalplay.py97
-rw-r--r--youtube_dl/extractor/karaoketv.py4
-rw-r--r--youtube_dl/extractor/karrierevideos.py96
-rw-r--r--youtube_dl/extractor/keek.py39
-rw-r--r--youtube_dl/extractor/keezmovies.py8
-rw-r--r--youtube_dl/extractor/kickstarter.py15
-rw-r--r--youtube_dl/extractor/kontrtube.py40
-rw-r--r--youtube_dl/extractor/krasview.py9
-rw-r--r--youtube_dl/extractor/kuwo.py318
-rw-r--r--youtube_dl/extractor/laola1tv.py45
-rw-r--r--youtube_dl/extractor/lecture2go.py62
-rw-r--r--youtube_dl/extractor/letv.py241
-rw-r--r--youtube_dl/extractor/libsyn.py69
-rw-r--r--youtube_dl/extractor/lifenews.py130
-rw-r--r--youtube_dl/extractor/limelight.py229
-rw-r--r--youtube_dl/extractor/liveleak.py16
-rw-r--r--youtube_dl/extractor/livestream.py68
-rw-r--r--youtube_dl/extractor/lrt.py1
-rw-r--r--youtube_dl/extractor/lynda.py297
-rw-r--r--youtube_dl/extractor/mailru.py2
-rw-r--r--youtube_dl/extractor/malemotion.py6
-rw-r--r--youtube_dl/extractor/mdr.py189
-rw-r--r--youtube_dl/extractor/metacafe.py9
-rw-r--r--youtube_dl/extractor/minhateca.py8
-rw-r--r--youtube_dl/extractor/miomio.py107
-rw-r--r--youtube_dl/extractor/mit.py21
-rw-r--r--youtube_dl/extractor/mitele.py101
-rw-r--r--youtube_dl/extractor/mixcloud.py65
-rw-r--r--youtube_dl/extractor/mlb.py47
-rw-r--r--youtube_dl/extractor/moevideo.py8
-rw-r--r--youtube_dl/extractor/mofosex.py8
-rw-r--r--youtube_dl/extractor/moniker.py77
-rw-r--r--youtube_dl/extractor/mooshare.py8
-rw-r--r--youtube_dl/extractor/movieclips.py80
-rw-r--r--youtube_dl/extractor/mpora.py6
-rw-r--r--youtube_dl/extractor/mtv.py132
-rw-r--r--youtube_dl/extractor/musicvault.py76
-rw-r--r--youtube_dl/extractor/mwave.py58
-rw-r--r--youtube_dl/extractor/myspass.py3
-rw-r--r--youtube_dl/extractor/myvi.py60
-rw-r--r--youtube_dl/extractor/myvideo.py15
-rw-r--r--youtube_dl/extractor/nationalgeographic.py54
-rw-r--r--youtube_dl/extractor/naver.py33
-rw-r--r--youtube_dl/extractor/nba.py20
-rw-r--r--youtube_dl/extractor/nbc.py141
-rw-r--r--youtube_dl/extractor/ndr.py419
-rw-r--r--youtube_dl/extractor/neteasemusic.py459
-rw-r--r--youtube_dl/extractor/netzkino.py5
-rw-r--r--youtube_dl/extractor/newstube.py2
-rw-r--r--youtube_dl/extractor/nextmedia.py57
-rw-r--r--youtube_dl/extractor/nfb.py11
-rw-r--r--youtube_dl/extractor/nfl.py160
-rw-r--r--youtube_dl/extractor/nhl.py63
-rw-r--r--youtube_dl/extractor/niconico.py168
-rw-r--r--youtube_dl/extractor/ninegag.py95
-rw-r--r--youtube_dl/extractor/noco.py146
-rw-r--r--youtube_dl/extractor/nosvideo.py6
-rw-r--r--youtube_dl/extractor/nova.py179
-rw-r--r--youtube_dl/extractor/novamov.py47
-rw-r--r--youtube_dl/extractor/nowness.py170
-rw-r--r--youtube_dl/extractor/nowtv.py257
-rw-r--r--youtube_dl/extractor/nowvideo.py4
-rw-r--r--youtube_dl/extractor/npo.py186
-rw-r--r--youtube_dl/extractor/nrk.py136
-rw-r--r--youtube_dl/extractor/nuvid.py6
-rw-r--r--youtube_dl/extractor/nytimes.py108
-rw-r--r--youtube_dl/extractor/odnoklassniki.py152
-rw-r--r--youtube_dl/extractor/onionstudios.py76
-rw-r--r--youtube_dl/extractor/ooyala.py183
-rw-r--r--youtube_dl/extractor/openfilm.py70
-rw-r--r--youtube_dl/extractor/orf.py94
-rw-r--r--youtube_dl/extractor/patreon.py45
-rw-r--r--youtube_dl/extractor/pbs.py205
-rw-r--r--youtube_dl/extractor/periscope.py82
-rw-r--r--youtube_dl/extractor/philharmoniedeparis.py78
-rw-r--r--youtube_dl/extractor/phoenix.py40
-rw-r--r--youtube_dl/extractor/photobucket.py4
-rw-r--r--youtube_dl/extractor/pinkbike.py96
-rw-r--r--youtube_dl/extractor/pladform.py90
-rw-r--r--youtube_dl/extractor/planetaplay.py3
-rw-r--r--youtube_dl/extractor/played.py12
-rw-r--r--youtube_dl/extractor/playfm.py87
-rw-r--r--youtube_dl/extractor/playtvak.py181
-rw-r--r--youtube_dl/extractor/playvid.py7
-rw-r--r--youtube_dl/extractor/playwire.py78
-rw-r--r--youtube_dl/extractor/pluralsight.py275
-rw-r--r--youtube_dl/extractor/porn91.py73
-rw-r--r--youtube_dl/extractor/pornhd.py3
-rw-r--r--youtube_dl/extractor/pornhub.py85
-rw-r--r--youtube_dl/extractor/pornotube.py10
-rw-r--r--youtube_dl/extractor/pornovoisines.py96
-rw-r--r--youtube_dl/extractor/primesharetv.py62
-rw-r--r--youtube_dl/extractor/promptfile.py13
-rw-r--r--youtube_dl/extractor/prosiebensat1.py47
-rw-r--r--youtube_dl/extractor/puls4.py88
-rw-r--r--youtube_dl/extractor/qqmusic.py344
-rw-r--r--youtube_dl/extractor/quickvid.py1
-rw-r--r--youtube_dl/extractor/r7.py88
-rw-r--r--youtube_dl/extractor/radiode.py15
-rw-r--r--youtube_dl/extractor/radiojavan.py67
-rw-r--r--youtube_dl/extractor/rai.py113
-rw-r--r--youtube_dl/extractor/rds.py73
-rw-r--r--youtube_dl/extractor/redtube.py9
-rw-r--r--youtube_dl/extractor/rtbf.py63
-rw-r--r--youtube_dl/extractor/rte.py12
-rw-r--r--youtube_dl/extractor/rtl2.py27
-rw-r--r--youtube_dl/extractor/rtlnl.py111
-rw-r--r--youtube_dl/extractor/rtlnow.py174
-rw-r--r--youtube_dl/extractor/rtp.py4
-rw-r--r--youtube_dl/extractor/rts.py32
-rw-r--r--youtube_dl/extractor/rtve.py89
-rw-r--r--youtube_dl/extractor/rtvnh.py47
-rw-r--r--youtube_dl/extractor/rutube.py35
-rw-r--r--youtube_dl/extractor/rutv.py31
-rw-r--r--youtube_dl/extractor/ruutu.py103
-rw-r--r--youtube_dl/extractor/safari.py156
-rw-r--r--youtube_dl/extractor/sandia.py115
-rw-r--r--youtube_dl/extractor/sbs.py44
-rw-r--r--youtube_dl/extractor/screenwavemedia.py172
-rw-r--r--youtube_dl/extractor/senateisvp.py145
-rw-r--r--youtube_dl/extractor/sexykarma.py1
-rw-r--r--youtube_dl/extractor/shahid.py107
-rw-r--r--youtube_dl/extractor/shared.py31
-rw-r--r--youtube_dl/extractor/sharesix.py8
-rw-r--r--youtube_dl/extractor/sina.py8
-rw-r--r--youtube_dl/extractor/slideshare.py2
-rw-r--r--youtube_dl/extractor/smotri.py70
-rw-r--r--youtube_dl/extractor/snagfilms.py181
-rw-r--r--youtube_dl/extractor/sockshare.py84
-rw-r--r--youtube_dl/extractor/sohu.py141
-rw-r--r--youtube_dl/extractor/soompi.py146
-rw-r--r--youtube_dl/extractor/soundcloud.py226
-rw-r--r--youtube_dl/extractor/soundgasm.py24
-rw-r--r--youtube_dl/extractor/southpark.py48
-rw-r--r--youtube_dl/extractor/space.py8
-rw-r--r--youtube_dl/extractor/spankbang.py60
-rw-r--r--youtube_dl/extractor/spankwire.py72
-rw-r--r--youtube_dl/extractor/spiegel.py5
-rw-r--r--youtube_dl/extractor/spiegeltv.py56
-rw-r--r--youtube_dl/extractor/spike.py2
-rw-r--r--youtube_dl/extractor/sportbox.py131
-rw-r--r--youtube_dl/extractor/sportdeutschland.py16
-rw-r--r--youtube_dl/extractor/srf.py104
-rw-r--r--youtube_dl/extractor/ssa.py58
-rw-r--r--youtube_dl/extractor/stitcher.py81
-rw-r--r--youtube_dl/extractor/streamcloud.py8
-rw-r--r--youtube_dl/extractor/streamcz.py21
-rw-r--r--youtube_dl/extractor/subtitles.py99
-rw-r--r--youtube_dl/extractor/sunporno.py4
-rw-r--r--youtube_dl/extractor/svt.py117
-rw-r--r--youtube_dl/extractor/svtplay.py56
-rw-r--r--youtube_dl/extractor/tagesschau.py75
-rw-r--r--youtube_dl/extractor/tapely.py12
-rw-r--r--youtube_dl/extractor/teamcoco.py149
-rw-r--r--youtube_dl/extractor/ted.py112
-rw-r--r--youtube_dl/extractor/telecinco.py87
-rw-r--r--youtube_dl/extractor/telegraaf.py35
-rw-r--r--youtube_dl/extractor/tenplay.py27
-rw-r--r--youtube_dl/extractor/testtube.py20
-rw-r--r--youtube_dl/extractor/tf1.py12
-rw-r--r--youtube_dl/extractor/theonion.py17
-rw-r--r--youtube_dl/extractor/theplatform.py353
-rw-r--r--youtube_dl/extractor/thesixtyone.py18
-rw-r--r--youtube_dl/extractor/thisamericanlife.py40
-rw-r--r--youtube_dl/extractor/tlc.py21
-rw-r--r--youtube_dl/extractor/tmz.py28
-rw-r--r--youtube_dl/extractor/tnaflix.py271
-rw-r--r--youtube_dl/extractor/tube8.py22
-rw-r--r--youtube_dl/extractor/tubitv.py80
-rw-r--r--youtube_dl/extractor/tudou.py45
-rw-r--r--youtube_dl/extractor/tumblr.py89
-rw-r--r--youtube_dl/extractor/turbo.py4
-rw-r--r--youtube_dl/extractor/tutv.py6
-rw-r--r--youtube_dl/extractor/tv2.py126
-rw-r--r--youtube_dl/extractor/tv4.py100
-rw-r--r--youtube_dl/extractor/tvc.py109
-rw-r--r--youtube_dl/extractor/tvigle.py39
-rw-r--r--youtube_dl/extractor/tvplay.py21
-rw-r--r--youtube_dl/extractor/tweakers.py50
-rw-r--r--youtube_dl/extractor/twentyfourvideo.py4
-rw-r--r--youtube_dl/extractor/twentytwotracks.py86
-rw-r--r--youtube_dl/extractor/twitch.py184
-rw-r--r--youtube_dl/extractor/twitter.py228
-rw-r--r--youtube_dl/extractor/udemy.py49
-rw-r--r--youtube_dl/extractor/udn.py76
-rw-r--r--youtube_dl/extractor/ultimedia.py105
-rw-r--r--youtube_dl/extractor/ustream.py85
-rw-r--r--youtube_dl/extractor/varzesh3.py45
-rw-r--r--youtube_dl/extractor/vbox7.py25
-rw-r--r--youtube_dl/extractor/veehd.py39
-rw-r--r--youtube_dl/extractor/veoh.py6
-rw-r--r--youtube_dl/extractor/vessel.py133
-rw-r--r--youtube_dl/extractor/vevo.py12
-rw-r--r--youtube_dl/extractor/vgtv.py114
-rw-r--r--youtube_dl/extractor/vice.py38
-rw-r--r--youtube_dl/extractor/viddler.py6
-rw-r--r--youtube_dl/extractor/videobam.py81
-rw-r--r--youtube_dl/extractor/videofyme.py40
-rw-r--r--youtube_dl/extractor/videolecturesnet.py70
-rw-r--r--youtube_dl/extractor/videomega.py63
-rw-r--r--youtube_dl/extractor/videott.py2
-rw-r--r--youtube_dl/extractor/vidme.py196
-rw-r--r--youtube_dl/extractor/vidzi.py10
-rw-r--r--youtube_dl/extractor/vier.py22
-rw-r--r--youtube_dl/extractor/viewster.py172
-rw-r--r--youtube_dl/extractor/viidea.py188
-rw-r--r--youtube_dl/extractor/viki.py370
-rw-r--r--youtube_dl/extractor/vimeo.py301
-rw-r--r--youtube_dl/extractor/vimple.py91
-rw-r--r--youtube_dl/extractor/vine.py110
-rw-r--r--youtube_dl/extractor/vk.py187
-rw-r--r--youtube_dl/extractor/vlive.py86
-rw-r--r--youtube_dl/extractor/vodlocker.py17
-rw-r--r--youtube_dl/extractor/voicerepublic.py97
-rw-r--r--youtube_dl/extractor/vporn.py30
-rw-r--r--youtube_dl/extractor/vube.py1
-rw-r--r--youtube_dl/extractor/vuclip.py2
-rw-r--r--youtube_dl/extractor/vulture.py2
-rw-r--r--youtube_dl/extractor/walla.py15
-rw-r--r--youtube_dl/extractor/washingtonpost.py8
-rw-r--r--youtube_dl/extractor/wat.py2
-rw-r--r--youtube_dl/extractor/wdr.py30
-rw-r--r--youtube_dl/extractor/webofstories.py63
-rw-r--r--youtube_dl/extractor/wimp.py26
-rw-r--r--youtube_dl/extractor/wistia.py8
-rw-r--r--youtube_dl/extractor/worldstarhiphop.py23
-rw-r--r--youtube_dl/extractor/wsj.py5
-rw-r--r--youtube_dl/extractor/xbef.py6
-rw-r--r--youtube_dl/extractor/xfileshare.py (renamed from youtube_dl/extractor/gorillavid.py)76
-rw-r--r--youtube_dl/extractor/xhamster.py73
-rw-r--r--youtube_dl/extractor/xminus.py4
-rw-r--r--youtube_dl/extractor/xnxx.py6
-rw-r--r--youtube_dl/extractor/xstream.py115
-rw-r--r--youtube_dl/extractor/xtube.py14
-rw-r--r--youtube_dl/extractor/xuite.py17
-rw-r--r--youtube_dl/extractor/xvideos.py33
-rw-r--r--youtube_dl/extractor/yahoo.py103
-rw-r--r--youtube_dl/extractor/yam.py123
-rw-r--r--youtube_dl/extractor/yandexmusic.py178
-rw-r--r--youtube_dl/extractor/yinyuetai.py56
-rw-r--r--youtube_dl/extractor/ynet.py4
-rw-r--r--youtube_dl/extractor/youku.py319
-rw-r--r--youtube_dl/extractor/youporn.py217
-rw-r--r--youtube_dl/extractor/yourupload.py25
-rw-r--r--youtube_dl/extractor/youtube.py1300
-rw-r--r--youtube_dl/extractor/zapiks.py110
-rw-r--r--youtube_dl/extractor/zdf.py50
-rw-r--r--youtube_dl/extractor/zingmp3.py21
403 files changed, 25077 insertions, 6832 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index a4fab540b..947b83683 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -4,10 +4,14 @@ from .abc import ABCIE
from .abc7news import Abc7NewsIE
from .academicearth import AcademicEarthCourseIE
from .addanime import AddAnimeIE
-from .adobetv import AdobeTVIE
+from .adobetv import (
+ AdobeTVIE,
+ AdobeTVVideoIE,
+)
from .adultswim import AdultSwimIE
from .aftenposten import AftenpostenIE
from .aftonbladet import AftonbladetIE
+from .airmozilla import AirMozillaIE
from .aljazeera import AlJazeeraIE
from .alphaporno import AlphaPornoIE
from .anitube import AnitubeIE
@@ -15,9 +19,14 @@ from .anysex import AnySexIE
from .aol import AolIE
from .allocine import AllocineIE
from .aparat import AparatIE
+from .appleconnect import AppleConnectIE
from .appletrailers import AppleTrailersIE
from .archiveorg import ArchiveOrgIE
-from .ard import ARDIE, ARDMediathekIE
+from .ard import (
+ ARDIE,
+ ARDMediathekIE,
+ SportschauIE,
+)
from .arte import (
ArteTvIE,
ArteTVPlus7IE,
@@ -31,11 +40,17 @@ from .atresplayer import AtresPlayerIE
from .atttechchannel import ATTTechChannelIE
from .audiomack import AudiomackIE, AudiomackAlbumIE
from .azubu import AzubuIE
+from .baidu import BaiduVideoIE
from .bambuser import BambuserIE, BambuserChannelIE
from .bandcamp import BandcampIE, BandcampAlbumIE
-from .bbccouk import BBCCoUkIE
+from .bbc import (
+ BBCCoUkIE,
+ BBCCoUkArticleIE,
+ BBCIE,
+)
from .beeg import BeegIE
from .behindkink import BehindKinkIE
+from .beatportpro import BeatportProIE
from .bet import BetIE
from .bild import BildIE
from .bilibili import BiliBiliIE
@@ -45,7 +60,10 @@ from .bloomberg import BloombergIE
from .bpb import BpbIE
from .br import BRIE
from .breakcom import BreakIE
-from .brightcove import BrightcoveIE
+from .brightcove import (
+ BrightcoveLegacyIE,
+ BrightcoveNewIE,
+)
from .buzzfeed import BuzzFeedIE
from .byutv import BYUtvIE
from .c56 import C56IE
@@ -58,16 +76,24 @@ from .canalplus import CanalplusIE
from .canalc2 import Canalc2IE
from .cbs import CBSIE
from .cbsnews import CBSNewsIE
+from .cbssports import CBSSportsIE
from .ccc import CCCIE
from .ceskatelevize import CeskaTelevizeIE
from .channel9 import Channel9IE
+from .chaturbate import ChaturbateIE
from .chilloutzone import ChilloutzoneIE
+from .chirbit import (
+ ChirbitIE,
+ ChirbitProfileIE,
+)
from .cinchcast import CinchcastIE
+from .cinemassacre import CinemassacreIE
from .clipfish import ClipfishIE
from .cliphunter import CliphunterIE
from .clipsyndicate import ClipsyndicateIE
from .cloudy import CloudyIE
from .clubic import ClubicIE
+from .clyp import ClypIE
from .cmt import CMTIE
from .cnet import CNETIE
from .cnn import (
@@ -83,6 +109,7 @@ from .commonmistakes import CommonMistakesIE, UnicodeBOMIE
from .condenast import CondeNastIE
from .cracked import CrackedIE
from .criterion import CriterionIE
+from .crooksandliars import CrooksAndLiarsIE
from .crunchyroll import (
CrunchyrollIE,
CrunchyrollShowPlaylistIE
@@ -93,23 +120,34 @@ from .dailymotion import (
DailymotionIE,
DailymotionPlaylistIE,
DailymotionUserIE,
+ DailymotionCloudIE,
)
from .daum import DaumIE
from .dbtv import DBTVIE
+from .dcn import DCNIE
from .dctp import DctpTvIE
from .deezer import DeezerPlaylistIE
+from .democracynow import DemocracynowIE
from .dfb import DFBIE
+from .dhm import DHMIE
from .dotsub import DotsubIE
+from .douyutv import DouyuTVIE
+from .dplay import DPlayIE
+from .dramafever import (
+ DramaFeverIE,
+ DramaFeverSeriesIE,
+)
from .dreisat import DreiSatIE
from .drbonanza import DRBonanzaIE
from .drtuber import DrTuberIE
from .drtv import DRTVIE
from .dvtv import DVTVIE
from .dump import DumpIE
+from .dumpert import DumpertIE
from .defense import DefenseGouvFrIE
from .discovery import DiscoveryIE
-from .divxstage import DivxStageIE
from .dropbox import DropboxIE
+from .eagleplatform import EaglePlatformIE
from .ebaumsworld import EbaumsWorldIE
from .echomsk import EchoMskIE
from .ehow import EHowIE
@@ -121,11 +159,14 @@ from .ellentv import (
EllenTVClipsIE,
)
from .elpais import ElPaisIE
-from .empflix import EMPFlixIE
+from .embedly import EmbedlyIE
from .engadget import EngadgetIE
from .eporner import EpornerIE
from .eroprofile import EroProfileIE
from .escapist import EscapistIE
+from .espn import ESPNIE
+from .esri import EsriVideoIE
+from .europa import EuropaIE
from .everyonesmixtape import EveryonesMixtapeIE
from .exfm import ExfmIE
from .expotv import ExpoTVIE
@@ -133,19 +174,19 @@ from .extremetube import ExtremeTubeIE
from .facebook import FacebookIE
from .faz import FazIE
from .fc2 import FC2IE
-from .firedrive import FiredriveIE
+from .fczenit import FczenitIE
from .firstpost import FirstpostIE
from .firsttv import FirstTVIE
from .fivemin import FiveMinIE
-from .fktv import (
- FKTVIE,
- FKTVPosteckeIE,
-)
+from .fivetv import FiveTVIE
+from .fktv import FKTVIE
from .flickr import FlickrIE
from .folketinget import FolketingetIE
+from .footyroom import FootyRoomIE
from .fourtube import FourTubeIE
from .foxgay import FoxgayIE
from .foxnews import FoxNewsIE
+from .foxsports import FoxSportsIE
from .franceculture import FranceCultureIE
from .franceinter import FranceInterIE
from .francetv import (
@@ -164,23 +205,27 @@ from .gameone import (
GameOneIE,
GameOnePlaylistIE,
)
+from .gamersyde import GamersydeIE
from .gamespot import GameSpotIE
from .gamestar import GameStarIE
from .gametrailers import GametrailersIE
+from .gazeta import GazetaIE
from .gdcvault import GDCVaultIE
from .generic import GenericIE
+from .gfycat import GfycatIE
from .giantbomb import GiantBombIE
from .giga import GigaIE
from .glide import GlideIE
-from .globo import GloboIE
+from .globo import (
+ GloboIE,
+ GloboArticleIE,
+)
from .godtube import GodTubeIE
from .goldenmoustache import GoldenMoustacheIE
from .golem import GolemIE
from .googleplus import GooglePlusIE
from .googlesearch import GoogleSearchIE
-from .gorillavid import GorillaVidIE
from .goshgay import GoshgayIE
-from .grooveshark import GroovesharkIE
from .groupon import GrouponIE
from .hark import HarkIE
from .hearthisat import HearThisAtIE
@@ -189,9 +234,9 @@ from .hellporno import HellPornoIE
from .helsinki import HelsinkiIE
from .hentaistigma import HentaiStigmaIE
from .historicfilms import HistoricFilmsIE
+from .history import HistoryIE
from .hitbox import HitboxIE, HitboxLiveIE
from .hornbunny import HornBunnyIE
-from .hostingbulk import HostingBulkIE
from .hotnewhiphop import HotNewHipHopIE
from .howcast import HowcastIE
from .howstuffworks import HowStuffWorksIE
@@ -203,11 +248,21 @@ from .imdb import (
ImdbIE,
ImdbListIE
)
+from .imgur import (
+ ImgurIE,
+ ImgurAlbumIE,
+)
from .ina import InaIE
+from .indavideo import (
+ IndavideoIE,
+ IndavideoEmbedIE,
+)
from .infoq import InfoQIE
from .instagram import InstagramIE, InstagramUserIE
from .internetvideoarchive import InternetVideoArchiveIE
from .iprima import IPrimaIE
+from .iqiyi import IqiyiIE
+from .ir90tv import Ir90TvIE
from .ivi import (
IviIE,
IviCompilationIE
@@ -218,8 +273,11 @@ from .jeuxvideo import JeuxVideoIE
from .jove import JoveIE
from .jukebox import JukeboxIE
from .jpopsukitv import JpopsukiIE
+from .kaltura import KalturaIE
+from .kanalplay import KanalPlayIE
from .kankan import KankanIE
from .karaoketv import KaraoketvIE
+from .karrierevideos import KarriereVideosIE
from .keezmovies import KeezMoviesIE
from .khanacademy import KhanAcademyIE
from .kickstarter import KickStarterIE
@@ -227,9 +285,32 @@ from .keek import KeekIE
from .kontrtube import KontrTubeIE
from .krasview import KrasViewIE
from .ku6 import Ku6IE
+from .kuwo import (
+ KuwoIE,
+ KuwoAlbumIE,
+ KuwoChartIE,
+ KuwoSingerIE,
+ KuwoCategoryIE,
+ KuwoMvIE,
+)
from .la7 import LA7IE
from .laola1tv import Laola1TvIE
-from .lifenews import LifeNewsIE
+from .lecture2go import Lecture2GoIE
+from .letv import (
+ LetvIE,
+ LetvTvIE,
+ LetvPlaylistIE
+)
+from .libsyn import LibsynIE
+from .lifenews import (
+ LifeNewsIE,
+ LifeEmbedIE,
+)
+from .limelight import (
+ LimelightMediaIE,
+ LimelightChannelIE,
+ LimelightChannelListIE,
+)
from .liveleak import LiveLeakIE
from .livestream import (
LivestreamIE,
@@ -252,6 +333,7 @@ from .metacritic import MetacriticIE
from .mgoon import MgoonIE
from .minhateca import MinhatecaIE
from .ministrygrid import MinistryGridIE
+from .miomio import MioMioIE
from .mit import TechTVMITIE, MITIE, OCWMITIE
from .mitele import MiTeleIE
from .mixcloud import MixcloudIE
@@ -272,33 +354,53 @@ from .mtv import (
MTVIE,
MTVServicesEmbeddedIE,
MTVIggyIE,
+ MTVDEIE,
)
from .muenchentv import MuenchenTVIE
from .musicplayon import MusicPlayOnIE
-from .musicvault import MusicVaultIE
from .muzu import MuzuTVIE
+from .mwave import MwaveIE
from .myspace import MySpaceIE, MySpaceAlbumIE
from .myspass import MySpassIE
+from .myvi import MyviIE
from .myvideo import MyVideoIE
from .myvidster import MyVidsterIE
+from .nationalgeographic import NationalGeographicIE
from .naver import NaverIE
from .nba import NBAIE
from .nbc import (
NBCIE,
NBCNewsIE,
+ NBCSportsIE,
+ NBCSportsVPlayerIE,
+ MSNBCIE,
+)
+from .ndr import (
+ NDRIE,
+ NJoyIE,
+ NDREmbedBaseIE,
+ NDREmbedIE,
+ NJoyEmbedIE,
)
-from .ndr import NDRIE
from .ndtv import NDTVIE
from .netzkino import NetzkinoIE
from .nerdcubed import NerdCubedFeedIE
from .nerdist import NerdistIE
+from .neteasemusic import (
+ NetEaseMusicIE,
+ NetEaseMusicAlbumIE,
+ NetEaseMusicSingerIE,
+ NetEaseMusicListIE,
+ NetEaseMusicMvIE,
+ NetEaseMusicProgramIE,
+ NetEaseMusicDjRadioIE,
+)
from .newgrounds import NewgroundsIE
from .newstube import NewstubeIE
from .nextmedia import (
NextMediaIE,
NextMediaActionNewsIE,
- AppleDailyRealtimeNewsIE,
- AppleDailyAnimationNewsIE
+ AppleDailyIE,
)
from .nfb import NFBIE
from .nfl import NFLIE
@@ -312,55 +414,101 @@ from .ninegag import NineGagIE
from .noco import NocoIE
from .normalboots import NormalbootsIE
from .nosvideo import NosVideoIE
+from .nova import NovaIE
from .novamov import NovaMovIE
-from .nowness import NownessIE
+from .nowness import (
+ NownessIE,
+ NownessPlaylistIE,
+ NownessSeriesIE,
+)
+from .nowtv import (
+ NowTVIE,
+ NowTVListIE,
+)
from .nowvideo import NowVideoIE
from .npo import (
NPOIE,
NPOLiveIE,
NPORadioIE,
NPORadioFragmentIE,
- TegenlichtVproIE,
+ VPROIE,
+ WNLIE
)
from .nrk import (
NRKIE,
+ NRKPlaylistIE,
NRKTVIE,
)
from .ntvde import NTVDeIE
from .ntvru import NTVRuIE
-from .nytimes import NYTimesIE
+from .nytimes import (
+ NYTimesIE,
+ NYTimesArticleIE,
+)
from .nuvid import NuvidIE
+from .odnoklassniki import OdnoklassnikiIE
from .oktoberfesttv import OktoberfestTVIE
-from .ooyala import OoyalaIE
-from .openfilm import OpenFilmIE
+from .onionstudios import OnionStudiosIE
+from .ooyala import (
+ OoyalaIE,
+ OoyalaExternalIE,
+)
from .orf import (
ORFTVthekIE,
ORFOE1IE,
ORFFM4IE,
+ ORFIPTVIE,
)
from .parliamentliveuk import ParliamentLiveUKIE
from .patreon import PatreonIE
from .pbs import PBSIE
+from .periscope import PeriscopeIE
+from .philharmoniedeparis import PhilharmonieDeParisIE
from .phoenix import PhoenixIE
from .photobucket import PhotobucketIE
+from .pinkbike import PinkbikeIE
from .planetaplay import PlanetaPlayIE
+from .pladform import PladformIE
from .played import PlayedIE
from .playfm import PlayFMIE
+from .playtvak import PlaytvakIE
from .playvid import PlayvidIE
+from .playwire import PlaywireIE
+from .pluralsight import (
+ PluralsightIE,
+ PluralsightCourseIE,
+)
from .podomatic import PodomaticIE
+from .porn91 import Porn91IE
from .pornhd import PornHdIE
-from .pornhub import PornHubIE
+from .pornhub import (
+ PornHubIE,
+ PornHubPlaylistIE,
+)
from .pornotube import PornotubeIE
+from .pornovoisines import PornoVoisinesIE
from .pornoxo import PornoXOIE
+from .primesharetv import PrimeShareTVIE
from .promptfile import PromptFileIE
from .prosiebensat1 import ProSiebenSat1IE
+from .puls4 import Puls4IE
from .pyvideo import PyvideoIE
+from .qqmusic import (
+ QQMusicIE,
+ QQMusicSingerIE,
+ QQMusicAlbumIE,
+ QQMusicToplistIE,
+ QQMusicPlaylistIE,
+)
from .quickvid import QuickVidIE
+from .r7 import R7IE
from .radiode import RadioDeIE
+from .radiojavan import RadioJavanIE
from .radiobremen import RadioBremenIE
from .radiofrance import RadioFranceIE
from .rai import RaiIE
from .rbmaradio import RBMARadioIE
+from .rds import RDSIE
from .redtube import RedTubeIE
from .restudy import RestudyIE
from .reverbnation import ReverbNationIE
@@ -370,12 +518,12 @@ from .rottentomatoes import RottenTomatoesIE
from .roxwel import RoxwelIE
from .rtbf import RTBFIE
from .rte import RteIE
-from .rtlnl import RtlXlIE
-from .rtlnow import RTLnowIE
+from .rtlnl import RtlNlIE
from .rtl2 import RTL2IE
from .rtp import RTPIE
from .rts import RTSIE
-from .rtve import RTVEALaCartaIE, RTVELiveIE
+from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE
+from .rtvnh import RTVNHIE
from .ruhd import RUHDIE
from .rutube import (
RutubeIE,
@@ -385,16 +533,24 @@ from .rutube import (
RutubePersonIE,
)
from .rutv import RUTVIE
+from .ruutu import RuutuIE
+from .sandia import SandiaIE
+from .safari import (
+ SafariIE,
+ SafariCourseIE,
+)
from .sapo import SapoIE
from .savefrom import SaveFromIE
from .sbs import SBSIE
from .scivee import SciVeeIE
from .screencast import ScreencastIE
from .screencastomatic import ScreencastOMaticIE
-from .screenwavemedia import CinemassacreIE, ScreenwaveMediaIE, TeamFourIE
+from .screenwavemedia import ScreenwaveMediaIE, TeamFourIE
+from .senateisvp import SenateISVPIE
from .servingsys import ServingSysIE
from .sexu import SexuIE
from .sexykarma import SexyKarmaIE
+from .shahid import ShahidIE
from .shared import SharedIE
from .sharesix import ShareSixIE
from .sina import SinaIE
@@ -406,36 +562,60 @@ from .smotri import (
SmotriUserIE,
SmotriBroadcastIE,
)
+from .snagfilms import (
+ SnagFilmsIE,
+ SnagFilmsEmbedIE,
+)
from .snotr import SnotrIE
-from .sockshare import SockshareIE
from .sohu import SohuIE
+from .soompi import (
+ SoompiIE,
+ SoompiShowIE,
+)
from .soundcloud import (
SoundcloudIE,
SoundcloudSetIE,
SoundcloudUserIE,
- SoundcloudPlaylistIE
+ SoundcloudPlaylistIE,
+ SoundcloudSearchIE
+)
+from .soundgasm import (
+ SoundgasmIE,
+ SoundgasmProfileIE
)
-from .soundgasm import SoundgasmIE
from .southpark import (
SouthParkIE,
- SouthparkDeIE,
+ SouthParkDeIE,
+ SouthParkDkIE,
+ SouthParkEsIE,
+ SouthParkNlIE
)
from .space import SpaceIE
+from .spankbang import SpankBangIE
from .spankwire import SpankwireIE
from .spiegel import SpiegelIE, SpiegelArticleIE
from .spiegeltv import SpiegeltvIE
from .spike import SpikeIE
+from .stitcher import StitcherIE
from .sport5 import Sport5IE
-from .sportbox import SportBoxIE
+from .sportbox import (
+ SportBoxIE,
+ SportBoxEmbedIE,
+)
from .sportdeutschland import SportDeutschlandIE
+from .srf import SrfIE
from .srmediathek import SRMediathekIE
+from .ssa import SSAIE
from .stanfordoc import StanfordOpenClassroomIE
from .steam import SteamIE
from .streamcloud import StreamcloudIE
from .streamcz import StreamCZIE
from .streetvoice import StreetVoiceIE
from .sunporno import SunPornoIE
-from .svtplay import SVTPlayIE
+from .svt import (
+ SVTIE,
+ SVTPlayIE,
+)
from .swrmediathek import SWRMediathekIE
from .syfy import SyfyIE
from .sztvhu import SztvHuIE
@@ -452,6 +632,7 @@ from .techtalks import TechTalksIE
from .ted import TEDIE
from .telebruxelles import TeleBruxellesIE
from .telecinco import TelecincoIE
+from .telegraaf import TelegraafIE
from .telemb import TeleMBIE
from .teletask import TeleTaskIE
from .tenplay import TenPlayIE
@@ -459,13 +640,24 @@ from .testurl import TestURLIE
from .testtube import TestTubeIE
from .tf1 import TF1IE
from .theonion import TheOnionIE
-from .theplatform import ThePlatformIE
+from .theplatform import (
+ ThePlatformIE,
+ ThePlatformFeedIE,
+)
from .thesixtyone import TheSixtyOneIE
+from .thisamericanlife import ThisAmericanLifeIE
from .thisav import ThisAVIE
from .tinypic import TinyPicIE
from .tlc import TlcIE, TlcDeIE
-from .tmz import TMZIE
-from .tnaflix import TNAFlixIE
+from .tmz import (
+ TMZIE,
+ TMZArticleIE,
+)
+from .tnaflix import (
+ TNAFlixIE,
+ EMPFlixIE,
+ MovieFapIE,
+)
from .thvideo import (
THVideoIE,
THVideoPlaylistIE
@@ -476,16 +668,30 @@ from .traileraddict import TrailerAddictIE
from .trilulilu import TriluliluIE
from .trutube import TruTubeIE
from .tube8 import Tube8IE
+from .tubitv import TubiTvIE
from .tudou import TudouIE
from .tumblr import TumblrIE
from .tunein import TuneInIE
from .turbo import TurboIE
from .tutv import TutvIE
+from .tv2 import (
+ TV2IE,
+ TV2ArticleIE,
+)
+from .tv4 import TV4IE
+from .tvc import (
+ TVCIE,
+ TVCArticleIE,
+)
from .tvigle import TvigleIE
from .tvp import TvpIE, TvpSeriesIE
from .tvplay import TVPlayIE
from .tweakers import TweakersIE
from .twentyfourvideo import TwentyFourVideoIE
+from .twentytwotracks import (
+ TwentyTwoTracksIE,
+ TwentyTwoTracksGenreIE
+)
from .twitch import (
TwitchVideoIE,
TwitchChapterIE,
@@ -495,26 +701,33 @@ from .twitch import (
TwitchBookmarksIE,
TwitchStreamIE,
)
+from .twitter import TwitterCardIE, TwitterIE
from .ubu import UbuIE
from .udemy import (
UdemyIE,
UdemyCourseIE
)
+from .udn import UDNEmbedIE
+from .ultimedia import UltimediaIE
from .unistra import UnistraIE
from .urort import UrortIE
from .ustream import UstreamIE, UstreamChannelIE
+from .varzesh3 import Varzesh3IE
from .vbox7 import Vbox7IE
from .veehd import VeeHDIE
from .veoh import VeohIE
+from .vessel import VesselIE
from .vesti import VestiIE
from .vevo import VevoIE
-from .vgtv import VGTVIE
+from .vgtv import (
+ BTArticleIE,
+ BTVestlendingenIE,
+ VGTVIE,
+)
from .vh1 import VH1IE
from .vice import ViceIE
from .viddler import ViddlerIE
-from .videobam import VideoBamIE
from .videodetective import VideoDetectiveIE
-from .videolecturesnet import VideoLecturesNetIE
from .videofyme import VideofyMeIE
from .videomega import VideoMegaIE
from .videopremium import VideoPremiumIE
@@ -523,6 +736,8 @@ from .videoweed import VideoWeedIE
from .vidme import VidmeIE
from .vidzi import VidziIE
from .vier import VierIE, VierVideosIE
+from .viewster import ViewsterIE
+from .viidea import ViideaIE
from .vimeo import (
VimeoIE,
VimeoAlbumIE,
@@ -538,12 +753,17 @@ from .vine import (
VineIE,
VineUserIE,
)
-from .viki import VikiIE
+from .viki import (
+ VikiIE,
+ VikiChannelIE,
+)
from .vk import (
VKIE,
VKUserVideosIE,
)
+from .vlive import VLiveIE
from .vodlocker import VodlockerIE
+from .voicerepublic import VoiceRepublicIE
from .vporn import VpornIE
from .vrt import VRTIE
from .vube import VubeIE
@@ -558,7 +778,10 @@ from .wdr import (
WDRMobileIE,
WDRMausIE,
)
-from .webofstories import WebOfStoriesIE
+from .webofstories import (
+ WebOfStoriesIE,
+ WebOfStoriesPlaylistIE,
+)
from .weibo import WeiboIE
from .wimp import WimpIE
from .wistia import WistiaIE
@@ -567,18 +790,30 @@ from .wrzuta import WrzutaIE
from .wsj import WSJIE
from .xbef import XBefIE
from .xboxclips import XboxClipsIE
-from .xhamster import XHamsterIE
+from .xfileshare import XFileShareIE
+from .xhamster import (
+ XHamsterIE,
+ XHamsterEmbedIE,
+)
from .xminus import XMinusIE
from .xnxx import XNXXIE
-from .xvideos import XVideosIE
+from .xstream import XstreamIE
from .xtube import XTubeUserIE, XTubeIE
from .xuite import XuiteIE
+from .xvideos import XVideosIE
from .xxxymovies import XXXYMoviesIE
from .yahoo import (
YahooIE,
YahooSearchIE,
)
+from .yam import YamIE
+from .yandexmusic import (
+ YandexMusicTrackIE,
+ YandexMusicAlbumIE,
+ YandexMusicPlaylistIE,
+)
from .yesjapan import YesJapanIE
+from .yinyuetai import YinYueTaiIE
from .ynet import YnetIE
from .youjizz import YouJizzIE
from .youku import YoukuIE
@@ -599,8 +834,10 @@ from .youtube import (
YoutubeTruncatedIDIE,
YoutubeTruncatedURLIE,
YoutubeUserIE,
+ YoutubeUserPlaylistsIE,
YoutubeWatchLaterIE,
)
+from .zapiks import ZapiksIE
from .zdf import ZDFIE, ZDFChannelIE
from .zingmp3 import (
ZingMp3SongIE,
diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py
index dc0fb85d6..c0e5d1abf 100644
--- a/youtube_dl/extractor/abc.py
+++ b/youtube_dl/extractor/abc.py
@@ -1,16 +1,20 @@
from __future__ import unicode_literals
import re
-import json
from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ js_to_json,
+ int_or_none,
+)
class ABCIE(InfoExtractor):
IE_NAME = 'abc.net.au'
- _VALID_URL = r'http://www\.abc\.net\.au/news/[^/]+/[^/]+/(?P<id>\d+)'
+ _VALID_URL = r'http://www\.abc\.net\.au/news/(?:[^/]+/){1,2}(?P<id>\d+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.abc.net.au/news/2014-11-05/australia-to-staff-ebola-treatment-centre-in-sierra-leone/5868334',
'md5': 'cb3dd03b18455a661071ee1e28344d9f',
'info_dict': {
@@ -19,23 +23,62 @@ class ABCIE(InfoExtractor):
'title': 'Australia to help staff Ebola treatment centre in Sierra Leone',
'description': 'md5:809ad29c67a05f54eb41f2a105693a67',
},
- }
+ }, {
+ 'url': 'http://www.abc.net.au/news/2015-08-17/warren-entsch-introduces-same-sex-marriage-bill/6702326',
+ 'md5': 'db2a5369238b51f9811ad815b69dc086',
+ 'info_dict': {
+ 'id': 'NvqvPeNZsHU',
+ 'ext': 'mp4',
+ 'upload_date': '20150816',
+ 'uploader': 'ABC News (Australia)',
+ 'description': 'Government backbencher Warren Entsch introduces a cross-party sponsored bill to legalise same-sex marriage, saying the bill is designed to promote "an inclusive Australia, not a divided one.". Read more here: http://ab.co/1Mwc6ef',
+ 'uploader_id': 'NewsOnABC',
+ 'title': 'Marriage Equality: Warren Entsch introduces same sex marriage bill',
+ },
+ 'add_ie': ['Youtube'],
+ }, {
+ 'url': 'http://www.abc.net.au/news/2015-10-23/nab-lifts-interest-rates-following-westpac-and-cba/6880080',
+ 'md5': 'b96eee7c9edf4fc5a358a0252881cc1f',
+ 'info_dict': {
+ 'id': '6880080',
+ 'ext': 'mp3',
+ 'title': 'NAB lifts interest rates, following Westpac and CBA',
+ 'description': 'md5:f13d8edc81e462fce4a0437c7dc04728',
+ },
+ }, {
+ 'url': 'http://www.abc.net.au/news/2015-10-19/6866214',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- urls_info_json = self._search_regex(
- r'inlineVideoData\.push\((.*?)\);', webpage, 'video urls',
- flags=re.DOTALL)
- urls_info = json.loads(urls_info_json.replace('\'', '"'))
+ mobj = re.search(
+ r'inline(?P<type>Video|Audio|YouTube)Data\.push\((?P<json_data>[^)]+)\);',
+ webpage)
+ if mobj is None:
+ raise ExtractorError('Unable to extract video urls')
+
+ urls_info = self._parse_json(
+ mobj.group('json_data'), video_id, transform_source=js_to_json)
+
+ if not isinstance(urls_info, list):
+ urls_info = [urls_info]
+
+ if mobj.group('type') == 'YouTube':
+ return self.playlist_result([
+ self.url_result(url_info['url']) for url_info in urls_info])
+
formats = [{
'url': url_info['url'],
- 'width': int(url_info['width']),
- 'height': int(url_info['height']),
- 'tbr': int(url_info['bitrate']),
- 'filesize': int(url_info['filesize']),
+ 'vcodec': url_info.get('codec') if mobj.group('type') == 'Video' else 'none',
+ 'width': int_or_none(url_info.get('width')),
+ 'height': int_or_none(url_info.get('height')),
+ 'tbr': int_or_none(url_info.get('bitrate')),
+ 'filesize': int_or_none(url_info.get('filesize')),
} for url_info in urls_info]
+
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/academicearth.py b/youtube_dl/extractor/academicearth.py
index 47313fba8..34095501c 100644
--- a/youtube_dl/extractor/academicearth.py
+++ b/youtube_dl/extractor/academicearth.py
@@ -15,7 +15,7 @@ class AcademicEarthCourseIE(InfoExtractor):
'title': 'Laws of Nature',
'description': 'Introduce yourself to the laws of nature with these free online college lectures from Yale, Harvard, and MIT.',
},
- 'playlist_count': 4,
+ 'playlist_count': 3,
}
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py
index 203936e54..e3e6d2113 100644
--- a/youtube_dl/extractor/addanime.py
+++ b/youtube_dl/extractor/addanime.py
@@ -11,12 +11,13 @@ from ..compat import (
)
from ..utils import (
ExtractorError,
+ qualities,
)
class AddAnimeIE(InfoExtractor):
- _VALID_URL = r'^http://(?:\w+\.)?add-anime\.net/watch_video\.php\?(?:.*?)v=(?P<id>[\w_]+)(?:.*)'
- _TEST = {
+ _VALID_URL = r'http://(?:\w+\.)?add-anime\.net/(?:watch_video\.php\?(?:.*?)v=|video/)(?P<id>[\w_]+)'
+ _TESTS = [{
'url': 'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9',
'md5': '72954ea10bc979ab5e2eb288b21425a0',
'info_dict': {
@@ -25,7 +26,10 @@ class AddAnimeIE(InfoExtractor):
'description': 'One Piece 606',
'title': 'One Piece 606',
}
- }
+ }, {
+ 'url': 'http://add-anime.net/video/MDUGWYKNGBD8/One-Piece-687',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -63,8 +67,10 @@ class AddAnimeIE(InfoExtractor):
note='Confirming after redirect')
webpage = self._download_webpage(url, video_id)
+ FORMATS = ('normal', 'hq')
+ quality = qualities(FORMATS)
formats = []
- for format_id in ('normal', 'hq'):
+ for format_id in FORMATS:
rex = r"var %s_video_file = '(.*?)';" % re.escape(format_id)
video_url = self._search_regex(rex, webpage, 'video file URLx',
fatal=False)
@@ -73,6 +79,7 @@ class AddAnimeIE(InfoExtractor):
formats.append({
'format_id': format_id,
'url': video_url,
+ 'quality': quality(format_id),
})
self._sort_formats(formats)
video_title = self._og_search_title(webpage)
diff --git a/youtube_dl/extractor/adobetv.py b/youtube_dl/extractor/adobetv.py
index 28e07f8b0..5e43adc51 100644
--- a/youtube_dl/extractor/adobetv.py
+++ b/youtube_dl/extractor/adobetv.py
@@ -5,6 +5,8 @@ from ..utils import (
parse_duration,
unified_strdate,
str_to_int,
+ float_or_none,
+ ISO639Utils,
)
@@ -28,7 +30,6 @@ class AdobeTVIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
-
webpage = self._download_webpage(url, video_id)
player = self._parse_json(
@@ -44,8 +45,10 @@ class AdobeTVIE(InfoExtractor):
self._html_search_meta('datepublished', webpage, 'upload date'))
duration = parse_duration(
- self._html_search_meta('duration', webpage, 'duration')
- or self._search_regex(r'Runtime:\s*(\d{2}:\d{2}:\d{2})', webpage, 'duration'))
+ self._html_search_meta('duration', webpage, 'duration') or
+ self._search_regex(
+ r'Runtime:\s*(\d{2}:\d{2}:\d{2})',
+ webpage, 'duration', fatal=False))
view_count = str_to_int(self._search_regex(
r'<div class="views">\s*Views?:\s*([\d,.]+)\s*</div>',
@@ -68,3 +71,61 @@ class AdobeTVIE(InfoExtractor):
'view_count': view_count,
'formats': formats,
}
+
+
+class AdobeTVVideoIE(InfoExtractor):
+ _VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P<id>\d+)'
+
+ _TEST = {
+ # From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners
+ 'url': 'https://video.tv.adobe.com/v/2456/',
+ 'md5': '43662b577c018ad707a63766462b1e87',
+ 'info_dict': {
+ 'id': '2456',
+ 'ext': 'mp4',
+ 'title': 'New experience with Acrobat DC',
+ 'description': 'New experience with Acrobat DC',
+ 'duration': 248.667,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ player_params = self._parse_json(self._search_regex(
+ r'var\s+bridge\s*=\s*([^;]+);', webpage, 'player parameters'),
+ video_id)
+
+ formats = [{
+ 'url': source['src'],
+ 'width': source.get('width'),
+ 'height': source.get('height'),
+ 'tbr': source.get('bitrate'),
+ } for source in player_params['sources']]
+
+ # For both metadata and downloaded files the duration varies among
+ # formats. I just pick the max one
+ duration = max(filter(None, [
+ float_or_none(source.get('duration'), scale=1000)
+ for source in player_params['sources']]))
+
+ subtitles = {}
+ for translation in player_params.get('translations', []):
+ lang_id = translation.get('language_w3c') or ISO639Utils.long2short(translation['language_medium'])
+ if lang_id not in subtitles:
+ subtitles[lang_id] = []
+ subtitles[lang_id].append({
+ 'url': translation['vttPath'],
+ 'ext': 'vtt',
+ })
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'title': player_params['title'],
+ 'description': self._og_search_description(webpage),
+ 'duration': duration,
+ 'subtitles': subtitles,
+ }
diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py
index 502a9c25a..3ae618e71 100644
--- a/youtube_dl/extractor/adultswim.py
+++ b/youtube_dl/extractor/adultswim.py
@@ -2,13 +2,13 @@
from __future__ import unicode_literals
import re
-import json
from .common import InfoExtractor
from ..utils import (
+ determine_ext,
ExtractorError,
- xpath_text,
float_or_none,
+ xpath_text,
)
@@ -38,9 +38,11 @@ class AdultSwimIE(InfoExtractor):
},
],
'info_dict': {
+ 'id': 'rQxZvXQ4ROaSOqq-or2Mow',
'title': 'Rick and Morty - Pilot',
'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. "
- }
+ },
+ 'skip': 'This video is only available for registered users',
}, {
'url': 'http://www.adultswim.com/videos/playlists/american-parenting/putting-francine-out-of-business/',
'playlist': [
@@ -55,9 +57,28 @@ class AdultSwimIE(InfoExtractor):
}
],
'info_dict': {
+ 'id': '-t8CamQlQ2aYZ49ItZCFog',
'title': 'American Dad - Putting Francine Out of Business',
'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].'
},
+ }, {
+ 'url': 'http://www.adultswim.com/videos/tim-and-eric-awesome-show-great-job/dr-steve-brule-for-your-wine/',
+ 'playlist': [
+ {
+ 'md5': '3e346a2ab0087d687a05e1e7f3b3e529',
+ 'info_dict': {
+ 'id': 'sY3cMUR_TbuE4YmdjzbIcQ-0',
+ 'ext': 'flv',
+ 'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine',
+ 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n',
+ },
+ }
+ ],
+ 'info_dict': {
+ 'id': 'sY3cMUR_TbuE4YmdjzbIcQ',
+ 'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine',
+ 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n',
+ },
}]
@staticmethod
@@ -78,6 +99,7 @@ class AdultSwimIE(InfoExtractor):
for video in collection.get('videos'):
if video.get('slug') == slug:
return collection, video
+ return None, None
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -88,30 +110,39 @@ class AdultSwimIE(InfoExtractor):
webpage = self._download_webpage(url, episode_path)
# Extract the value of `bootstrappedData` from the Javascript in the page.
- bootstrappedDataJS = self._search_regex(r'var bootstrappedData = ({.*});', webpage, episode_path)
-
- try:
- bootstrappedData = json.loads(bootstrappedDataJS)
- except ValueError as ve:
- errmsg = '%s: Failed to parse JSON ' % episode_path
- raise ExtractorError(errmsg, cause=ve)
+ bootstrapped_data = self._parse_json(self._search_regex(
+ r'var bootstrappedData = ({.*});', webpage, 'bootstraped data'), episode_path)
# Downloading videos from a /videos/playlist/ URL needs to be handled differently.
# NOTE: We are only downloading one video (the current one) not the playlist
if is_playlist:
- collections = bootstrappedData['playlists']['collections']
+ collections = bootstrapped_data['playlists']['collections']
collection = self.find_collection_by_linkURL(collections, show_path)
video_info = self.find_video_info(collection, episode_path)
show_title = video_info['showTitle']
segment_ids = [video_info['videoPlaybackID']]
else:
- collections = bootstrappedData['show']['collections']
+ collections = bootstrapped_data['show']['collections']
collection, video_info = self.find_collection_containing_video(collections, episode_path)
-
- show = bootstrappedData['show']
+ # Video wasn't found in the collections, let's try `slugged_video`.
+ if video_info is None:
+ if bootstrapped_data.get('slugged_video', {}).get('slug') == episode_path:
+ video_info = bootstrapped_data['slugged_video']
+ else:
+ raise ExtractorError('Unable to find video info')
+
+ show = bootstrapped_data['show']
show_title = show['title']
- segment_ids = [clip['videoPlaybackID'] for clip in video_info['clips']]
+ stream = video_info.get('stream')
+ clips = [stream] if stream else video_info.get('clips')
+ if not clips:
+ raise ExtractorError(
+ 'This video is only available via cable service provider subscription that'
+ ' is not currently supported. You may want to use --cookies.'
+ if video_info.get('auth') is True else 'Unable to find stream or clips',
+ expected=True)
+ segment_ids = [clip['videoPlaybackID'] for clip in clips]
episode_id = video_info['id']
episode_title = video_info['title']
@@ -120,7 +151,7 @@ class AdultSwimIE(InfoExtractor):
entries = []
for part_num, segment_id in enumerate(segment_ids):
- segment_url = 'http://www.adultswim.com/videos/api/v0/assets?id=%s&platform=mobile' % segment_id
+ segment_url = 'http://www.adultswim.com/videos/api/v0/assets?id=%s&platform=desktop' % segment_id
segment_title = '%s - %s' % (show_title, episode_title)
if len(segment_ids) > 1:
@@ -134,19 +165,32 @@ class AdultSwimIE(InfoExtractor):
xpath_text(idoc, './/trt', 'segment duration').strip())
formats = []
- file_els = idoc.findall('.//files/file')
+ file_els = idoc.findall('.//files/file') or idoc.findall('./files/file')
+ unique_urls = []
+ unique_file_els = []
for file_el in file_els:
+ media_url = file_el.text
+ if not media_url or determine_ext(media_url) == 'f4m':
+ continue
+ if file_el.text not in unique_urls:
+ unique_urls.append(file_el.text)
+ unique_file_els.append(file_el)
+
+ for file_el in unique_file_els:
bitrate = file_el.attrib.get('bitrate')
ftype = file_el.attrib.get('type')
-
- formats.append({
- 'format_id': '%s_%s' % (bitrate, ftype),
- 'url': file_el.text.strip(),
- # The bitrate may not be a number (for example: 'iphone')
- 'tbr': int(bitrate) if bitrate.isdigit() else None,
- 'quality': 1 if ftype == 'hd' else -1
- })
+ media_url = file_el.text
+ if determine_ext(media_url) == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ media_url, segment_title, 'mp4', preference=0, m3u8_id='hls'))
+ else:
+ formats.append({
+ 'format_id': '%s_%s' % (bitrate, ftype),
+ 'url': file_el.text.strip(),
+ # The bitrate may not be a number (for example: 'iphone')
+ 'tbr': int(bitrate) if bitrate.isdigit() else None,
+ })
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/aftenposten.py b/youtube_dl/extractor/aftenposten.py
index 2b257ede7..0c00acfb5 100644
--- a/youtube_dl/extractor/aftenposten.py
+++ b/youtube_dl/extractor/aftenposten.py
@@ -1,23 +1,13 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import (
- int_or_none,
- parse_iso8601,
- xpath_with_ns,
- xpath_text,
- find_xpath_attr,
-)
class AftenpostenIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?aftenposten\.no/webtv/([^/]+/)*(?P<id>[^/]+)-\d+\.html'
-
+ _VALID_URL = r'https?://(?:www\.)?aftenposten\.no/webtv/(?:#!/)?video/(?P<id>\d+)'
_TEST = {
- 'url': 'http://www.aftenposten.no/webtv/serier-og-programmer/sweatshopenglish/TRAILER-SWEATSHOP---I-cant-take-any-more-7800835.html?paging=&section=webtv_serierogprogrammer_sweatshop_sweatshopenglish',
+ 'url': 'http://www.aftenposten.no/webtv/#!/video/21039/trailer-sweatshop-i-can-t-take-any-more',
'md5': 'fd828cd29774a729bf4d4425fe192972',
'info_dict': {
'id': '21039',
@@ -30,74 +20,4 @@ class AftenpostenIE(InfoExtractor):
}
def _real_extract(self, url):
- display_id = self._match_id(url)
-
- webpage = self._download_webpage(url, display_id)
-
- video_id = self._html_search_regex(
- r'data-xs-id="(\d+)"', webpage, 'video id')
-
- data = self._download_xml(
- 'http://frontend.xstream.dk/ap/feed/video/?platform=web&id=%s' % video_id, video_id)
-
- NS_MAP = {
- 'atom': 'http://www.w3.org/2005/Atom',
- 'xt': 'http://xstream.dk/',
- 'media': 'http://search.yahoo.com/mrss/',
- }
-
- entry = data.find(xpath_with_ns('./atom:entry', NS_MAP))
-
- title = xpath_text(
- entry, xpath_with_ns('./atom:title', NS_MAP), 'title')
- description = xpath_text(
- entry, xpath_with_ns('./atom:summary', NS_MAP), 'description')
- timestamp = parse_iso8601(xpath_text(
- entry, xpath_with_ns('./atom:published', NS_MAP), 'upload date'))
-
- formats = []
- media_group = entry.find(xpath_with_ns('./media:group', NS_MAP))
- for media_content in media_group.findall(xpath_with_ns('./media:content', NS_MAP)):
- media_url = media_content.get('url')
- if not media_url:
- continue
- tbr = int_or_none(media_content.get('bitrate'))
- mobj = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', media_url)
- if mobj:
- formats.append({
- 'url': mobj.group('url'),
- 'play_path': 'mp4:%s' % mobj.group('playpath'),
- 'app': mobj.group('app'),
- 'ext': 'flv',
- 'tbr': tbr,
- 'format_id': 'rtmp-%d' % tbr,
- })
- else:
- formats.append({
- 'url': media_url,
- 'tbr': tbr,
- })
- self._sort_formats(formats)
-
- link = find_xpath_attr(
- entry, xpath_with_ns('./atom:link', NS_MAP), 'rel', 'original')
- if link is not None:
- formats.append({
- 'url': link.get('href'),
- 'format_id': link.get('rel'),
- })
-
- thumbnails = [{
- 'url': splash.get('url'),
- 'width': int_or_none(splash.get('width')),
- 'height': int_or_none(splash.get('height')),
- } for splash in media_group.findall(xpath_with_ns('./xt:splash', NS_MAP))]
-
- return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'timestamp': timestamp,
- 'formats': formats,
- 'thumbnails': thumbnails,
- }
+ return self.url_result('xstream:ap:%s' % self._match_id(url), 'Xstream')
diff --git a/youtube_dl/extractor/aftonbladet.py b/youtube_dl/extractor/aftonbladet.py
index 8442019ea..e0518cf26 100644
--- a/youtube_dl/extractor/aftonbladet.py
+++ b/youtube_dl/extractor/aftonbladet.py
@@ -2,14 +2,15 @@
from __future__ import unicode_literals
from .common import InfoExtractor
+from ..utils import int_or_none
class AftonbladetIE(InfoExtractor):
- _VALID_URL = r'^http://tv\.aftonbladet\.se/webbtv.+?(?P<video_id>article[0-9]+)\.ab(?:$|[?#])'
+ _VALID_URL = r'http://tv\.aftonbladet\.se/abtv/articles/(?P<id>[0-9]+)'
_TEST = {
- 'url': 'http://tv.aftonbladet.se/webbtv/nyheter/vetenskap/rymden/article36015.ab',
+ 'url': 'http://tv.aftonbladet.se/abtv/articles/36015',
'info_dict': {
- 'id': 'article36015',
+ 'id': '36015',
'ext': 'mp4',
'title': 'Vulkanutbrott i rymden - nu släpper NASA bilderna',
'description': 'Jupiters måne mest aktiv av alla himlakroppar',
@@ -24,8 +25,9 @@ class AftonbladetIE(InfoExtractor):
# find internal video meta data
meta_url = 'http://aftonbladet-play.drlib.aptoma.no/video/%s.json'
- internal_meta_id = self._html_search_regex(
- r'data-aptomaId="([\w\d]+)"', webpage, 'internal_meta_id')
+ player_config = self._parse_json(self._html_search_regex(
+ r'data-player-config="([^"]+)"', webpage, 'player config'), video_id)
+ internal_meta_id = player_config['videoId']
internal_meta_url = meta_url % internal_meta_id
internal_meta_json = self._download_json(
internal_meta_url, video_id, 'Downloading video meta data')
@@ -43,9 +45,9 @@ class AftonbladetIE(InfoExtractor):
formats.append({
'url': 'http://%s:%d/%s/%s' % (p['address'], p['port'], p['path'], p['filename']),
'ext': 'mp4',
- 'width': fmt['width'],
- 'height': fmt['height'],
- 'tbr': fmt['bitrate'],
+ 'width': int_or_none(fmt.get('width')),
+ 'height': int_or_none(fmt.get('height')),
+ 'tbr': int_or_none(fmt.get('bitrate')),
'protocol': 'http',
})
self._sort_formats(formats)
@@ -54,9 +56,9 @@ class AftonbladetIE(InfoExtractor):
'id': video_id,
'title': internal_meta_json['title'],
'formats': formats,
- 'thumbnail': internal_meta_json['imageUrl'],
- 'description': internal_meta_json['shortPreamble'],
- 'timestamp': internal_meta_json['timePublished'],
- 'duration': internal_meta_json['duration'],
- 'view_count': internal_meta_json['views'],
+ 'thumbnail': internal_meta_json.get('imageUrl'),
+ 'description': internal_meta_json.get('shortPreamble'),
+ 'timestamp': int_or_none(internal_meta_json.get('timePublished')),
+ 'duration': int_or_none(internal_meta_json.get('duration')),
+ 'view_count': int_or_none(internal_meta_json.get('views')),
}
diff --git a/youtube_dl/extractor/airmozilla.py b/youtube_dl/extractor/airmozilla.py
new file mode 100644
index 000000000..f8e70f4e5
--- /dev/null
+++ b/youtube_dl/extractor/airmozilla.py
@@ -0,0 +1,74 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ parse_duration,
+ parse_iso8601,
+)
+
+
+class AirMozillaIE(InfoExtractor):
+ _VALID_URL = r'https?://air\.mozilla\.org/(?P<id>[0-9a-z-]+)/?'
+ _TEST = {
+ 'url': 'https://air.mozilla.org/privacy-lab-a-meetup-for-privacy-minded-people-in-san-francisco/',
+ 'md5': '2e3e7486ba5d180e829d453875b9b8bf',
+ 'info_dict': {
+ 'id': '6x4q2w',
+ 'ext': 'mp4',
+ 'title': 'Privacy Lab - a meetup for privacy minded people in San Francisco',
+ 'thumbnail': 're:https?://vid\.ly/(?P<id>[0-9a-z-]+)/poster',
+ 'description': 'Brings together privacy professionals and others interested in privacy at for-profits, non-profits, and NGOs in an effort to contribute to the state of the ecosystem...',
+ 'timestamp': 1422487800,
+ 'upload_date': '20150128',
+ 'location': 'SFO Commons',
+ 'duration': 3780,
+ 'view_count': int,
+ 'categories': ['Main', 'Privacy'],
+ }
+ }
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ video_id = self._html_search_regex(r'//vid.ly/(.*?)/embed', webpage, 'id')
+
+ embed_script = self._download_webpage('https://vid.ly/{0}/embed'.format(video_id), video_id)
+ jwconfig = self._search_regex(r'\svar jwconfig = (\{.*?\});\s', embed_script, 'metadata')
+ metadata = self._parse_json(jwconfig, video_id)
+
+ formats = [{
+ 'url': source['file'],
+ 'ext': source['type'],
+ 'format_id': self._search_regex(r'&format=(.*)$', source['file'], 'video format'),
+ 'format': source['label'],
+ 'height': int(source['label'].rstrip('p')),
+ } for source in metadata['playlist'][0]['sources']]
+ self._sort_formats(formats)
+
+ view_count = int_or_none(self._html_search_regex(
+ r'Views since archived: ([0-9]+)',
+ webpage, 'view count', fatal=False))
+ timestamp = parse_iso8601(self._html_search_regex(
+ r'<time datetime="(.*?)"', webpage, 'timestamp', fatal=False))
+ duration = parse_duration(self._search_regex(
+ r'Duration:\s*(\d+\s*hours?\s*\d+\s*minutes?)',
+ webpage, 'duration', fatal=False))
+
+ return {
+ 'id': video_id,
+ 'title': self._og_search_title(webpage),
+ 'formats': formats,
+ 'url': self._og_search_url(webpage),
+ 'display_id': display_id,
+ 'thumbnail': metadata['playlist'][0].get('image'),
+ 'description': self._og_search_description(webpage),
+ 'timestamp': timestamp,
+ 'location': self._html_search_regex(r'Location: (.*)', webpage, 'location', default=None),
+ 'duration': duration,
+ 'view_count': view_count,
+ 'categories': re.findall(r'<a href=".*?" class="channel">(.*?)</a>', webpage),
+ }
diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py
index 612708e25..5b2c0dc9a 100644
--- a/youtube_dl/extractor/aljazeera.py
+++ b/youtube_dl/extractor/aljazeera.py
@@ -15,7 +15,8 @@ class AlJazeeraIE(InfoExtractor):
'description': 'As a birth attendant advocating for family planning, Remy is on the frontline of Tondo\'s battle with overcrowding.',
'uploader': 'Al Jazeera English',
},
- 'add_ie': ['Brightcove'],
+ 'add_ie': ['BrightcoveLegacy'],
+ 'skip': 'Not accessible from Travis CI server',
}
def _real_extract(self, url):
@@ -31,5 +32,5 @@ class AlJazeeraIE(InfoExtractor):
'playerKey=AQ~~%2CAAAAmtVJIFk~%2CTVGOQ5ZTwJbeMWnq5d_H4MOM57xfzApc'
'&%40videoPlayer={0}'.format(brightcove_id)
),
- 'ie_key': 'Brightcove',
+ 'ie_key': 'BrightcoveLegacy',
}
diff --git a/youtube_dl/extractor/anitube.py b/youtube_dl/extractor/anitube.py
index 31f0d417c..23f942ae2 100644
--- a/youtube_dl/extractor/anitube.py
+++ b/youtube_dl/extractor/anitube.py
@@ -26,8 +26,8 @@ class AnitubeIE(InfoExtractor):
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
- key = self._html_search_regex(
- r'http://www\.anitube\.se/embed/([A-Za-z0-9_-]*)', webpage, 'key')
+ key = self._search_regex(
+ r'src=["\']https?://[^/]+/embed/([A-Za-z0-9_-]+)', webpage, 'key')
config_xml = self._download_xml(
'http://www.anitube.se/nuevo/econfig.php?key=%s' % key, key)
diff --git a/youtube_dl/extractor/appleconnect.py b/youtube_dl/extractor/appleconnect.py
new file mode 100644
index 000000000..ea7a70393
--- /dev/null
+++ b/youtube_dl/extractor/appleconnect.py
@@ -0,0 +1,50 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ str_to_int,
+ ExtractorError
+)
+
+
+class AppleConnectIE(InfoExtractor):
+ _VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/idsa\.(?P<id>[\w-]+)'
+ _TEST = {
+ 'url': 'https://itunes.apple.com/us/post/idsa.4ab17a39-2720-11e5-96c5-a5b38f6c42d3',
+ 'md5': '10d0f2799111df4cb1c924520ca78f98',
+ 'info_dict': {
+ 'id': '4ab17a39-2720-11e5-96c5-a5b38f6c42d3',
+ 'ext': 'm4v',
+ 'title': 'Energy',
+ 'uploader': 'Drake',
+ 'thumbnail': 'http://is5.mzstatic.com/image/thumb/Video5/v4/78/61/c5/7861c5fa-ad6d-294b-1464-cf7605b911d6/source/1920x1080sr.jpg',
+ 'upload_date': '20150710',
+ 'timestamp': 1436545535,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ try:
+ video_json = self._html_search_regex(
+ r'class="auc-video-data">(\{.*?\})', webpage, 'json')
+ except ExtractorError:
+ raise ExtractorError('This post doesn\'t contain a video', expected=True)
+
+ video_data = self._parse_json(video_json, video_id)
+ timestamp = str_to_int(self._html_search_regex(r'data-timestamp="(\d+)"', webpage, 'timestamp'))
+ like_count = str_to_int(self._html_search_regex(r'(\d+) Loves', webpage, 'like count'))
+
+ return {
+ 'id': video_id,
+ 'url': video_data['sslSrc'],
+ 'title': video_data['title'],
+ 'description': video_data['description'],
+ 'uploader': video_data['artistName'],
+ 'thumbnail': video_data['artworkUrl'],
+ 'timestamp': timestamp,
+ 'like_count': like_count,
+ }
diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py
index 287f71e07..f68dc3236 100644
--- a/youtube_dl/extractor/appletrailers.py
+++ b/youtube_dl/extractor/appletrailers.py
@@ -11,56 +11,62 @@ from ..utils import (
class AppleTrailersIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/trailers/(?P<company>[^/]+)/(?P<movie>[^/]+)'
- _TEST = {
- "url": "http://trailers.apple.com/trailers/wb/manofsteel/",
- "playlist": [
+ _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/(?:trailers|ca)/(?P<company>[^/]+)/(?P<movie>[^/]+)'
+ _TESTS = [{
+ 'url': 'http://trailers.apple.com/trailers/wb/manofsteel/',
+ 'info_dict': {
+ 'id': 'manofsteel',
+ },
+ 'playlist': [
{
- "md5": "d97a8e575432dbcb81b7c3acb741f8a8",
- "info_dict": {
- "id": "manofsteel-trailer4",
- "ext": "mov",
- "duration": 111,
- "title": "Trailer 4",
- "upload_date": "20130523",
- "uploader_id": "wb",
+ 'md5': 'd97a8e575432dbcb81b7c3acb741f8a8',
+ 'info_dict': {
+ 'id': 'manofsteel-trailer4',
+ 'ext': 'mov',
+ 'duration': 111,
+ 'title': 'Trailer 4',
+ 'upload_date': '20130523',
+ 'uploader_id': 'wb',
},
},
{
- "md5": "b8017b7131b721fb4e8d6f49e1df908c",
- "info_dict": {
- "id": "manofsteel-trailer3",
- "ext": "mov",
- "duration": 182,
- "title": "Trailer 3",
- "upload_date": "20130417",
- "uploader_id": "wb",
+ 'md5': 'b8017b7131b721fb4e8d6f49e1df908c',
+ 'info_dict': {
+ 'id': 'manofsteel-trailer3',
+ 'ext': 'mov',
+ 'duration': 182,
+ 'title': 'Trailer 3',
+ 'upload_date': '20130417',
+ 'uploader_id': 'wb',
},
},
{
- "md5": "d0f1e1150989b9924679b441f3404d48",
- "info_dict": {
- "id": "manofsteel-trailer",
- "ext": "mov",
- "duration": 148,
- "title": "Trailer",
- "upload_date": "20121212",
- "uploader_id": "wb",
+ 'md5': 'd0f1e1150989b9924679b441f3404d48',
+ 'info_dict': {
+ 'id': 'manofsteel-trailer',
+ 'ext': 'mov',
+ 'duration': 148,
+ 'title': 'Trailer',
+ 'upload_date': '20121212',
+ 'uploader_id': 'wb',
},
},
{
- "md5": "5fe08795b943eb2e757fa95cb6def1cb",
- "info_dict": {
- "id": "manofsteel-teaser",
- "ext": "mov",
- "duration": 93,
- "title": "Teaser",
- "upload_date": "20120721",
- "uploader_id": "wb",
+ 'md5': '5fe08795b943eb2e757fa95cb6def1cb',
+ 'info_dict': {
+ 'id': 'manofsteel-teaser',
+ 'ext': 'mov',
+ 'duration': 93,
+ 'title': 'Teaser',
+ 'upload_date': '20120721',
+ 'uploader_id': 'wb',
},
},
]
- }
+ }, {
+ 'url': 'http://trailers.apple.com/ca/metropole/autrui/',
+ 'only_matching': True,
+ }]
_JSON_RE = r'iTunes.playURL\((.*?)\);'
diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py
index 9fc35a42b..8feb7cb74 100644
--- a/youtube_dl/extractor/archiveorg.py
+++ b/youtube_dl/extractor/archiveorg.py
@@ -33,7 +33,7 @@ class ArchiveOrgIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- json_url = url + ('?' if '?' in url else '&') + 'output=json'
+ json_url = url + ('&' if '?' in url else '?') + 'output=json'
data = self._download_json(json_url, video_id)
def get_optional(data_dict, field):
diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py
index 783b53e23..73be6d204 100644
--- a/youtube_dl/extractor/ard.py
+++ b/youtube_dl/extractor/ard.py
@@ -8,13 +8,14 @@ from .generic import GenericIE
from ..utils import (
determine_ext,
ExtractorError,
+ get_element_by_attribute,
qualities,
int_or_none,
parse_duration,
unified_strdate,
xpath_text,
- parse_xml,
)
+from ..compat import compat_etree_fromstring
class ARDMediathekIE(InfoExtractor):
@@ -22,19 +23,125 @@ class ARDMediathekIE(InfoExtractor):
_VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
_TESTS = [{
- 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
- 'only_matching': True,
+ 'url': 'http://www.ardmediathek.de/tv/Dokumentation-und-Reportage/Ich-liebe-das-Leben-trotzdem/rbb-Fernsehen/Video?documentId=29582122&bcastId=3822114',
+ 'info_dict': {
+ 'id': '29582122',
+ 'ext': 'mp4',
+ 'title': 'Ich liebe das Leben trotzdem',
+ 'description': 'md5:45e4c225c72b27993314b31a84a5261c',
+ 'duration': 4557,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
}, {
- 'url': 'http://www.ardmediathek.de/tv/Tatort/Das-Wunder-von-Wolbeck-Video-tgl-ab-20/Das-Erste/Video?documentId=22490580&bcastId=602916',
+ 'url': 'http://www.ardmediathek.de/tv/Tatort/Tatort-Scheinwelten-H%C3%B6rfassung-Video/Das-Erste/Video?documentId=29522730&bcastId=602916',
+ 'md5': 'f4d98b10759ac06c0072bbcd1f0b9e3e',
'info_dict': {
- 'id': '22490580',
+ 'id': '29522730',
'ext': 'mp4',
- 'title': 'Das Wunder von Wolbeck (Video tgl. ab 20 Uhr)',
- 'description': 'Auf einem restaurierten Hof bei Wolbeck wird der Heilpraktiker Raffael Lembeck eines morgens von seiner Frau Stella tot aufgefunden. Das Opfer war offensichtlich in seiner Praxis zu Fall gekommen und ist dann verblutet, erklärt Prof. Boerne am Tatort.',
+ 'title': 'Tatort: Scheinwelten - Hörfassung (Video tgl. ab 20 Uhr)',
+ 'description': 'md5:196392e79876d0ac94c94e8cdb2875f1',
+ 'duration': 5252,
+ },
+ }, {
+ # audio
+ 'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086',
+ 'md5': '219d94d8980b4f538c7fcb0865eb7f2c',
+ 'info_dict': {
+ 'id': '28488308',
+ 'ext': 'mp3',
+ 'title': 'Tod eines Fußballers',
+ 'description': 'md5:f6e39f3461f0e1f54bfa48c8875c86ef',
+ 'duration': 3240,
},
- 'skip': 'Blocked outside of Germany',
+ }, {
+ 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
+ 'only_matching': True,
}]
+ def _extract_media_info(self, media_info_url, webpage, video_id):
+ media_info = self._download_json(
+ media_info_url, video_id, 'Downloading media JSON')
+
+ formats = self._extract_formats(media_info, video_id)
+
+ if not formats:
+ if '"fsk"' in webpage:
+ raise ExtractorError(
+ 'This video is only available after 20:00', expected=True)
+ elif media_info.get('_geoblocked'):
+ raise ExtractorError('This video is not available due to geo restriction', expected=True)
+
+ self._sort_formats(formats)
+
+ duration = int_or_none(media_info.get('_duration'))
+ thumbnail = media_info.get('_previewImage')
+
+ subtitles = {}
+ subtitle_url = media_info.get('_subtitleUrl')
+ if subtitle_url:
+ subtitles['de'] = [{
+ 'ext': 'srt',
+ 'url': subtitle_url,
+ }]
+
+ return {
+ 'id': video_id,
+ 'duration': duration,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+
+ def _extract_formats(self, media_info, video_id):
+ """Build the list of format dicts from media_info['_mediaArray'].
+
+ Every '_stream' URL of every '_mediaStreamArray' entry is turned into
+ f4m (HDS), m3u8 (HLS), RTMP or plain HTTP formats; anything else is skipped.
+ """
+ type_ = media_info.get('_type')
+ media_array = media_info.get('_mediaArray', [])
+ formats = []
+ for num, media in enumerate(media_array):
+ for stream in media.get('_mediaStreamArray', []):
+ stream_urls = stream.get('_stream')
+ if not stream_urls:
+ continue
+ # '_stream' may hold a single URL or a list of URLs; normalise to a list
+ if not isinstance(stream_urls, list):
+ stream_urls = [stream_urls]
+ quality = stream.get('_quality')
+ server = stream.get('_server')
+ for stream_url in stream_urls:
+ ext = determine_ext(stream_url)
+ if ext == 'f4m':
+ # HDS manifest: expanded into its formats, with the hdcore/plugin query appended
+ formats.extend(self._extract_f4m_formats(
+ stream_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124',
+ video_id, preference=-1, f4m_id='hds'))
+ elif ext == 'm3u8':
+ # HLS playlist: expanded into its variant formats
+ formats.extend(self._extract_m3u8_formats(
+ stream_url, video_id, 'mp4', preference=1, m3u8_id='hls'))
+ else:
+ if server and server.startswith('rtmp'):
+ # RTMP: the server is the URL and the stream value is the play path
+ f = {
+ 'url': server,
+ 'play_path': stream_url,
+ 'format_id': 'a%s-rtmp-%s' % (num, quality),
+ }
+ elif stream_url.startswith('http'):
+ f = {
+ 'url': stream_url,
+ 'format_id': 'a%s-%s-%s' % (num, ext, quality)
+ }
+ else:
+ # Neither RTMP nor HTTP: nothing usable, skip this URL
+ continue
+ # Width and height are read from a trailing "_<width>x<height>.mp4" in the URL
+ m = re.search(r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$', stream_url)
+ if m:
+ f.update({
+ 'width': int(m.group('width')),
+ 'height': int(m.group('height')),
+ })
+ # Audio-only media: flag the format as having no video codec
+ if type_ == 'audio':
+ f['vcodec'] = 'none'
+ formats.append(f)
+ return formats
+
def _real_extract(self, url):
# determine video id from url
m = re.match(self._VALID_URL, url)
@@ -50,8 +157,11 @@ class ARDMediathekIE(InfoExtractor):
if '>Der gewünschte Beitrag ist nicht mehr verfügbar.<' in webpage:
raise ExtractorError('Video %s is no longer available' % video_id, expected=True)
+ if 'Diese Sendung ist für Jugendliche unter 12 Jahren nicht geeignet. Der Clip ist deshalb nur von 20 bis 6 Uhr verfügbar.' in webpage:
+ raise ExtractorError('This program is only suitable for those aged 12 and older. Video %s is therefore only available between 20 pm and 6 am.' % video_id, expected=True)
+
if re.search(r'[\?&]rss($|[=&])', url):
- doc = parse_xml(webpage)
+ doc = compat_etree_fromstring(webpage.encode('utf-8'))
if doc.tag == 'rss':
return GenericIE()._extract_rss(url, video_id, doc)
@@ -89,46 +199,22 @@ class ARDMediathekIE(InfoExtractor):
'format_id': fid,
'url': furl,
})
+ self._sort_formats(formats)
+ info = {
+ 'formats': formats,
+ }
else: # request JSON file
- media_info = self._download_json(
- 'http://www.ardmediathek.de/play/media/%s' % video_id, video_id)
- # The second element of the _mediaArray contains the standard http urls
- streams = media_info['_mediaArray'][1]['_mediaStreamArray']
- if not streams:
- if '"fsk"' in webpage:
- raise ExtractorError('This video is only available after 20:00')
+ info = self._extract_media_info(
+ 'http://www.ardmediathek.de/play/media/%s' % video_id, webpage, video_id)
- formats = []
- for s in streams:
- if type(s['_stream']) == list:
- for index, url in enumerate(s['_stream'][::-1]):
- quality = s['_quality'] + index
- formats.append({
- 'quality': quality,
- 'url': url,
- 'format_id': '%s-%s' % (determine_ext(url), quality)
- })
- continue
-
- format = {
- 'quality': s['_quality'],
- 'url': s['_stream'],
- }
-
- format['format_id'] = '%s-%s' % (
- determine_ext(format['url']), format['quality'])
-
- formats.append(format)
-
- self._sort_formats(formats)
-
- return {
+ info.update({
'id': video_id,
'title': title,
'description': description,
- 'formats': formats,
'thumbnail': thumbnail,
- }
+ })
+
+ return info
class ARDIE(InfoExtractor):
@@ -186,3 +272,41 @@ class ARDIE(InfoExtractor):
'upload_date': upload_date,
'thumbnail': thumbnail,
}
+
+
+class SportschauIE(ARDMediathekIE):
+ IE_NAME = 'Sportschau'
+ _VALID_URL = r'(?P<baseurl>https?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video(?P<id>[^/#?]+))\.html'
+ _TESTS = [{
+ 'url': 'http://www.sportschau.de/tourdefrance/videoseppeltkokainhatnichtsmitklassischemdopingzutun100.html',
+ 'info_dict': {
+ 'id': 'seppeltkokainhatnichtsmitklassischemdopingzutun100',
+ 'ext': 'mp4',
+ 'title': 'Seppelt: "Kokain hat nichts mit klassischem Doping zu tun"',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'description': 'Der ARD-Doping Experte Hajo Seppelt gibt seine Einschätzung zum ersten Dopingfall der diesjährigen Tour de France um den Italiener Luca Paolini ab.',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ base_url = mobj.group('baseurl')
+
+ webpage = self._download_webpage(url, video_id)
+ title = get_element_by_attribute('class', 'headline', webpage)
+ description = self._html_search_meta('description', webpage, 'description')
+
+ info = self._extract_media_info(
+ base_url + '-mc_defaultQuality-h.json', webpage, video_id)
+
+ info.update({
+ 'title': title,
+ 'description': description,
+ })
+
+ return info
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index 929dd3cc5..2a00da3ee 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -4,10 +4,13 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import (
+ compat_parse_qs,
+ compat_urllib_parse_urlparse,
+)
from ..utils import (
find_xpath_attr,
unified_strdate,
- get_element_by_id,
get_element_by_attribute,
int_or_none,
qualities,
@@ -78,7 +81,13 @@ class ArteTVPlus7IE(InfoExtractor):
def _extract_from_webpage(self, webpage, video_id, lang):
json_url = self._html_search_regex(
[r'arte_vp_url=["\'](.*?)["\']', r'data-url=["\']([^"]+)["\']'],
- webpage, 'json vp url')
+ webpage, 'json vp url', default=None)
+ if not json_url:
+ iframe_url = self._html_search_regex(
+ r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1',
+ webpage, 'iframe url', group='url')
+ json_url = compat_parse_qs(
+ compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0]
return self._extract_from_json_url(json_url, video_id, lang)
def _extract_from_json_url(self, json_url, video_id, lang):
@@ -146,6 +155,7 @@ class ArteTVPlus7IE(InfoExtractor):
formats.append(format)
+ self._check_formats(formats, video_id)
self._sort_formats(formats)
info_dict['formats'] = formats
@@ -194,7 +204,9 @@ class ArteTVFutureIE(ArteTVPlus7IE):
def _real_extract(self, url):
anchor_id, lang = self._extract_url_info(url)
webpage = self._download_webpage(url, anchor_id)
- row = get_element_by_id(anchor_id, webpage)
+ row = self._search_regex(
+ r'(?s)id="%s"[^>]*>.+?(<div[^>]*arte_vp_url[^>]*>)' % anchor_id,
+ webpage, 'row')
return self._extract_from_webpage(row, anchor_id, lang)
diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py
index f016368fa..50e47ba0a 100644
--- a/youtube_dl/extractor/atresplayer.py
+++ b/youtube_dl/extractor/atresplayer.py
@@ -3,22 +3,23 @@ from __future__ import unicode_literals
import time
import hmac
-from .subtitles import SubtitlesInfoExtractor
+from .common import InfoExtractor
from ..compat import (
compat_str,
compat_urllib_parse,
- compat_urllib_request,
)
from ..utils import (
int_or_none,
float_or_none,
+ sanitized_Request,
xpath_text,
ExtractorError,
)
-class AtresPlayerIE(SubtitlesInfoExtractor):
+class AtresPlayerIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?atresplayer\.com/television/[^/]+/[^/]+/[^/]+/(?P<id>.+?)_\d+\.html'
+ _NETRC_MACHINE = 'atresplayer'
_TESTS = [
{
'url': 'http://www.atresplayer.com/television/programas/el-club-de-la-comedia/temporada-4/capitulo-10-especial-solidario-nochebuena_2014122100174.html',
@@ -62,7 +63,7 @@ class AtresPlayerIE(SubtitlesInfoExtractor):
'j_password': password,
}
- request = compat_urllib_request.Request(
+ request = sanitized_Request(
self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
response = self._download_webpage(
@@ -93,7 +94,7 @@ class AtresPlayerIE(SubtitlesInfoExtractor):
formats = []
for fmt in ['windows', 'android_tablet']:
- request = compat_urllib_request.Request(
+ request = sanitized_Request(
self._URL_VIDEO_TEMPLATE.format(fmt, episode_id, timestamp_shifted, token))
request.add_header('User-Agent', self._USER_AGENT)
@@ -144,13 +145,12 @@ class AtresPlayerIE(SubtitlesInfoExtractor):
thumbnail = xpath_text(episode, './media/asset/files/background', 'thumbnail')
subtitles = {}
- subtitle = xpath_text(episode, './media/asset/files/subtitle', 'subtitle')
- if subtitle:
- subtitles['es'] = subtitle
-
- if self._downloader.params.get('listsubtitles', False):
- self._list_available_subtitles(video_id, subtitles)
- return
+ subtitle_url = xpath_text(episode, './media/asset/files/subtitle', 'subtitle')
+ if subtitle_url:
+ subtitles['es'] = [{
+ 'ext': 'srt',
+ 'url': subtitle_url,
+ }]
return {
'id': video_id,
@@ -159,5 +159,5 @@ class AtresPlayerIE(SubtitlesInfoExtractor):
'thumbnail': thumbnail,
'duration': duration,
'formats': formats,
- 'subtitles': self.extract_subtitles(video_id, subtitles),
+ 'subtitles': subtitles,
}
diff --git a/youtube_dl/extractor/baidu.py b/youtube_dl/extractor/baidu.py
new file mode 100644
index 000000000..e37ee4440
--- /dev/null
+++ b/youtube_dl/extractor/baidu.py
@@ -0,0 +1,69 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+
+
+class BaiduVideoIE(InfoExtractor):
+ IE_DESC = '百度视频'
+ _VALID_URL = r'http://v\.baidu\.com/(?P<type>[a-z]+)/(?P<id>\d+)\.htm'
+ _TESTS = [{
+ 'url': 'http://v.baidu.com/comic/1069.htm?frp=bdbrand&q=%E4%B8%AD%E5%8D%8E%E5%B0%8F%E5%BD%93%E5%AE%B6',
+ 'info_dict': {
+ 'id': '1069',
+ 'title': '中华小当家 TV版 (全52集)',
+ 'description': 'md5:395a419e41215e531c857bb037bbaf80',
+ },
+ 'playlist_count': 52,
+ }, {
+ 'url': 'http://v.baidu.com/show/11595.htm?frp=bdbrand',
+ 'info_dict': {
+ 'id': '11595',
+ 'title': 're:^奔跑吧兄弟',
+ 'description': 'md5:1bf88bad6d850930f542d51547c089b8',
+ },
+ 'playlist_mincount': 3,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ playlist_id = mobj.group('id')
+ category = category2 = mobj.group('type')
+ if category == 'show':
+ category2 = 'tvshow'
+
+ webpage = self._download_webpage(url, playlist_id)
+
+ playlist_title = self._html_search_regex(
+ r'title\s*:\s*(["\'])(?P<title>[^\']+)\1', webpage,
+ 'playlist title', group='title')
+ playlist_description = self._html_search_regex(
+ r'<input[^>]+class="j-data-intro"[^>]+value="([^"]+)"/>', webpage,
+ playlist_id, 'playlist description')
+
+ site = self._html_search_regex(
+ r'filterSite\s*:\s*["\']([^"]*)["\']', webpage,
+ 'primary provider site')
+ api_result = self._download_json(
+ 'http://v.baidu.com/%s_intro/?dtype=%sPlayUrl&id=%s&site=%s' % (
+ category, category2, playlist_id, site),
+ playlist_id, 'Get playlist links')
+
+ entries = []
+ for episode in api_result[0]['episodes']:
+ episode_id = '%s_%s' % (playlist_id, episode['episode'])
+
+ redirect_page = self._download_webpage(
+ compat_urlparse.urljoin(url, episode['url']), episode_id,
+ note='Download Baidu redirect page')
+ real_url = self._html_search_regex(
+ r'location\.replace\("([^"]+)"\)', redirect_page, 'real URL')
+
+ entries.append(self.url_result(
+ real_url, video_title=episode['single_title']))
+
+ return self.playlist_result(
+ entries, playlist_id, playlist_title, playlist_description)
diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py
index c193e66ca..da986e063 100644
--- a/youtube_dl/extractor/bambuser.py
+++ b/youtube_dl/extractor/bambuser.py
@@ -1,12 +1,18 @@
from __future__ import unicode_literals
import re
-import json
import itertools
from .common import InfoExtractor
from ..compat import (
- compat_urllib_request,
+ compat_urllib_parse,
+ compat_str,
+)
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ float_or_none,
+ sanitized_Request,
)
@@ -14,6 +20,8 @@ class BambuserIE(InfoExtractor):
IE_NAME = 'bambuser'
_VALID_URL = r'https?://bambuser\.com/v/(?P<id>\d+)'
_API_KEY = '005f64509e19a868399060af746a00aa'
+ _LOGIN_URL = 'https://bambuser.com/user'
+ _NETRC_MACHINE = 'bambuser'
_TEST = {
'url': 'http://bambuser.com/v/4050584',
@@ -26,6 +34,9 @@ class BambuserIE(InfoExtractor):
'duration': 3741,
'uploader': 'pixelversity',
'uploader_id': '344706',
+ 'timestamp': 1382976692,
+ 'upload_date': '20131028',
+ 'view_count': int,
},
'params': {
# It doesn't respect the 'Range' header, it would download the whole video
@@ -34,23 +45,60 @@ class BambuserIE(InfoExtractor):
},
}
+ def _login(self):
+ (username, password) = self._get_login_info()
+ if username is None:
+ return
+
+ login_form = {
+ 'form_id': 'user_login',
+ 'op': 'Log in',
+ 'name': username,
+ 'pass': password,
+ }
+
+ request = sanitized_Request(
+ self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
+ request.add_header('Referer', self._LOGIN_URL)
+ response = self._download_webpage(
+ request, None, 'Logging in as %s' % username)
+
+ login_error = self._html_search_regex(
+ r'(?s)<div class="messages error">(.+?)</div>',
+ response, 'login error', default=None)
+ if login_error:
+ raise ExtractorError(
+ 'Unable to login: %s' % login_error, expected=True)
+
+ def _real_initialize(self):
+ """Run the login step; _login returns early when no username is configured."""
+ self._login()
+
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- info_url = ('http://player-c.api.bambuser.com/getVideo.json?'
- '&api_key=%s&vid=%s' % (self._API_KEY, video_id))
- info_json = self._download_webpage(info_url, video_id)
- info = json.loads(info_json)['result']
+ video_id = self._match_id(url)
+
+ info = self._download_json(
+ 'http://player-c.api.bambuser.com/getVideo.json?api_key=%s&vid=%s'
+ % (self._API_KEY, video_id), video_id)
+
+ error = info.get('error')
+ if error:
+ raise ExtractorError(
+ '%s returned error: %s' % (self.IE_NAME, error), expected=True)
+
+ result = info['result']
return {
'id': video_id,
- 'title': info['title'],
- 'url': info['url'],
- 'thumbnail': info.get('preview'),
- 'duration': int(info['length']),
- 'view_count': int(info['views_total']),
- 'uploader': info['username'],
- 'uploader_id': info['owner']['uid'],
+ 'title': result['title'],
+ 'url': result['url'],
+ 'thumbnail': result.get('preview'),
+ 'duration': int_or_none(result.get('length')),
+ 'uploader': result.get('username'),
+ 'uploader_id': compat_str(result.get('owner', {}).get('uid')),
+ 'timestamp': int_or_none(result.get('created')),
+ 'fps': float_or_none(result.get('framerate')),
+ 'view_count': int_or_none(result.get('views_total')),
+ 'comment_count': int_or_none(result.get('comment_count')),
}
@@ -78,7 +126,7 @@ class BambuserChannelIE(InfoExtractor):
'&sort=created&access_mode=0%2C1%2C2&limit={count}'
'&method=broadcast&format=json&vid_older_than={last}'
).format(user=user, count=self._STEP, last=last_id)
- req = compat_urllib_request.Request(req_url)
+ req = sanitized_Request(req_url)
# Without setting this header, we wouldn't get any result
req.add_header('Referer', 'http://bambuser.com/channel/%s' % user)
data = self._download_json(
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py
index 490cc961a..c1ef8051d 100644
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -10,6 +10,8 @@ from ..compat import (
)
from ..utils import (
ExtractorError,
+ float_or_none,
+ int_or_none,
)
@@ -52,11 +54,11 @@ class BandcampIE(InfoExtractor):
ext, abr_str = format_id.split('-', 1)
formats.append({
'format_id': format_id,
- 'url': format_url,
+ 'url': self._proto_relative_url(format_url, 'http:'),
'ext': ext,
'vcodec': 'none',
'acodec': ext,
- 'abr': int(abr_str),
+ 'abr': int_or_none(abr_str),
})
self._sort_formats(formats)
@@ -65,14 +67,14 @@ class BandcampIE(InfoExtractor):
'id': compat_str(data['id']),
'title': data['title'],
'formats': formats,
- 'duration': float(data['duration']),
+ 'duration': float_or_none(data.get('duration')),
}
else:
raise ExtractorError('No free songs found')
download_link = m_download.group(1)
video_id = self._search_regex(
- r'(?ms)var TralbumData = {.*?id: (?P<id>\d+),?$',
+ r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$',
webpage, 'video id')
download_webpage = self._download_webpage(download_link, video_id, 'Downloading free downloads page')
@@ -93,8 +95,8 @@ class BandcampIE(InfoExtractor):
final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
# If we could correctly generate the .rand field the url would be
# in the "download_url" key
- final_url = self._search_regex(
- r'"retry_url":"(.*?)"', final_url_webpage, 'final video URL')
+ final_url = self._proto_relative_url(self._search_regex(
+ r'"retry_url":"(.+?)"', final_url_webpage, 'final video URL'), 'http:')
return {
'id': video_id,
@@ -109,7 +111,7 @@ class BandcampIE(InfoExtractor):
class BandcampAlbumIE(InfoExtractor):
IE_NAME = 'Bandcamp:album'
- _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<title>[^?#]+)|/?(?:$|[?#]))'
+ _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^?#]+)|/?(?:$|[?#]))'
_TESTS = [{
'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
@@ -133,31 +135,37 @@ class BandcampAlbumIE(InfoExtractor):
],
'info_dict': {
'title': 'Jazz Format Mixtape vol.1',
+ 'id': 'jazz-format-mixtape-vol-1',
+ 'uploader_id': 'blazo',
},
'params': {
'playlistend': 2
},
- 'skip': 'Bandcamp imposes download limits. See test_playlists:test_bandcamp_album for the playlist test'
+ 'skip': 'Bandcamp imposes download limits.'
}, {
'url': 'http://nightbringer.bandcamp.com/album/hierophany-of-the-open-grave',
'info_dict': {
'title': 'Hierophany of the Open Grave',
+ 'uploader_id': 'nightbringer',
+ 'id': 'hierophany-of-the-open-grave',
},
'playlist_mincount': 9,
}, {
'url': 'http://dotscale.bandcamp.com',
'info_dict': {
'title': 'Loom',
+ 'id': 'dotscale',
+ 'uploader_id': 'dotscale',
},
'playlist_mincount': 7,
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- playlist_id = mobj.group('subdomain')
- title = mobj.group('title')
- display_id = title or playlist_id
- webpage = self._download_webpage(url, display_id)
+ uploader_id = mobj.group('subdomain')
+ album_id = mobj.group('album_id')
+ playlist_id = album_id or uploader_id
+ webpage = self._download_webpage(url, playlist_id)
tracks_paths = re.findall(r'<a href="(.*?)" itemprop="url">', webpage)
if not tracks_paths:
raise ExtractorError('The page doesn\'t contain any tracks')
@@ -168,8 +176,8 @@ class BandcampAlbumIE(InfoExtractor):
r'album_title\s*:\s*"(.*?)"', webpage, 'title', fatal=False)
return {
'_type': 'playlist',
+ 'uploader_id': uploader_id,
'id': playlist_id,
- 'display_id': display_id,
'title': title,
'entries': entries,
}
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py
new file mode 100644
index 000000000..33b296eaf
--- /dev/null
+++ b/youtube_dl/extractor/bbc.py
@@ -0,0 +1,937 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ float_or_none,
+ int_or_none,
+ parse_duration,
+ parse_iso8601,
+ remove_end,
+ unescapeHTML,
+)
+from ..compat import (
+ compat_etree_fromstring,
+ compat_HTTPError,
+)
+
+
+class BBCCoUkIE(InfoExtractor):
+ IE_NAME = 'bbc.co.uk'
+ IE_DESC = 'BBC iPlayer'
+ _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:programmes/(?!articles/)|iplayer(?:/[^/]+)?/(?:episode/|playlist/))|music/clips[/#])(?P<id>[\da-z]{8})'
+
+ _MEDIASELECTOR_URLS = [
+ # Provides HQ HLS streams with even better quality than the pc mediaset, but fails
+ # with geolocation in some cases even when the programme is not geo restricted at all (e.g.
+ # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
+ 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
+ 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
+ ]
+
+ _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection'
+ _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
+
+ _NAMESPACES = (
+ _MEDIASELECTION_NS,
+ _EMP_PLAYLIST_NS,
+ )
+
+ _TESTS = [
+ {
+ 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
+ 'info_dict': {
+ 'id': 'b039d07m',
+ 'ext': 'flv',
+ 'title': 'Kaleidoscope, Leonard Cohen',
+ 'description': 'The Canadian poet and songwriter reflects on his musical career.',
+ 'duration': 1740,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ },
+ {
+ 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
+ 'info_dict': {
+ 'id': 'b00yng1d',
+ 'ext': 'flv',
+ 'title': 'The Man in Black: Series 3: The Printed Name',
+ 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
+ 'duration': 1800,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ 'skip': 'Episode is no longer available on BBC iPlayer Radio',
+ },
+ {
+ 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
+ 'info_dict': {
+ 'id': 'b00yng1d',
+ 'ext': 'flv',
+ 'title': 'The Voice UK: Series 3: Blind Auditions 5',
+ 'description': "Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.",
+ 'duration': 5100,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
+ },
+ {
+ 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
+ 'info_dict': {
+ 'id': 'b03k3pb7',
+ 'ext': 'flv',
+ 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
+ 'description': '2. Invasion',
+ 'duration': 3600,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
+ }, {
+ 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
+ 'info_dict': {
+ 'id': 'b04v209v',
+ 'ext': 'flv',
+ 'title': 'Pete Tong, The Essential New Tune Special',
+ 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
+ 'duration': 10800,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://www.bbc.co.uk/music/clips/p02frcc3',
+ 'note': 'Audio',
+ 'info_dict': {
+ 'id': 'p02frcch',
+ 'ext': 'flv',
+ 'title': 'Pete Tong, Past, Present and Future Special, Madeon - After Hours mix',
+ 'description': 'French house superstar Madeon takes us out of the club and onto the after party.',
+ 'duration': 3507,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
+ 'note': 'Video',
+ 'info_dict': {
+ 'id': 'p025c103',
+ 'ext': 'flv',
+ 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
+ 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
+ 'duration': 226,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
+ 'info_dict': {
+ 'id': 'p02n76xf',
+ 'ext': 'flv',
+ 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
+ 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
+ 'duration': 3540,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ 'skip': 'geolocation',
+ }, {
+ 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
+ 'info_dict': {
+ 'id': 'b05zmgw1',
+ 'ext': 'flv',
+ 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
+ 'title': 'Royal Academy Summer Exhibition',
+ 'duration': 3540,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ 'skip': 'geolocation',
+ }, {
+ # iptv-all mediaset fails with geolocation however there is no geo restriction
+ # for this programme at all
+ 'url': 'http://www.bbc.co.uk/programmes/b06bp7lf',
+ 'info_dict': {
+ 'id': 'b06bp7kf',
+ 'ext': 'flv',
+ 'title': "Annie Mac's Friday Night, B.Traits sits in for Annie",
+ 'description': 'B.Traits sits in for Annie Mac with a Mini-Mix from Disclosure.',
+ 'duration': 10800,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
+ 'only_matching': True,
+ }
+ ]
+
+ class MediaSelectionError(Exception):
+ # Raised by _extract_medias when the media selection document contains an
+ # error element; ``id`` holds that element's 'id' attribute.
+ def __init__(self, id):
+ self.id = id
+
+ def _extract_asx_playlist(self, connection, programme_id):
+ asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
+ return [ref.get('href') for ref in asx.findall('./Entry/ref')]
+
+ def _extract_connection(self, connection, programme_id):
+ """Turn one connection element into a list of format dicts.
+
+ Handles the 'http' protocol (ASX playlist, HLS, direct link; DASH is skipped)
+ and 'rtmp'. Any other protocol yields an empty list.
+ """
+ formats = []
+ kind = connection.get('kind')
+ protocol = connection.get('protocol')
+ supplier = connection.get('supplier')
+ if protocol == 'http':
+ href = connection.get('href')
+ transfer_format = connection.get('transferFormat')
+ # ASX playlist
+ if supplier == 'asx':
+ # One format per reference in the playlist, numbered in playlist order
+ for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
+ formats.append({
+ 'url': ref,
+ 'format_id': 'ref%s_%s' % (i, supplier),
+ })
+ # Skip DASH until supported
+ elif transfer_format == 'dash':
+ pass
+ elif transfer_format == 'hls':
+ # fatal=False: an m3u8 that cannot be fetched simply contributes no formats
+ m3u8_formats = self._extract_m3u8_formats(
+ href, programme_id, ext='mp4', entry_protocol='m3u8_native',
+ m3u8_id=supplier, fatal=False)
+ if m3u8_formats:
+ formats.extend(m3u8_formats)
+ # Direct link
+ else:
+ formats.append({
+ 'url': href,
+ 'format_id': supplier or kind or protocol,
+ })
+ elif protocol == 'rtmp':
+ # 'application' falls back to 'ondemand' when the attribute is absent
+ application = connection.get('application', 'ondemand')
+ auth_string = connection.get('authString')
+ identifier = connection.get('identifier')
+ server = connection.get('server')
+ # The auth string is appended as a query to both the URL and the app name
+ formats.append({
+ 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
+ 'play_path': identifier,
+ 'app': '%s?%s' % (application, auth_string),
+ 'page_url': 'http://www.bbc.co.uk',
+ 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
+ 'rtmp_live': False,
+ 'ext': 'flv',
+ 'format_id': supplier,
+ })
+ return formats
+
+ def _extract_items(self, playlist):
+ return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
+
+ def _findall_ns(self, element, xpath):
+ elements = []
+ for ns in self._NAMESPACES:
+ elements.extend(element.findall(xpath % ns))
+ return elements
+
+ def _extract_medias(self, media_selection):
+ error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS)
+ if error is None:
+ media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS)
+ if error is not None:
+ raise BBCCoUkIE.MediaSelectionError(error.get('id'))
+ return self._findall_ns(media_selection, './{%s}media')
+
+ def _extract_connections(self, media):
+ return self._findall_ns(media, './{%s}connection')
+
def _extract_video(self, media, programme_id):
    """Build the formats for a video <media> element.

    Every format produced by the element's connections is annotated with
    the element's dimensions, bitrate, codec and file size; the format id
    is prefixed with the service name when one is present.
    """
    service = media.get('service')
    # Attributes shared by every format coming from this media element
    common_fields = {
        'width': int_or_none(media.get('width')),
        'height': int_or_none(media.get('height')),
        'vbr': int_or_none(media.get('bitrate')),
        'vcodec': media.get('encoding'),
        'filesize': int_or_none(media.get('media_file_size')),
    }
    video_formats = []
    for connection in self._extract_connections(media):
        for fmt in self._extract_connection(connection, programme_id):
            fmt.update(common_fields)
            if service:
                fmt['format_id'] = '%s_%s' % (service, fmt['format_id'])
            video_formats.append(fmt)
    return video_formats
+
def _extract_audio(self, media, programme_id):
    """Build the formats for an audio <media> element.

    Every format produced by the element's connections is annotated with
    the element's bitrate and codec. The format id is prefixed with the
    service name only when the element has one, matching _extract_video;
    previously a missing service produced ids such as 'None_akamai'.
    """
    abr = int_or_none(media.get('bitrate'))
    acodec = media.get('encoding')
    service = media.get('service')
    formats = []
    for connection in self._extract_connections(media):
        conn_formats = self._extract_connection(connection, programme_id)
        for fmt in conn_formats:
            fmt.update({
                'abr': abr,
                'acodec': acodec,
            })
            if service:
                fmt['format_id'] = '%s_%s' % (service, fmt['format_id'])
        formats.extend(conn_formats)
    return formats
+
+ def _get_subtitles(self, media, programme_id):
+ subtitles = {}
+ for connection in self._extract_connections(media):
+ captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
+ lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
+ subtitles[lang] = [
+ {
+ 'url': connection.get('href'),
+ 'ext': 'ttml',
+ },
+ ]
+ return subtitles
+
def _raise_extractor_error(self, media_selection_error):
    """Re-raise a media selection error as an expected ExtractorError."""
    message = '%s returned error: %s' % (self.IE_NAME, media_selection_error.id)
    raise ExtractorError(message, expected=True)
+
+ def _download_media_selector(self, programme_id):
+ last_exception = None
+ for mediaselector_url in self._MEDIASELECTOR_URLS:
+ try:
+ return self._download_media_selector_url(
+ mediaselector_url % programme_id, programme_id)
+ except BBCCoUkIE.MediaSelectionError as e:
+ if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
+ last_exception = e
+ continue
+ self._raise_extractor_error(e)
+ self._raise_extractor_error(last_exception)
+
+ def _download_media_selector_url(self, url, programme_id=None):
+ try:
+ media_selection = self._download_xml(
+ url, programme_id, 'Downloading media selection XML')
+ except ExtractorError as ee:
+ if isinstance(ee.cause, compat_HTTPError) and ee.cause.code in (403, 404):
+ media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8'))
+ else:
+ raise
+ return self._process_media_selector(media_selection, programme_id)
+
+ def _process_media_selector(self, media_selection, programme_id):
+ formats = []
+ subtitles = None
+
+ for media in self._extract_medias(media_selection):
+ kind = media.get('kind')
+ if kind == 'audio':
+ formats.extend(self._extract_audio(media, programme_id))
+ elif kind == 'video':
+ formats.extend(self._extract_video(media, programme_id))
+ elif kind == 'captions':
+ subtitles = self.extract_subtitles(media, programme_id)
+ return formats, subtitles
+
+ def _download_playlist(self, playlist_id):
+ try:
+ playlist = self._download_json(
+ 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
+ playlist_id, 'Downloading playlist JSON')
+
+ version = playlist.get('defaultAvailableVersion')
+ if version:
+ smp_config = version['smpConfig']
+ title = smp_config['title']
+ description = smp_config['summary']
+ for item in smp_config['items']:
+ kind = item['kind']
+ if kind != 'programme' and kind != 'radioProgramme':
+ continue
+ programme_id = item.get('vpid')
+ duration = int_or_none(item.get('duration'))
+ formats, subtitles = self._download_media_selector(programme_id)
+ return programme_id, title, description, duration, formats, subtitles
+ except ExtractorError as ee:
+ if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
+ raise
+
+ # fallback to legacy playlist
+ return self._process_legacy_playlist(playlist_id)
+
+ def _process_legacy_playlist_url(self, url, display_id):
+ playlist = self._download_legacy_playlist_url(url, display_id)
+ return self._extract_from_legacy_playlist(playlist, display_id)
+
+ def _process_legacy_playlist(self, playlist_id):
+ return self._process_legacy_playlist_url(
+ 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
+
+ def _download_legacy_playlist_url(self, url, playlist_id=None):
+ return self._download_xml(
+ url, playlist_id, 'Downloading legacy playlist XML')
+
def _extract_from_legacy_playlist(self, playlist, playlist_id):
    """Extract the first programme of a legacy EMP playlist.

    Returns ``(programme_id, title, description, duration, formats,
    subtitles)`` for the first item of kind programme or radioProgramme.
    Raises an expected ExtractorError when the playlist carries a
    <noItems> element. Returns None implicitly when no such item exists.
    """
    no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS)
    if no_items is not None:
        reason = no_items.get('reason')
        if reason == 'preAvailability':
            msg = 'Episode %s is not yet available' % playlist_id
        elif reason == 'postAvailability':
            msg = 'Episode %s is no longer available' % playlist_id
        elif reason == 'noMedia':
            msg = 'Episode %s is not currently available' % playlist_id
        else:
            msg = 'Episode %s is not available: %s' % (playlist_id, reason)
        raise ExtractorError(msg, expected=True)

    def get_from_attributes(element):
        # First attribute shaped like [pb] followed by 7 of [0-9a-z] wins
        for p in ('identifier', 'group'):
            value = element.get(p)
            if value and re.match(r'^[pb][\da-z]{7}$', value):
                return value
        return None

    def get_programme_id(item):
        # The mediator child keeps priority, which preserves every id that
        # was resolved before. The item's own attributes used to be looked
        # up and then thrown away; they now serve as the fallback.
        mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS)
        if mediator is not None:
            mediator_id = get_from_attributes(mediator)
            if mediator_id:
                return mediator_id
        return get_from_attributes(item)

    for item in self._extract_items(playlist):
        kind = item.get('kind')
        if kind != 'programme' and kind != 'radioProgramme':
            continue
        title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text
        description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS)
        description = description_el.text if description_el is not None else None

        programme_id = get_programme_id(item)
        duration = int_or_none(item.get('duration'))

        if programme_id:
            formats, subtitles = self._download_media_selector(programme_id)
        else:
            # No usable id: the item itself is treated as the media selection
            formats, subtitles = self._process_media_selector(item, playlist_id)
            programme_id = playlist_id

        return programme_id, title, description, duration, formats, subtitles
+
def _real_extract(self, url):
    """Extract a single bbc.co.uk programme.

    The vpid is taken from the player JSON bound on the page, then from a
    bare "vpid" key in the page source; when neither is present the
    playlist for the id in the URL is downloaded instead.
    """
    group_id = self._match_id(url)

    webpage = self._download_webpage(url, group_id, 'Downloading video page')

    programme_id = None
    # Must exist up front: it is only assigned from the player JSON or by
    # the playlist fallback, yet the result below always reads it. Finding
    # the vpid through the regex alone used to raise UnboundLocalError.
    duration = None

    tviplayer = self._search_regex(
        r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
        webpage, 'player', default=None)

    if tviplayer:
        player = self._parse_json(tviplayer, group_id).get('player', {})
        duration = int_or_none(player.get('duration'))
        programme_id = player.get('vpid')

    if not programme_id:
        programme_id = self._search_regex(
            r'"vpid"\s*:\s*"([\da-z]{8})"', webpage, 'vpid', fatal=False, default=None)

    if programme_id:
        formats, subtitles = self._download_media_selector(programme_id)
        title = self._og_search_title(webpage)
        description = self._search_regex(
            r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
            webpage, 'description', fatal=False)
    else:
        programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)

    self._sort_formats(formats)

    return {
        'id': programme_id,
        'title': title,
        'description': description,
        'thumbnail': self._og_search_thumbnail(webpage, default=None),
        'duration': duration,
        'formats': formats,
        'subtitles': subtitles,
    }
+
+
+class BBCIE(BBCCoUkIE):
+ IE_NAME = 'bbc'
+ IE_DESC = 'BBC'
+ _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
+
+ _MEDIASELECTOR_URLS = [
+ # Provides HQ HLS streams but in some cases fails with a geolocation error even
+ # though the content is not geo restricted at all
+ 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
+ # Provides more formats, namely direct mp4 links, but fails on some videos with
+ # notukerror for non UK (?) users (e.g.
+ # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
+ 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s',
+ # Provides fewer formats, but works everywhere for everybody (hopefully)
+ 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s',
+ ]
+
+ _TESTS = [{
+ # article with multiple videos embedded with data-playable containing vpids
+ 'url': 'http://www.bbc.com/news/world-europe-32668511',
+ 'info_dict': {
+ 'id': 'world-europe-32668511',
+ 'title': 'Russia stages massive WW2 parade despite Western boycott',
+ 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
+ },
+ 'playlist_count': 2,
+ }, {
+ # article with multiple videos embedded with data-playable (more videos)
+ 'url': 'http://www.bbc.com/news/business-28299555',
+ 'info_dict': {
+ 'id': 'business-28299555',
+ 'title': 'Farnborough Airshow: Video highlights',
+ 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
+ },
+ 'playlist_count': 9,
+ 'skip': 'Save time',
+ }, {
+ # article with multiple videos embedded with `new SMP()`
+ # broken
+ 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
+ 'info_dict': {
+ 'id': '3662a707-0af9-3149-963f-47bea720b460',
+ 'title': 'BBC Blogs - Adam Curtis - BUGGER',
+ },
+ 'playlist_count': 18,
+ }, {
+ # single video embedded with data-playable containing vpid
+ 'url': 'http://www.bbc.com/news/world-europe-32041533',
+ 'info_dict': {
+ 'id': 'p02mprgb',
+ 'ext': 'mp4',
+ 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
+ 'description': 'md5:2868290467291b37feda7863f7a83f54',
+ 'duration': 47,
+ 'timestamp': 1427219242,
+ 'upload_date': '20150324',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ # article with single video embedded with data-playable containing XML playlist
+ # with direct video links as progressiveDownloadUrl (for now these are extracted)
+ # and playlist with f4m and m3u8 as streamingUrl
+ 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
+ 'info_dict': {
+ 'id': '150615_telabyad_kentin_cogu',
+ 'ext': 'mp4',
+ 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
+ 'timestamp': 1434397334,
+ 'upload_date': '20150615',
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ }, {
+ # single video embedded with data-playable containing XML playlists (regional section)
+ 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
+ 'info_dict': {
+ 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
+ 'ext': 'mp4',
+ 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
+ 'timestamp': 1434713142,
+ 'upload_date': '20150619',
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ }, {
+ # single video from video playlist embedded with vxp-playlist-data JSON
+ 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
+ 'info_dict': {
+ 'id': 'p02w6qjc',
+ 'ext': 'mp4',
+ 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
+ 'duration': 56,
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ }, {
+ # single video story with digitalData
+ 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
+ 'info_dict': {
+ 'id': 'p02q6gc4',
+ 'ext': 'flv',
+ 'title': 'Sri Lanka’s spicy secret',
+ 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
+ 'timestamp': 1437674293,
+ 'upload_date': '20150723',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ # single video story without digitalData
+ 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
+ 'info_dict': {
+ 'id': 'p018zqqg',
+ 'ext': 'mp4',
+ 'title': 'Hyundai Santa Fe Sport: Rock star',
+ 'description': 'md5:b042a26142c4154a6e472933cf20793d',
+ 'timestamp': 1415867444,
+ 'upload_date': '20141113',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ # single video with playlist.sxml URL in playlist param
+ 'url': 'http://www.bbc.com/sport/0/football/33653409',
+ 'info_dict': {
+ 'id': 'p02xycnp',
+ 'ext': 'mp4',
+ 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
+ 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
+ 'duration': 140,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ # article with multiple videos embedded with playlist.sxml in playlist param
+ 'url': 'http://www.bbc.com/sport/0/football/34475836',
+ 'info_dict': {
+ 'id': '34475836',
+ 'title': 'What Liverpool can expect from Klopp',
+ },
+ 'playlist_count': 3,
+ }, {
+ # single video with playlist URL from weather section
+ 'url': 'http://www.bbc.com/weather/features/33601775',
+ 'only_matching': True,
+ }, {
+ # custom redirection to www.bbc.com
+ 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
+ 'only_matching': True,
+ }]
+
@classmethod
def suitable(cls, url):
    """Decline URLs claimed by the more specific BBC extractors."""
    if BBCCoUkIE.suitable(url) or BBCCoUkArticleIE.suitable(url):
        return False
    return super(BBCIE, cls).suitable(url)
+
+ def _extract_from_media_meta(self, media_meta, video_id):
+ # Direct links to media in media metadata (e.g.
+ # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
+ # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
+ source_files = media_meta.get('sourceFiles')
+ if source_files:
+ return [{
+ 'url': f['url'],
+ 'format_id': format_id,
+ 'ext': f.get('encoding'),
+ 'tbr': float_or_none(f.get('bitrate'), 1000),
+ 'filesize': int_or_none(f.get('filesize')),
+ } for format_id, f in source_files.items() if f.get('url')], []
+
+ programme_id = media_meta.get('externalId')
+ if programme_id:
+ return self._download_media_selector(programme_id)
+
+ # Process playlist.sxml as legacy playlist
+ href = media_meta.get('href')
+ if href:
+ playlist = self._download_legacy_playlist_url(href)
+ _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
+ return formats, subtitles
+
+ return [], []
+
+ def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
+ programme_id, title, description, duration, formats, subtitles = \
+ self._process_legacy_playlist_url(url, playlist_id)
+ self._sort_formats(formats)
+ return {
+ 'id': programme_id,
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+
def _real_extract(self, url):
    """Extract one video or a playlist of videos from a generic BBC page.

    Strategies are tried in this order:
      1. playlist.sxml <param> embeds and data-playable attributes
      2. a single vpid present on the page
      3. `new SMP()` / setPlaylist embeds, delegated to BBCCoUk
      4. media metadata from data-media-meta, mediaAssetPage.init or
         vxp-playlist-data
    The result of strategy 4 is always a playlist, possibly empty.
    """
    playlist_id = self._match_id(url)

    webpage = self._download_webpage(url, playlist_id)

    timestamp = None
    playlist_title = None
    playlist_description = None

    ld = self._parse_json(
        self._search_regex(
            r'(?s)<script type="application/ld\+json">(.+?)</script>',
            webpage, 'ld json', default='{}'),
        playlist_id, fatal=False)
    # Only a mapping supports the .get() lookups below; any other JSON
    # value (e.g. a list) is ignored instead of raising AttributeError.
    if ld and isinstance(ld, dict):
        timestamp = parse_iso8601(ld.get('datePublished'))
        playlist_title = ld.get('headline')
        playlist_description = ld.get('articleBody')

    if not timestamp:
        timestamp = parse_iso8601(self._search_regex(
            [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
             r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
             r'"datePublished":\s*"([^"]+)'],
            webpage, 'date', default=None))

    entries = []

    # article with multiple videos embedded with playlist.sxml (e.g.
    # http://www.bbc.com/sport/0/football/34475836)
    playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
    if playlists:
        entries = [
            self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
            for playlist_url in playlists]

    # news article with multiple videos embedded with data-playable
    data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
    for _, data_playable_json in data_playables:
        data_playable = self._parse_json(
            unescapeHTML(data_playable_json), playlist_id, fatal=False)
        if not data_playable:
            continue
        settings = data_playable.get('settings', {})
        if not settings:
            continue
        # data-playable with video vpid in settings.playlistObject.items (e.g.
        # http://www.bbc.com/news/world-us-canada-34473351)
        playlist_object = settings.get('playlistObject', {})
        if playlist_object:
            items = playlist_object.get('items')
            if items and isinstance(items, list):
                programme_id = items[0].get('vpid')
                # Without a vpid there is nothing to request from the
                # media selector, so the embed is skipped.
                if not programme_id:
                    continue
                title = playlist_object['title']
                description = playlist_object.get('summary')
                duration = int_or_none(items[0].get('duration'))
                formats, subtitles = self._download_media_selector(programme_id)
                self._sort_formats(formats)
                entries.append({
                    'id': programme_id,
                    'title': title,
                    'description': description,
                    'timestamp': timestamp,
                    'duration': duration,
                    'formats': formats,
                    'subtitles': subtitles,
                })
        else:
            # NOTE(review): this branch is paired with `if playlist_object`;
            # confirm against the properly indented file.
            # data-playable without vpid but with a playlist.sxml URLs
            # in otherSettings.playlist (e.g.
            # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
            playlist = data_playable.get('otherSettings', {}).get('playlist', {})
            # A playlist without progressiveDownloadUrl used to hand None
            # to the downloader; such embeds are skipped now.
            sxml_url = playlist.get('progressiveDownloadUrl') if playlist else None
            if sxml_url:
                entries.append(self._extract_from_playlist_sxml(
                    sxml_url, playlist_id, timestamp))

    if entries:
        playlist_title = playlist_title or remove_end(self._og_search_title(webpage), ' - BBC News')
        playlist_description = playlist_description or self._og_search_description(webpage, default=None)
        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)

    # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
    programme_id = self._search_regex(
        [r'data-video-player-vpid="([\da-z]{8})"',
         r'<param[^>]+name="externalIdentifier"[^>]+value="([\da-z]{8})"'],
        webpage, 'vpid', default=None)

    if programme_id:
        formats, subtitles = self._download_media_selector(programme_id)
        self._sort_formats(formats)
        # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
        # and, being parsed with fatal=False, may also come back as None
        digital_data = self._parse_json(
            self._search_regex(
                r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
            programme_id, fatal=False) or {}
        page_info = digital_data.get('page', {}).get('pageInfo', {})
        title = page_info.get('pageName') or self._og_search_title(webpage)
        description = page_info.get('description') or self._og_search_description(webpage)
        timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
        return {
            'id': programme_id,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        }

    playlist_title = self._html_search_regex(
        r'<title>(.*?)(?:\s*-\s*BBC [^ ]+)?</title>', webpage, 'playlist title')
    playlist_description = self._og_search_description(webpage, default=None)

    def extract_all(pattern):
        # Parse every match as JSON and drop the ones that fail to parse
        return list(filter(None, map(
            lambda s: self._parse_json(s, playlist_id, fatal=False),
            re.findall(pattern, webpage))))

    # Multiple video article (e.g.
    # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
    EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+[\da-z]{8}(?:\b[^"]+)?'
    entries = []
    for match in extract_all(r'new\s+SMP\(({.+?})\)'):
        embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
        if embed_url and re.match(EMBED_URL, embed_url):
            entries.append(embed_url)
    entries.extend(re.findall(
        r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
    if entries:
        return self.playlist_result(
            [self.url_result(entry, 'BBCCoUk') for entry in entries],
            playlist_id, playlist_title, playlist_description)

    # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
    medias = extract_all(r"data-media-meta='({[^']+})'")

    if not medias:
        # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
        media_asset = self._search_regex(
            r'mediaAssetPage\.init\(\s*({.+?}), "/',
            webpage, 'media asset', default=None)
        if media_asset:
            media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False)
            medias = []
            # fatal=False means a parse failure yields None, not an exception
            if media_asset_page:
                for video in media_asset_page.get('videos', {}).values():
                    medias.extend(video.values())

    if not medias:
        # Multiple video playlist with single `now playing` entry (e.g.
        # http://www.bbc.com/news/video_and_audio/must_see/33767813)
        vxp_playlist = self._parse_json(
            self._search_regex(
                r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
                webpage, 'playlist data'),
            playlist_id)
        playlist_medias = []
        for item in vxp_playlist:
            media = item.get('media')
            if not media:
                continue
            playlist_medias.append(media)
            # Download single video if found media with asset id matching the video id from URL
            if item.get('advert', {}).get('assetId') == playlist_id:
                medias = [media]
                break
        # Fallback to the whole playlist
        if not medias:
            medias = playlist_medias

    entries = []
    for num, media_meta in enumerate(medias, start=1):
        formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
        if not formats:
            continue
        self._sort_formats(formats)

        video_id = media_meta.get('externalId')
        if not video_id:
            video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)

        title = media_meta.get('caption')
        if not title:
            title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)

        duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))

        images = []
        for image in media_meta.get('images', {}).values():
            images.extend(image.values())
        if 'image' in media_meta:
            images.append(media_meta['image'])

        thumbnails = [{
            'url': image.get('href'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in images]

        entries.append({
            'id': video_id,
            'title': title,
            'thumbnails': thumbnails,
            'duration': duration,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subtitles,
        })

    return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
+
+
class BBCCoUkArticleIE(InfoExtractor):
    """Playlist extractor for bbc.co.uk/programmes/articles/<id> pages.

    Every clip embedded in the article becomes a url_result entry.
    """
    # Raw string with escaped dots; https and a missing "www." are accepted
    # in addition to the URLs matched before.
    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
    IE_NAME = 'bbc.co.uk:article'
    IE_DESC = 'BBC articles'

    _TEST = {
        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
        'info_dict': {
            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
        },
        'playlist_count': 4,
        'add_ie': ['BBCCoUk'],
    }

    def _real_extract(self, url):
        """Return a playlist of the clips referenced by the article page."""
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        title = self._og_search_title(webpage)
        description = self._og_search_description(webpage)
        # Guard in case no og:description was found and the helper
        # returned None; calling .strip() on it would raise.
        if description is not None:
            description = description.strip()

        entries = [self.url_result(programme_url) for programme_url in re.findall(
            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]

        return self.playlist_result(entries, playlist_id, title, description)
diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py
deleted file mode 100644
index 126c8824c..000000000
--- a/youtube_dl/extractor/bbccouk.py
+++ /dev/null
@@ -1,340 +0,0 @@
-from __future__ import unicode_literals
-
-import xml.etree.ElementTree
-
-from .subtitles import SubtitlesInfoExtractor
-from ..utils import ExtractorError
-from ..compat import compat_HTTPError
-
-
-class BBCCoUkIE(SubtitlesInfoExtractor):
- IE_NAME = 'bbc.co.uk'
- IE_DESC = 'BBC iPlayer'
- _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})'
-
- _TESTS = [
- {
- 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
- 'info_dict': {
- 'id': 'b039d07m',
- 'ext': 'flv',
- 'title': 'Kaleidoscope, Leonard Cohen',
- 'description': 'The Canadian poet and songwriter reflects on his musical career.',
- 'duration': 1740,
- },
- 'params': {
- # rtmp download
- 'skip_download': True,
- }
- },
- {
- 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
- 'info_dict': {
- 'id': 'b00yng1d',
- 'ext': 'flv',
- 'title': 'The Man in Black: Series 3: The Printed Name',
- 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
- 'duration': 1800,
- },
- 'params': {
- # rtmp download
- 'skip_download': True,
- },
- 'skip': 'Episode is no longer available on BBC iPlayer Radio',
- },
- {
- 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
- 'info_dict': {
- 'id': 'b00yng1d',
- 'ext': 'flv',
- 'title': 'The Voice UK: Series 3: Blind Auditions 5',
- 'description': "Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.",
- 'duration': 5100,
- },
- 'params': {
- # rtmp download
- 'skip_download': True,
- },
- 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
- },
- {
- 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
- 'info_dict': {
- 'id': 'b03k3pb7',
- 'ext': 'flv',
- 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
- 'description': '2. Invasion',
- 'duration': 3600,
- },
- 'params': {
- # rtmp download
- 'skip_download': True,
- },
- 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
- }, {
- 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
- 'info_dict': {
- 'id': 'b04v209v',
- 'ext': 'flv',
- 'title': 'Pete Tong, The Essential New Tune Special',
- 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
- 'duration': 10800,
- },
- 'params': {
- # rtmp download
- 'skip_download': True,
- }
- }, {
- 'url': 'http://www.bbc.co.uk/music/clips/p02frcc3',
- 'note': 'Audio',
- 'info_dict': {
- 'id': 'p02frcch',
- 'ext': 'flv',
- 'title': 'Pete Tong, Past, Present and Future Special, Madeon - After Hours mix',
- 'description': 'French house superstar Madeon takes us out of the club and onto the after party.',
- 'duration': 3507,
- },
- 'params': {
- # rtmp download
- 'skip_download': True,
- }
- }, {
- 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
- 'note': 'Video',
- 'info_dict': {
- 'id': 'p025c103',
- 'ext': 'flv',
- 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
- 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
- 'duration': 226,
- },
- 'params': {
- # rtmp download
- 'skip_download': True,
- }
- }, {
- 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
- 'only_matching': True,
- }, {
- 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
- 'only_matching': True,
- }, {
- 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
- 'only_matching': True,
- }
- ]
-
- def _extract_asx_playlist(self, connection, programme_id):
- asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
- return [ref.get('href') for ref in asx.findall('./Entry/ref')]
-
- def _extract_connection(self, connection, programme_id):
- formats = []
- protocol = connection.get('protocol')
- supplier = connection.get('supplier')
- if protocol == 'http':
- href = connection.get('href')
- # ASX playlist
- if supplier == 'asx':
- for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
- formats.append({
- 'url': ref,
- 'format_id': 'ref%s_%s' % (i, supplier),
- })
- # Direct link
- else:
- formats.append({
- 'url': href,
- 'format_id': supplier,
- })
- elif protocol == 'rtmp':
- application = connection.get('application', 'ondemand')
- auth_string = connection.get('authString')
- identifier = connection.get('identifier')
- server = connection.get('server')
- formats.append({
- 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
- 'play_path': identifier,
- 'app': '%s?%s' % (application, auth_string),
- 'page_url': 'http://www.bbc.co.uk',
- 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
- 'rtmp_live': False,
- 'ext': 'flv',
- 'format_id': supplier,
- })
- return formats
-
- def _extract_items(self, playlist):
- return playlist.findall('./{http://bbc.co.uk/2008/emp/playlist}item')
-
- def _extract_medias(self, media_selection):
- error = media_selection.find('./{http://bbc.co.uk/2008/mp/mediaselection}error')
- if error is not None:
- raise ExtractorError(
- '%s returned error: %s' % (self.IE_NAME, error.get('id')), expected=True)
- return media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media')
-
- def _extract_connections(self, media):
- return media.findall('./{http://bbc.co.uk/2008/mp/mediaselection}connection')
-
- def _extract_video(self, media, programme_id):
- formats = []
- vbr = int(media.get('bitrate'))
- vcodec = media.get('encoding')
- service = media.get('service')
- width = int(media.get('width'))
- height = int(media.get('height'))
- file_size = int(media.get('media_file_size'))
- for connection in self._extract_connections(media):
- conn_formats = self._extract_connection(connection, programme_id)
- for format in conn_formats:
- format.update({
- 'format_id': '%s_%s' % (service, format['format_id']),
- 'width': width,
- 'height': height,
- 'vbr': vbr,
- 'vcodec': vcodec,
- 'filesize': file_size,
- })
- formats.extend(conn_formats)
- return formats
-
- def _extract_audio(self, media, programme_id):
- """Build the format list for one audio ``media`` element.
-
- Every format produced for each connection of *media* gets the audio
- bitrate and codec from the element's attributes, and its ``format_id``
- is prefixed with the ``service`` name.
- """
- formats = []
- # NOTE(review): int() raises TypeError if 'bitrate' is absent - confirm
- # the attribute is always present.
- abr = int(media.get('bitrate'))
- acodec = media.get('encoding')
- service = media.get('service')
- for connection in self._extract_connections(media):
- conn_formats = self._extract_connection(connection, programme_id)
- for format in conn_formats:
- format.update({
- 'format_id': '%s_%s' % (service, format['format_id']),
- 'abr': abr,
- 'acodec': acodec,
- })
- formats.extend(conn_formats)
- return formats
-
- def _extract_captions(self, media, programme_id):
- """Download the caption documents of *media* and convert them to text.
-
- Returns a dict mapping the document's ``xml:lang`` value ('en' when
- the attribute is missing) to an SRT-like string built from its
- ``body/div/p`` elements (ttaf1 namespace).
- """
- subtitles = {}
- for connection in self._extract_connections(media):
- captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
- lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
- ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}'))
- srt = ''
- # Cue numbers come straight from enumerate(), so they start at 0, and
- # the begin/end attribute values are copied through unchanged.
- # Only the paragraph's leading text (p.text) is used; text nested in
- # child elements is not included.
- for pos, p in enumerate(ps):
- srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'),
- p.text.strip() if p.text is not None else '')
- # A later connection reporting the same language replaces the earlier entry.
- subtitles[lang] = srt
- return subtitles
-
- def _download_media_selector(self, programme_id):
-     """Download the media selector XML for a programme and extract its media.
-
-     Returns a tuple ``(formats, subtitles)``: the combined format list of
-     all audio and video ``media`` elements, and the captions dict of the
-     last ``captions`` element (``None`` when there is none).
-
-     Raises ExtractorError when the download fails for any reason other
-     than HTTP 403, or (via _extract_medias) when the document carries an
-     ``error`` element.
-     """
-     try:
-         media_selection = self._download_xml(
-             'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' % programme_id,
-             programme_id, 'Downloading media selection XML')
-     except ExtractorError as ee:
-         if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
-             # A 403 response still carries a media selection document
-             # (with an <error> element), so parse its body instead of
-             # failing here. The body is already a byte string: hand it
-             # to the parser as-is rather than calling .encode() on it,
-             # which fails on Python 3 and on non-ASCII data on Python 2.
-             media_selection = xml.etree.ElementTree.fromstring(ee.cause.read())
-         else:
-             raise
-
-     formats = []
-     subtitles = None
-
-     for media in self._extract_medias(media_selection):
-         kind = media.get('kind')
-         if kind == 'audio':
-             formats.extend(self._extract_audio(media, programme_id))
-         elif kind == 'video':
-             formats.extend(self._extract_video(media, programme_id))
-         elif kind == 'captions':
-             subtitles = self._extract_captions(media, programme_id)
-
-     return formats, subtitles
-
- def _download_playlist(self, playlist_id):
-     """Resolve a playlist id to programme metadata, formats and subtitles.
-
-     The JSON playlist is tried first. When it has no
-     ``defaultAvailableVersion``, or the request fails with HTTP 404, the
-     legacy XML playlist is used instead.
-
-     Returns a tuple ``(programme_id, title, description, duration,
-     formats, subtitles)``.
-
-     Raises ExtractorError (flagged as expected) when the legacy playlist
-     reports ``noItems``; any JSON playlist failure other than HTTP 404 is
-     re-raised unchanged.
-     """
-     try:
-         playlist = self._download_json(
-             'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
-             playlist_id, 'Downloading playlist JSON')
-
-         version = playlist.get('defaultAvailableVersion')
-         if version:
-             smp_config = version['smpConfig']
-             title = smp_config['title']
-             description = smp_config['summary']
-             for item in smp_config['items']:
-                 kind = item['kind']
-                 if kind != 'programme' and kind != 'radioProgramme':
-                     continue
-                 programme_id = item.get('vpid')
-                 duration = int(item.get('duration'))
-                 formats, subtitles = self._download_media_selector(programme_id)
-             return programme_id, title, description, duration, formats, subtitles
-     except ExtractorError as ee:
-         # Only a 404 on the JSON playlist means "fall back to the legacy
-         # playlist"; everything else must propagate. The parentheses are
-         # required: without them ``not`` applies to isinstance() alone, so
-         # non-404 HTTP errors were swallowed and non-HTTP causes went on
-         # to evaluate ``ee.cause.code``.
-         if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
-             raise
-
-     # fallback to legacy playlist
-     playlist = self._download_xml(
-         'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id,
-         playlist_id, 'Downloading legacy playlist XML')
-
-     no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems')
-     if no_items is not None:
-         reason = no_items.get('reason')
-         if reason == 'preAvailability':
-             msg = 'Episode %s is not yet available' % playlist_id
-         elif reason == 'postAvailability':
-             msg = 'Episode %s is no longer available' % playlist_id
-         elif reason == 'noMedia':
-             msg = 'Episode %s is not currently available' % playlist_id
-         else:
-             msg = 'Episode %s is not available: %s' % (playlist_id, reason)
-         raise ExtractorError(msg, expected=True)
-
-     # NOTE(review): if no item is a (radio) programme, the names returned
-     # below are never bound - confirm whether that can happen in practice.
-     for item in self._extract_items(playlist):
-         kind = item.get('kind')
-         if kind != 'programme' and kind != 'radioProgramme':
-             continue
-         title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text
-         description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text
-         programme_id = item.get('identifier')
-         duration = int(item.get('duration'))
-         formats, subtitles = self._download_media_selector(programme_id)
-
-     return programme_id, title, description, duration, formats, subtitles
-
- def _real_extract(self, url):
- """Extract programme metadata, formats and subtitles for *url*.
-
- Returns the info dict, or None after listing subtitles when the
- ``listsubtitles`` downloader parameter is set.
- """
- group_id = self._match_id(url)
-
- webpage = self._download_webpage(url, group_id, 'Downloading video page')
-
- # A "vpid" embedded in the page identifies the programme directly; its
- # absence is not an error (default=None) and selects the playlist path.
- programme_id = self._search_regex(
- r'"vpid"\s*:\s*"([\da-z]{8})"', webpage, 'vpid', fatal=False, default=None)
- if programme_id:
- # Metadata comes from the episode JSON's jsConf.player object.
- player = self._download_json(
- 'http://www.bbc.co.uk/iplayer/episode/%s.json' % group_id,
- group_id)['jsConf']['player']
- title = player['title']
- description = player['subtitle']
- duration = player['duration']
- formats, subtitles = self._download_media_selector(programme_id)
- else:
- programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
-
- if self._downloader.params.get('listsubtitles', False):
- self._list_available_subtitles(programme_id, subtitles)
- return
-
- self._sort_formats(formats)
-
- return {
- 'id': programme_id,
- 'title': title,
- 'description': description,
- 'duration': duration,
- 'formats': formats,
- 'subtitles': subtitles,
- }
diff --git a/youtube_dl/extractor/beatportpro.py b/youtube_dl/extractor/beatportpro.py
new file mode 100644
index 000000000..3c7775d3e
--- /dev/null
+++ b/youtube_dl/extractor/beatportpro.py
@@ -0,0 +1,103 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import int_or_none
+
+
+class BeatportProIE(InfoExtractor):
+    """Extractor for track pages on pro.beatport.com (preview streams)."""
+
+    _VALID_URL = r'https?://pro\.beatport\.com/track/(?P<display_id>[^/]+)/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'https://pro.beatport.com/track/synesthesia-original-mix/5379371',
+        'md5': 'b3c34d8639a2f6a7f734382358478887',
+        'info_dict': {
+            'id': '5379371',
+            'display_id': 'synesthesia-original-mix',
+            'ext': 'mp4',
+            'title': 'Froxic - Synesthesia (Original Mix)',
+        },
+    }, {
+        'url': 'https://pro.beatport.com/track/love-and-war-original-mix/3756896',
+        'md5': 'e44c3025dfa38c6577fbaeb43da43514',
+        'info_dict': {
+            'id': '3756896',
+            'display_id': 'love-and-war-original-mix',
+            'ext': 'mp3',
+            'title': 'Wolfgang Gartner - Love & War (Original Mix)',
+        },
+    }, {
+        'url': 'https://pro.beatport.com/track/birds-original-mix/4991738',
+        'md5': 'a1fd8e8046de3950fd039304c186c05f',
+        'info_dict': {
+            'id': '4991738',
+            'display_id': 'birds-original-mix',
+            'ext': 'mp4',
+            'title': "Tos, Middle Milk, Mumblin' Johnsson - Birds (Original Mix)",
+        }
+    }]
+
+    def _real_extract(self, url):
+        """Return the info dict for the track page at *url*.
+
+        The track data is read from the ``window.Playables`` JSON object
+        embedded in the page; the entry whose ``id`` equals the numeric id
+        from the URL is used.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        track_id = mobj.group('id')
+        display_id = mobj.group('display_id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        playables = self._parse_json(
+            self._search_regex(
+                r'window\.Playables\s*=\s*({.+?});', webpage,
+                'playables info', flags=re.DOTALL),
+            track_id)
+
+        # The JSON ids are compared as integers, hence the int() conversion.
+        track = next(t for t in playables['tracks'] if t['id'] == int(track_id))
+
+        title = ', '.join(a['name'] for a in track['artists']) + ' - ' + track['name']
+        if track['mix']:
+            title += ' (' + track['mix'] + ')'
+
+        formats = []
+        for ext, info in track['preview'].items():
+            if not info['url']:
+                continue
+            fmt = {
+                'url': info['url'],
+                'ext': ext,
+                'format_id': ext,
+                'vcodec': 'none',
+            }
+            # mp4 gets the higher preference value of the two known previews.
+            if ext == 'mp3':
+                fmt['preference'] = 0
+                fmt['acodec'] = 'mp3'
+                fmt['abr'] = 96
+                fmt['asr'] = 44100
+            elif ext == 'mp4':
+                fmt['preference'] = 1
+                fmt['acodec'] = 'aac'
+                fmt['abr'] = 96
+                fmt['asr'] = 44100
+            formats.append(fmt)
+        self._sort_formats(formats)
+
+        images = []
+        for name, info in track['images'].items():
+            image_url = info.get('url')
+            # The 'dynamic' entry and entries without a URL are skipped.
+            if name == 'dynamic' or not image_url:
+                continue
+            image = {
+                'id': name,
+                'url': image_url,
+                'height': int_or_none(info.get('height')),
+                'width': int_or_none(info.get('width')),
+            }
+            images.append(image)
+
+        return {
+            # Apply the fallback before converting: compat_str(None) is the
+            # truthy string 'None', so ``compat_str(...) or track_id`` could
+            # never fall back to the id taken from the URL.
+            'id': compat_str(track.get('id') or track_id),
+            'display_id': track.get('slug') or display_id,
+            'title': title,
+            'formats': formats,
+            'thumbnails': images,
+        }
diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py
index 4e79fea8f..61bc2f744 100644
--- a/youtube_dl/extractor/beeg.py
+++ b/youtube_dl/extractor/beeg.py
@@ -1,65 +1,69 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ parse_iso8601,
+)
class BeegIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?beeg\.com/(?P<id>\d+)'
_TEST = {
'url': 'http://beeg.com/5416503',
- 'md5': '634526ae978711f6b748fe0dd6c11f57',
+ 'md5': '46c384def73b33dbc581262e5ee67cef',
'info_dict': {
'id': '5416503',
'ext': 'mp4',
'title': 'Sultry Striptease',
- 'description': 'md5:6db3c6177972822aaba18652ff59c773',
- 'categories': list, # NSFW
- 'thumbnail': 're:https?://.*\.jpg$',
+ 'description': 'md5:d22219c09da287c14bed3d6c37ce4bc2',
+ 'timestamp': 1391813355,
+ 'upload_date': '20140207',
+ 'duration': 383,
+ 'tags': list,
'age_limit': 18,
}
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
- webpage = self._download_webpage(url, video_id)
-
- quality_arr = self._search_regex(
- r'(?s)var\s+qualityArr\s*=\s*{\s*(.+?)\s*}', webpage, 'quality formats')
+ video_id = self._match_id(url)
- formats = [{
- 'url': fmt[1],
- 'format_id': fmt[0],
- 'height': int(fmt[0][:-1]),
- } for fmt in re.findall(r"'([^']+)'\s*:\s*'([^']+)'", quality_arr)]
+ video = self._download_json(
+ 'http://beeg.com/api/v1/video/%s' % video_id, video_id)
+ formats = []
+ for format_id, video_url in video.items():
+ if not video_url:
+ continue
+ height = self._search_regex(
+ r'^(\d+)[pP]$', format_id, 'height', default=None)
+ if not height:
+ continue
+ formats.append({
+ 'url': self._proto_relative_url(video_url.replace('{DATA_MARKERS}', ''), 'http:'),
+ 'format_id': format_id,
+ 'height': int(height),
+ })
self._sort_formats(formats)
- title = self._html_search_regex(
- r'<title>([^<]+)\s*-\s*beeg\.?</title>', webpage, 'title')
+ title = video['title']
+ video_id = video.get('id') or video_id
+ display_id = video.get('code')
+ description = video.get('desc')
- description = self._html_search_regex(
- r'<meta name="description" content="([^"]*)"',
- webpage, 'description', fatal=False)
- thumbnail = self._html_search_regex(
- r'\'previewer.url\'\s*:\s*"([^"]*)"',
- webpage, 'thumbnail', fatal=False)
+ timestamp = parse_iso8601(video.get('date'), ' ')
+ duration = int_or_none(video.get('duration'))
- categories_str = self._html_search_regex(
- r'<meta name="keywords" content="([^"]+)"', webpage, 'categories', fatal=False)
- categories = (
- None if categories_str is None
- else categories_str.split(','))
+ tags = [tag.strip() for tag in video['tags'].split(',')] if video.get('tags') else None
return {
'id': video_id,
+ 'display_id': display_id,
'title': title,
'description': description,
- 'thumbnail': thumbnail,
- 'categories': categories,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ 'tags': tags,
'formats': formats,
'age_limit': 18,
}
diff --git a/youtube_dl/extractor/bet.py b/youtube_dl/extractor/bet.py
index d2abd4d77..03dad4636 100644
--- a/youtube_dl/extractor/bet.py
+++ b/youtube_dl/extractor/bet.py
@@ -1,7 +1,7 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
+from ..compat import compat_urllib_parse_unquote
from ..utils import (
xpath_text,
xpath_with_ns,
@@ -16,11 +16,11 @@ class BetIE(InfoExtractor):
{
'url': 'http://www.bet.com/news/politics/2014/12/08/in-bet-exclusive-obama-talks-race-and-racism.html',
'info_dict': {
- 'id': '740ab250-bb94-4a8a-8787-fe0de7c74471',
+ 'id': 'news/national/2014/a-conversation-with-president-obama',
'display_id': 'in-bet-exclusive-obama-talks-race-and-racism',
'ext': 'flv',
- 'title': 'BET News Presents: A Conversation With President Obama',
- 'description': 'md5:5a88d8ae912c1b33e090290af7ec33c6',
+ 'title': 'A Conversation With President Obama',
+ 'description': 'md5:699d0652a350cf3e491cd15cc745b5da',
'duration': 1534,
'timestamp': 1418075340,
'upload_date': '20141208',
@@ -35,7 +35,7 @@ class BetIE(InfoExtractor):
{
'url': 'http://www.bet.com/video/news/national/2014/justice-for-ferguson-a-community-reacts.html',
'info_dict': {
- 'id': 'bcd1b1df-673a-42cf-8d01-b282db608f2d',
+ 'id': 'news/national/2014/justice-for-ferguson-a-community-reacts',
'display_id': 'justice-for-ferguson-a-community-reacts',
'ext': 'flv',
'title': 'Justice for Ferguson: A Community Reacts',
@@ -57,10 +57,13 @@ class BetIE(InfoExtractor):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
- media_url = compat_urllib_parse.unquote(self._search_regex(
+ media_url = compat_urllib_parse_unquote(self._search_regex(
[r'mediaURL\s*:\s*"([^"]+)"', r"var\s+mrssMediaUrl\s*=\s*'([^']+)'"],
webpage, 'media URL'))
+ video_id = self._search_regex(
+ r'/video/(.*)/_jcr_content/', media_url, 'video id')
+
mrss = self._download_xml(media_url, display_id)
item = mrss.find('./channel/item')
@@ -75,8 +78,6 @@ class BetIE(InfoExtractor):
description = xpath_text(
item, './description', 'description', fatal=False)
- video_id = xpath_text(item, './guid', 'video id', fatal=False)
-
timestamp = parse_iso8601(xpath_text(
item, xpath_with_ns('./dc:date', NS_MAP),
'upload date', fatal=False))
diff --git a/youtube_dl/extractor/bild.py b/youtube_dl/extractor/bild.py
index 77b562d99..1a0184861 100644
--- a/youtube_dl/extractor/bild.py
+++ b/youtube_dl/extractor/bild.py
@@ -2,7 +2,10 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import int_or_none
+from ..utils import (
+ int_or_none,
+ unescapeHTML,
+)
class BildIE(InfoExtractor):
@@ -14,26 +17,24 @@ class BildIE(InfoExtractor):
'info_dict': {
'id': '38184146',
'ext': 'mp4',
- 'title': 'BILD hat sie getestet',
- 'thumbnail': 'http://bilder.bild.de/fotos/stand-das-koennen-die-neuen-ipads-38184138/Bild/1.bild.jpg',
+ 'title': 'Das können die neuen iPads',
+ 'description': 'md5:a4058c4fa2a804ab59c00d7244bbf62f',
+ 'thumbnail': 're:^https?://.*\.jpg$',
'duration': 196,
- 'description': 'Mit dem iPad Air 2 und dem iPad Mini 3 hat Apple zwei neue Tablet-Modelle präsentiert. BILD-Reporter Sven Stein durfte die Geräte bereits testen. ',
}
}
def _real_extract(self, url):
video_id = self._match_id(url)
- xml_url = url.split(".bild.html")[0] + ",view=xml.bild.xml"
- doc = self._download_xml(xml_url, video_id)
-
- duration = int_or_none(doc.attrib.get('duration'), scale=1000)
+ video_data = self._download_json(
+ url.split('.bild.html')[0] + ',view=json.bild.html', video_id)
return {
'id': video_id,
- 'title': doc.attrib['ueberschrift'],
- 'description': doc.attrib.get('text'),
- 'url': doc.attrib['src'],
- 'thumbnail': doc.attrib.get('img'),
- 'duration': duration,
+ 'title': unescapeHTML(video_data['title']).strip(),
+ 'description': unescapeHTML(video_data.get('description')),
+ 'url': video_data['clipList'][0]['srces'][0]['src'],
+ 'thumbnail': video_data.get('poster'),
+ 'duration': int_or_none(video_data.get('durationSec')),
}
diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py
index 75d744852..6c66a1236 100644
--- a/youtube_dl/extractor/bilibili.py
+++ b/youtube_dl/extractor/bilibili.py
@@ -2,34 +2,56 @@
from __future__ import unicode_literals
import re
+import itertools
+import json
from .common import InfoExtractor
+from ..compat import (
+ compat_etree_fromstring,
+)
from ..utils import (
int_or_none,
unified_strdate,
+ ExtractorError,
)
class BiliBiliIE(InfoExtractor):
_VALID_URL = r'http://www\.bilibili\.(?:tv|com)/video/av(?P<id>[0-9]+)/'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.bilibili.tv/video/av1074402/',
'md5': '2c301e4dab317596e837c3e7633e7d86',
'info_dict': {
- 'id': '1074402',
+ 'id': '1074402_part1',
'ext': 'flv',
'title': '【金坷垃】金泡沫',
'duration': 308,
'upload_date': '20140420',
'thumbnail': 're:^https?://.+\.jpg',
},
- }
+ }, {
+ 'url': 'http://www.bilibili.com/video/av1041170/',
+ 'info_dict': {
+ 'id': '1041170',
+ 'title': '【BD1080P】刀语【诸神&异域】',
+ },
+ 'playlist_count': 9,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
+ if '(此视频不存在或被删除)' in webpage:
+ raise ExtractorError(
+ 'The video does not exist or was deleted', expected=True)
+
+ if '>你没有权限浏览! 由于版权相关问题 我们不对您所在的地区提供服务<' in webpage:
+ raise ExtractorError(
+ 'The video is not available in your region due to copyright reasons',
+ expected=True)
+
video_code = self._search_regex(
r'(?s)<div itemprop="video".*?>(.*?)</div>', webpage, 'video code')
@@ -54,19 +76,22 @@ class BiliBiliIE(InfoExtractor):
cid = self._search_regex(r'cid=(\d+)', webpage, 'cid')
- lq_doc = self._download_xml(
+ entries = []
+
+ lq_page = self._download_webpage(
'http://interface.bilibili.com/v_cdn_play?appkey=1&cid=%s' % cid,
video_id,
note='Downloading LQ video info'
)
- lq_durl = lq_doc.find('./durl')
- formats = [{
- 'format_id': 'lq',
- 'quality': 1,
- 'url': lq_durl.find('./url').text,
- 'filesize': int_or_none(
- lq_durl.find('./size'), get_attr='text'),
- }]
+ try:
+ err_info = json.loads(lq_page)
+ raise ExtractorError(
+ 'BiliBili said: ' + err_info['error_text'], expected=True)
+ except ValueError:
+ pass
+
+ lq_doc = compat_etree_fromstring(lq_page)
+ lq_durls = lq_doc.findall('./durl')
hq_doc = self._download_xml(
'http://interface.bilibili.com/playurl?appkey=1&cid=%s' % cid,
@@ -75,22 +100,45 @@ class BiliBiliIE(InfoExtractor):
fatal=False,
)
if hq_doc is not False:
- hq_durl = hq_doc.find('./durl')
- formats.append({
- 'format_id': 'hq',
- 'quality': 2,
- 'ext': 'flv',
- 'url': hq_durl.find('./url').text,
+ hq_durls = hq_doc.findall('./durl')
+ assert len(lq_durls) == len(hq_durls)
+ else:
+ hq_durls = itertools.repeat(None)
+
+ i = 1
+ for lq_durl, hq_durl in zip(lq_durls, hq_durls):
+ formats = [{
+ 'format_id': 'lq',
+ 'quality': 1,
+ 'url': lq_durl.find('./url').text,
'filesize': int_or_none(
- hq_durl.find('./size'), get_attr='text'),
+ lq_durl.find('./size'), get_attr='text'),
+ }]
+ if hq_durl is not None:
+ formats.append({
+ 'format_id': 'hq',
+ 'quality': 2,
+ 'ext': 'flv',
+ 'url': hq_durl.find('./url').text,
+ 'filesize': int_or_none(
+ hq_durl.find('./size'), get_attr='text'),
+ })
+ self._sort_formats(formats)
+
+ entries.append({
+ 'id': '%s_part%d' % (video_id, i),
+ 'title': title,
+ 'formats': formats,
+ 'duration': duration,
+ 'upload_date': upload_date,
+ 'thumbnail': thumbnail,
})
- self._sort_formats(formats)
+ i += 1
+
return {
+ '_type': 'multi_video',
+ 'entries': entries,
'id': video_id,
- 'title': title,
- 'formats': formats,
- 'duration': duration,
- 'upload_date': upload_date,
- 'thumbnail': thumbnail,
+ 'title': title
}
diff --git a/youtube_dl/extractor/blinkx.py b/youtube_dl/extractor/blinkx.py
index 3e461e715..3b8eabe8f 100644
--- a/youtube_dl/extractor/blinkx.py
+++ b/youtube_dl/extractor/blinkx.py
@@ -1,40 +1,35 @@
from __future__ import unicode_literals
import json
-import re
from .common import InfoExtractor
-from ..utils import remove_start
+from ..utils import (
+ remove_start,
+ int_or_none,
+)
class BlinkxIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://(?:www\.)blinkx\.com/#?ce/|blinkx:)(?P<id>[^?]+)'
+ _VALID_URL = r'(?:https?://(?:www\.)blinkx\.com/#?ce/|blinkx:)(?P<id>[^?]+)'
IE_NAME = 'blinkx'
_TEST = {
- 'url': 'http://www.blinkx.com/ce/8aQUy7GVFYgFzpKhT0oqsilwOGFRVXk3R1ZGWWdGenBLaFQwb3FzaWx3OGFRVXk3R1ZGWWdGenB',
- 'md5': '2e9a07364af40163a908edbf10bb2492',
+ 'url': 'http://www.blinkx.com/ce/Da0Gw3xc5ucpNduzLuDDlv4WC9PuI4fDi1-t6Y3LyfdY2SZS5Urbvn-UPJvrvbo8LTKTc67Wu2rPKSQDJyZeeORCR8bYkhs8lI7eqddznH2ofh5WEEdjYXnoRtj7ByQwt7atMErmXIeYKPsSDuMAAqJDlQZ-3Ff4HJVeH_s3Gh8oQ',
+ 'md5': '337cf7a344663ec79bf93a526a2e06c7',
'info_dict': {
- 'id': '8aQUy7GV',
+ 'id': 'Da0Gw3xc',
'ext': 'mp4',
- 'title': 'Police Car Rolls Away',
- 'uploader': 'stupidvideos.com',
- 'upload_date': '20131215',
- 'timestamp': 1387068000,
- 'description': 'A police car gently rolls away from a fight. Maybe it felt weird being around a confrontation and just had to get out of there!',
- 'duration': 14.886,
- 'thumbnails': [{
- 'width': 100,
- 'height': 76,
- 'resolution': '100x76',
- 'url': 'http://cdn.blinkx.com/stream/b/41/StupidVideos/20131215/1873969261/1873969261_tn_0.jpg',
- }],
+ 'title': 'No Daily Show for John Oliver; HBO Show Renewed - IGN News',
+ 'uploader': 'IGN News',
+ 'upload_date': '20150217',
+ 'timestamp': 1424215740,
+ 'description': 'HBO has renewed Last Week Tonight With John Oliver for two more seasons.',
+ 'duration': 47.743333,
},
}
- def _real_extract(self, rl):
- m = re.match(self._VALID_URL, rl)
- video_id = m.group('id')
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
display_id = video_id[:8]
api_url = ('https://apib4.blinkx.com/api.php?action=play_video&' +
@@ -60,18 +55,20 @@ class BlinkxIE(InfoExtractor):
elif m['type'] in ('flv', 'mp4'):
vcodec = remove_start(m['vcodec'], 'ff')
acodec = remove_start(m['acodec'], 'ff')
- tbr = (int(m['vbr']) + int(m['abr'])) // 1000
+ vbr = int_or_none(m.get('vbr') or m.get('vbitrate'), 1000)
+ abr = int_or_none(m.get('abr') or m.get('abitrate'), 1000)
+ tbr = vbr + abr if vbr and abr else None
format_id = '%s-%sk-%s' % (vcodec, tbr, m['w'])
formats.append({
'format_id': format_id,
'url': m['link'],
'vcodec': vcodec,
'acodec': acodec,
- 'abr': int(m['abr']) // 1000,
- 'vbr': int(m['vbr']) // 1000,
+ 'abr': abr,
+ 'vbr': vbr,
'tbr': tbr,
- 'width': int(m['w']),
- 'height': int(m['h']),
+ 'width': int_or_none(m.get('w')),
+ 'height': int_or_none(m.get('h')),
})
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py
index 436cc5155..35375f7b1 100644
--- a/youtube_dl/extractor/bliptv.py
+++ b/youtube_dl/extractor/bliptv.py
@@ -3,31 +3,29 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from .subtitles import SubtitlesInfoExtractor
-from ..compat import (
- compat_str,
- compat_urllib_request,
- compat_urlparse,
-)
+from ..compat import compat_urlparse
from ..utils import (
clean_html,
int_or_none,
parse_iso8601,
+ sanitized_Request,
unescapeHTML,
+ xpath_text,
+ xpath_with_ns,
)
-class BlipTVIE(SubtitlesInfoExtractor):
+class BlipTVIE(InfoExtractor):
_VALID_URL = r'https?://(?:\w+\.)?blip\.tv/(?:(?:.+-|rss/flash/)(?P<id>\d+)|((?:play/|api\.swf#)(?P<lookup_id>[\da-zA-Z+_]+)))'
_TESTS = [
{
'url': 'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352',
- 'md5': 'c6934ad0b6acf2bd920720ec888eb812',
+ 'md5': '80baf1ec5c3d2019037c1c707d676b9f',
'info_dict': {
'id': '5779306',
- 'ext': 'mov',
+ 'ext': 'm4v',
'title': 'CBR EXCLUSIVE: "Gotham City Imposters" Bats VS Jokerz Short 3',
'description': 'md5:9bc31f227219cde65e47eeec8d2dc596',
'timestamp': 1323138843,
@@ -101,8 +99,31 @@ class BlipTVIE(SubtitlesInfoExtractor):
'vcodec': 'none',
}
},
+ {
+ # missing duration
+ 'url': 'http://blip.tv/rss/flash/6700880',
+ 'info_dict': {
+ 'id': '6684191',
+ 'ext': 'm4v',
+ 'title': 'Cowboy Bebop: Gateway Shuffle Review',
+ 'description': 'md5:3acc480c0f9ae157f5fe88547ecaf3f8',
+ 'timestamp': 1386639757,
+ 'upload_date': '20131210',
+ 'uploader': 'sfdebris',
+ 'uploader_id': '706520',
+ }
+ }
]
+ @staticmethod
+ def _extract_url(webpage):
+ """Find an embedded blip.tv video in *webpage*.
+
+ Returns a blip.tv URL string, or None (implicitly) when neither
+ pattern matches.
+ """
+ # First look for a <meta> tag holding an api.blip.tv redirect link and
+ # rebuild a blip.tv URL around the numeric id captured from it.
+ mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)
+ if mobj:
+ return 'http://blip.tv/a/a-' + mobj.group(1)
+ # Otherwise accept a play/ or api.swf# URL found inside an
+ # <iframe>, <embed> or <object> tag and return it as matched.
+ mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage)
+ if mobj:
+ return mobj.group(1)
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
lookup_id = mobj.group('lookup_id')
@@ -120,35 +141,34 @@ class BlipTVIE(SubtitlesInfoExtractor):
rss = self._download_xml('http://blip.tv/rss/flash/%s' % video_id, video_id, 'Downloading video RSS')
- def blip(s):
- return '{http://blip.tv/dtd/blip/1.0}%s' % s
-
- def media(s):
- return '{http://search.yahoo.com/mrss/}%s' % s
-
- def itunes(s):
- return '{http://www.itunes.com/dtds/podcast-1.0.dtd}%s' % s
+ def _x(p):
+ return xpath_with_ns(p, {
+ 'blip': 'http://blip.tv/dtd/blip/1.0',
+ 'media': 'http://search.yahoo.com/mrss/',
+ 'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd',
+ })
item = rss.find('channel/item')
- video_id = item.find(blip('item_id')).text
- title = item.find('./title').text
- description = clean_html(compat_str(item.find(blip('puredescription')).text))
- timestamp = parse_iso8601(item.find(blip('datestamp')).text)
- uploader = item.find(blip('user')).text
- uploader_id = item.find(blip('userid')).text
- duration = int(item.find(blip('runtime')).text)
- media_thumbnail = item.find(media('thumbnail'))
- thumbnail = media_thumbnail.get('url') if media_thumbnail is not None else item.find(itunes('image')).text
- categories = [category.text for category in item.findall('category')]
+ video_id = xpath_text(item, _x('blip:item_id'), 'video id') or lookup_id
+ title = xpath_text(item, 'title', 'title', fatal=True)
+ description = clean_html(xpath_text(item, _x('blip:puredescription'), 'description'))
+ timestamp = parse_iso8601(xpath_text(item, _x('blip:datestamp'), 'timestamp'))
+ uploader = xpath_text(item, _x('blip:user'), 'uploader')
+ uploader_id = xpath_text(item, _x('blip:userid'), 'uploader id')
+ duration = int_or_none(xpath_text(item, _x('blip:runtime'), 'duration'))
+ media_thumbnail = item.find(_x('media:thumbnail'))
+ thumbnail = (media_thumbnail.get('url') if media_thumbnail is not None
+ else xpath_text(item, 'image', 'thumbnail'))
+ categories = [category.text for category in item.findall('category') if category is not None]
formats = []
- subtitles = {}
+ subtitles_urls = {}
- media_group = item.find(media('group'))
- for media_content in media_group.findall(media('content')):
+ media_group = item.find(_x('media:group'))
+ for media_content in media_group.findall(_x('media:content')):
url = media_content.get('url')
- role = media_content.get(blip('role'))
+ role = media_content.get(_x('blip:role'))
msg = self._download_webpage(
url + '?showplayer=20140425131715&referrer=http://blip.tv&mask=7&skin=flashvars&view=url',
video_id, 'Resolving URL for %s' % role)
@@ -161,25 +181,22 @@ class BlipTVIE(SubtitlesInfoExtractor):
}
lang = role.rpartition('-')[-1].strip().lower()
langcode = LANGS.get(lang, lang)
- subtitles[langcode] = url
+ subtitles_urls[langcode] = url
elif media_type.startswith('video/'):
formats.append({
'url': real_url,
'format_id': role,
'format_note': media_type,
- 'vcodec': media_content.get(blip('vcodec')) or 'none',
- 'acodec': media_content.get(blip('acodec')),
+ 'vcodec': media_content.get(_x('blip:vcodec')) or 'none',
+ 'acodec': media_content.get(_x('blip:acodec')),
'filesize': media_content.get('filesize'),
'width': int_or_none(media_content.get('width')),
'height': int_or_none(media_content.get('height')),
})
+ self._check_formats(formats, video_id)
self._sort_formats(formats)
- # subtitles
- video_subtitles = self.extract_subtitles(video_id, subtitles)
- if self._downloader.params.get('listsubtitles', False):
- self._list_available_subtitles(video_id, subtitles)
- return
+ subtitles = self.extract_subtitles(video_id, subtitles_urls)
return {
'id': video_id,
@@ -192,15 +209,22 @@ class BlipTVIE(SubtitlesInfoExtractor):
'thumbnail': thumbnail,
'categories': categories,
'formats': formats,
- 'subtitles': video_subtitles,
+ 'subtitles': subtitles,
}
- def _download_subtitle_url(self, sub_lang, url):
- # For some weird reason, blip.tv serves a video instead of subtitles
- # when we request with a common UA
- req = compat_urllib_request.Request(url)
- req.add_header('User-Agent', 'youtube-dl')
- return self._download_webpage(req, None, note=False)
+ def _get_subtitles(self, video_id, subtitles_urls):
+ """Download every subtitle track listed in *subtitles_urls*.
+
+ *subtitles_urls* maps a language code to a subtitle URL. Returns a
+ dict mapping each language code to a one-element list holding a dict
+ with the downloaded text under 'data' and 'ext' set to 'ass'.
+ *video_id* is not used here.
+ """
+ subtitles = {}
+ for lang, url in subtitles_urls.items():
+ # For some weird reason, blip.tv serves a video instead of subtitles
+ # when we request with a common UA
+ req = sanitized_Request(url)
+ req.add_header('User-Agent', 'youtube-dl')
+ subtitles[lang] = [{
+ # The extension is 'srt' but it's actually an 'ass' file
+ 'ext': 'ass',
+ 'data': self._download_webpage(req, None, note=False),
+ }]
+ return subtitles
class BlipTVUserIE(InfoExtractor):
diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py
index c51a97ce4..ebeef8f2a 100644
--- a/youtube_dl/extractor/bloomberg.py
+++ b/youtube_dl/extractor/bloomberg.py
@@ -6,32 +6,56 @@ from .common import InfoExtractor
class BloombergIE(InfoExtractor):
- _VALID_URL = r'https?://www\.bloomberg\.com/video/(?P<name>.+?)\.html'
+ _VALID_URL = r'https?://(?:www\.)?bloomberg\.com/(?:[^/]+/)*(?P<id>[^/?#]+)'
- _TEST = {
- 'url': 'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html',
+ _TESTS = [{
+ 'url': 'http://www.bloomberg.com/news/videos/b/aaeae121-5949-481e-a1ce-4562db6f5df2',
# The md5 checksum changes
'info_dict': {
'id': 'qurhIVlJSB6hzkVi229d8g',
'ext': 'flv',
'title': 'Shah\'s Presentation on Foreign-Exchange Strategies',
- 'description': 'md5:0681e0d30dcdfc6abf34594961d8ea88',
+ 'description': 'md5:a8ba0302912d03d246979735c17d2761',
},
- }
+ }, {
+ 'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.bloomberg.com/politics/videos/2015-11-25/karl-rove-on-jeb-bush-s-struggles-stopping-trump',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- name = mobj.group('name')
+ name = self._match_id(url)
webpage = self._download_webpage(url, name)
- f4m_url = self._search_regex(
- r'<source src="(https?://[^"]+\.f4m.*?)"', webpage,
- 'f4m url')
+ video_id = self._search_regex(
+ r'["\']bmmrId["\']\s*:\s*(["\'])(?P<url>.+?)\1',
+ webpage, 'id', group='url')
title = re.sub(': Video$', '', self._og_search_title(webpage))
+ embed_info = self._download_json(
+ 'http://www.bloomberg.com/api/embed?id=%s' % video_id, video_id)
+ formats = []
+ for stream in embed_info['streams']:
+ stream_url = stream.get('url')
+ if not stream_url:
+ continue
+ if stream['muxing_format'] == 'TS':
+ m3u8_formats = self._extract_m3u8_formats(
+ stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False)
+ if m3u8_formats:
+ formats.extend(m3u8_formats)
+ else:
+ f4m_formats = self._extract_f4m_formats(
+ stream_url, video_id, f4m_id='hds', fatal=False)
+ if f4m_formats:
+ formats.extend(f4m_formats)
+ self._sort_formats(formats)
+
return {
- 'id': name.split('-')[-1],
+ 'id': video_id,
'title': title,
- 'formats': self._extract_f4m_formats(f4m_url, name),
+ 'formats': formats,
'description': self._og_search_description(webpage),
'thumbnail': self._og_search_thumbnail(webpage),
}
diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py
index 45ba51732..66e394e10 100644
--- a/youtube_dl/extractor/br.py
+++ b/youtube_dl/extractor/br.py
@@ -16,27 +16,38 @@ class BRIE(InfoExtractor):
_TESTS = [
{
- 'url': 'http://www.br.de/mediathek/video/sendungen/heimatsound/heimatsound-festival-2014-trailer-100.html',
- 'md5': '93556dd2bcb2948d9259f8670c516d59',
+ 'url': 'http://www.br.de/mediathek/video/sendungen/abendschau/betriebliche-altersvorsorge-104.html',
+ 'md5': '83a0477cf0b8451027eb566d88b51106',
'info_dict': {
- 'id': '25e279aa-1ffd-40fd-9955-5325bd48a53a',
+ 'id': '48f656ef-287e-486f-be86-459122db22cc',
'ext': 'mp4',
- 'title': 'Wenn das Traditions-Theater wackelt',
- 'description': 'Heimatsound-Festival 2014: Wenn das Traditions-Theater wackelt',
- 'duration': 34,
- 'uploader': 'BR',
- 'upload_date': '20140802',
+ 'title': 'Die böse Überraschung',
+ 'description': 'Betriebliche Altersvorsorge: Die böse Überraschung',
+ 'duration': 180,
+ 'uploader': 'Reinhard Weber',
+ 'upload_date': '20150422',
}
},
{
- 'url': 'http://www.br.de/nachrichten/schaeuble-haushaltsentwurf-bundestag-100.html',
- 'md5': '3db0df1a9a9cd9fa0c70e6ea8aa8e820',
+ 'url': 'http://www.br.de/nachrichten/oberbayern/inhalt/muenchner-polizeipraesident-schreiber-gestorben-100.html',
+ 'md5': 'a44396d73ab6a68a69a568fae10705bb',
'info_dict': {
- 'id': 'c6aae3de-2cf9-43f2-957f-f17fef9afaab',
+ 'id': 'a4b83e34-123d-4b81-9f4e-c0d3121a4e05',
+ 'ext': 'mp4',
+ 'title': 'Manfred Schreiber ist tot',
+ 'description': 'Abendschau kompakt: Manfred Schreiber ist tot',
+ 'duration': 26,
+ }
+ },
+ {
+ 'url': 'http://www.br.de/radio/br-klassik/sendungen/allegro/premiere-urauffuehrung-the-land-2015-dance-festival-muenchen-100.html',
+ 'md5': '8b5b27c0b090f3b35eac4ab3f7a73d3d',
+ 'info_dict': {
+ 'id': '74c603c9-26d3-48bb-b85b-079aeed66e0b',
'ext': 'aac',
- 'title': '"Keine neuen Schulden im nächsten Jahr"',
- 'description': 'Haushaltsentwurf: "Keine neuen Schulden im nächsten Jahr"',
- 'duration': 64,
+ 'title': 'Kurzweilig und sehr bewegend',
+ 'description': '"The Land" von Peeping Tom: Kurzweilig und sehr bewegend',
+ 'duration': 296,
}
},
{
diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py
index 4bcc897c9..aa08051b1 100644
--- a/youtube_dl/extractor/breakcom.py
+++ b/youtube_dl/extractor/breakcom.py
@@ -18,6 +18,7 @@ class BreakIE(InfoExtractor):
'id': '2468056',
'ext': 'mp4',
'title': 'When Girls Act Like D-Bags',
+ 'age_limit': 13,
}
}, {
'url': 'http://www.break.com/video/ugc/baby-flex-2773063',
@@ -41,7 +42,7 @@ class BreakIE(InfoExtractor):
'tbr': media['bitRate'],
'width': media['width'],
'height': media['height'],
- } for media in info['media']]
+ } for media in info['media'] if media.get('mediaPurpose') == 'play']
if not formats:
formats.append({
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index ea0969d4d..f5ebae1e6 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -3,28 +3,34 @@ from __future__ import unicode_literals
import re
import json
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..compat import (
+ compat_etree_fromstring,
compat_parse_qs,
compat_str,
compat_urllib_parse,
compat_urllib_parse_urlparse,
- compat_urllib_request,
compat_urlparse,
+ compat_xml_parse_error,
)
from ..utils import (
determine_ext,
ExtractorError,
find_xpath_attr,
fix_xml_ampersands,
+ float_or_none,
+ js_to_json,
+ int_or_none,
+ parse_iso8601,
+ sanitized_Request,
unescapeHTML,
unsmuggle_url,
)
-class BrightcoveIE(InfoExtractor):
+class BrightcoveLegacyIE(InfoExtractor):
+ IE_NAME = 'brightcove:legacy'
_VALID_URL = r'(?:https?://.*brightcove\.com/(services|viewer).*?\?|brightcove:)(?P<query>.*)'
_FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s'
@@ -95,6 +101,7 @@ class BrightcoveIE(InfoExtractor):
'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=3550052898001&playerKey=AQ%7E%7E%2CAAABmA9XpXk%7E%2C-Kp7jNgisre1fG5OdqpAFUTcs0lP_ZoL',
'info_dict': {
'title': 'Sealife',
+ 'id': '3550319591001',
},
'playlist_mincount': 7,
},
@@ -116,7 +123,10 @@ class BrightcoveIE(InfoExtractor):
object_str = re.sub(r'(<object[^>]*)(xmlns=".*?")', r'\1', object_str)
object_str = fix_xml_ampersands(object_str)
- object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8'))
+ try:
+ object_doc = compat_etree_fromstring(object_str.encode('utf-8'))
+ except compat_xml_parse_error:
+ return
fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars')
if fv_el is not None:
@@ -152,6 +162,28 @@ class BrightcoveIE(InfoExtractor):
linkBase = find_param('linkBaseURL')
if linkBase is not None:
params['linkBaseURL'] = linkBase
+ return cls._make_brightcove_url(params)
+
+ @classmethod
+ def _build_brighcove_url_from_js(cls, object_js):
+ # The layout of JS is as follows:
+ # customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) {
+ # // build Brightcove <object /> XML
+ # }
+ m = re.search(
+ r'''(?x)customBC.\createVideo\(
+ .*? # skipping width and height
+ ["\'](?P<playerID>\d+)["\']\s*,\s* # playerID
+ ["\'](?P<playerKey>AQ[^"\']{48})[^"\']*["\']\s*,\s* # playerKey begins with AQ and is 50 characters
+ # in length, however it's appended to itself
+ # in places, so truncate
+ ["\'](?P<videoID>\d+)["\'] # @videoPlayer
+ ''', object_js)
+ if m:
+ return cls._make_brightcove_url(m.groupdict())
+
+ @classmethod
+ def _make_brightcove_url(cls, params):
data = compat_urllib_parse.urlencode(params)
return cls._FEDERATED_URL_TEMPLATE % data
@@ -168,7 +200,7 @@ class BrightcoveIE(InfoExtractor):
"""Return a list of all Brightcove URLs from the webpage """
url_m = re.search(
- r'<meta\s+property="og:video"\s+content="(https?://(?:secure|c)\.brightcove.com/[^"]+)"',
+ r'<meta\s+property=[\'"]og:video[\'"]\s+content=[\'"](https?://(?:secure|c)\.brightcove.com/[^\'"]+)[\'"]',
webpage)
if url_m:
url = unescapeHTML(url_m.group(1))
@@ -182,9 +214,14 @@ class BrightcoveIE(InfoExtractor):
(?:
[^>]+?class=[\'"][^>]*?BrightcoveExperience.*?[\'"] |
[^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/
- ).+?</object>''',
+ ).+?>\s*</object>''',
webpage)
- return [cls._build_brighcove_url(m) for m in matches]
+ if matches:
+ return list(filter(None, [cls._build_brighcove_url(m) for m in matches]))
+
+ return list(filter(None, [
+ cls._build_brighcove_url_from_js(custom_bc)
+ for custom_bc in re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)]))
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
@@ -213,7 +250,7 @@ class BrightcoveIE(InfoExtractor):
def _get_video_info(self, video_id, query_str, query, referer=None):
request_url = self._FEDERATED_URL_TEMPLATE % query_str
- req = compat_urllib_request.Request(request_url)
+ req = sanitized_Request(request_url)
linkBase = query.get('linkBaseURL')
if linkBase is not None:
referer = linkBase[0]
@@ -247,7 +284,7 @@ class BrightcoveIE(InfoExtractor):
playlist_info = json_data['videoList']
videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']]
- return self.playlist_result(videos, playlist_id=playlist_info['id'],
+ return self.playlist_result(videos, playlist_id='%s' % playlist_info['id'],
playlist_title=playlist_info['mediaCollectionDTO']['displayName'])
def _extract_video_info(self, video_info):
@@ -314,3 +351,172 @@ class BrightcoveIE(InfoExtractor):
if 'url' not in info and not info.get('formats'):
raise ExtractorError('Unable to extract video url for %s' % info['id'])
return info
+
+
+class BrightcoveNewIE(InfoExtractor):
+ IE_NAME = 'brightcove:new'
+ _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*videoId=(?P<video_id>\d+)'
+ _TESTS = [{
+ 'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001',
+ 'md5': 'c8100925723840d4b0d243f7025703be',
+ 'info_dict': {
+ 'id': '4463358922001',
+ 'ext': 'mp4',
+ 'title': 'Meet the man behind Popcorn Time',
+ 'description': 'md5:eac376a4fe366edc70279bfb681aea16',
+ 'duration': 165.768,
+ 'timestamp': 1441391203,
+ 'upload_date': '20150904',
+ 'uploader_id': '929656772001',
+ 'formats': 'mincount:22',
+ },
+ }, {
+ # with rtmp streams
+ 'url': 'http://players.brightcove.net/4036320279001/5d112ed9-283f-485f-a7f9-33f42e8bc042_default/index.html?videoId=4279049078001',
+ 'info_dict': {
+ 'id': '4279049078001',
+ 'ext': 'mp4',
+ 'title': 'Titansgrave: Chapter 0',
+ 'description': 'Titansgrave: Chapter 0',
+ 'duration': 1242.058,
+ 'timestamp': 1433556729,
+ 'upload_date': '20150606',
+ 'uploader_id': '4036320279001',
+ 'formats': 'mincount:41',
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ }]
+
+ @staticmethod
+ def _extract_urls(webpage):
+ # Reference:
+ # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe
+ # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript)
+ # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/embed-in-page.html
+
+ entries = []
+
+ # Look for iframe embeds [1]
+ for _, url in re.findall(
+ r'<iframe[^>]+src=(["\'])((?:https?:)//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage):
+ entries.append(url)
+
+ # Look for embed_in_page embeds [2]
+ for video_id, account_id, player_id, embed in re.findall(
+ # According to examples from [3] it's unclear whether video id
+ # may be optional and what to do when it is
+ r'''(?sx)
+ <video[^>]+
+ data-video-id=["\'](\d+)["\'][^>]*>.*?
+ </video>.*?
+ <script[^>]+
+ src=["\'](?:https?:)?//players\.brightcove\.net/
+ (\d+)/([\da-f-]+)_([^/]+)/index\.min\.js
+ ''', webpage):
+ entries.append(
+ 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s'
+ % (account_id, player_id, embed, video_id))
+
+ return entries
+
+ def _real_extract(self, url):
+ account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups()
+
+ webpage = self._download_webpage(
+ 'http://players.brightcove.net/%s/%s_%s/index.min.js'
+ % (account_id, player_id, embed), video_id)
+
+ policy_key = None
+
+ catalog = self._search_regex(
+ r'catalog\(({.+?})\);', webpage, 'catalog', default=None)
+ if catalog:
+ catalog = self._parse_json(
+ js_to_json(catalog), video_id, fatal=False)
+ if catalog:
+ policy_key = catalog.get('policyKey')
+
+ if not policy_key:
+ policy_key = self._search_regex(
+ r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1',
+ webpage, 'policy key', group='pk')
+
+ req = sanitized_Request(
+ 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s'
+ % (account_id, video_id),
+ headers={'Accept': 'application/json;pk=%s' % policy_key})
+ json_data = self._download_json(req, video_id)
+
+ title = json_data['name']
+
+ formats = []
+ for source in json_data.get('sources', []):
+ source_type = source.get('type')
+ src = source.get('src')
+ if source_type == 'application/x-mpegURL':
+ if not src:
+ continue
+ m3u8_formats = self._extract_m3u8_formats(
+ src, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False)
+ if m3u8_formats:
+ formats.extend(m3u8_formats)
+ else:
+ streaming_src = source.get('streaming_src')
+ stream_name, app_name = source.get('stream_name'), source.get('app_name')
+ if not src and not streaming_src and (not stream_name or not app_name):
+ continue
+ tbr = float_or_none(source.get('avg_bitrate'), 1000)
+ height = int_or_none(source.get('height'))
+ f = {
+ 'tbr': tbr,
+ 'width': int_or_none(source.get('width')),
+ 'height': height,
+ 'filesize': int_or_none(source.get('size')),
+ 'container': source.get('container'),
+ 'vcodec': source.get('codec'),
+ 'ext': source.get('container').lower(),
+ }
+
+ def build_format_id(kind):
+ format_id = kind
+ if tbr:
+ format_id += '-%dk' % int(tbr)
+ if height:
+ format_id += '-%dp' % height
+ return format_id
+
+ if src or streaming_src:
+ f.update({
+ 'url': src or streaming_src,
+ 'format_id': build_format_id('http' if src else 'http-streaming'),
+ 'preference': 2 if src else 1,
+ })
+ else:
+ f.update({
+ 'url': app_name,
+ 'play_path': stream_name,
+ 'format_id': build_format_id('rtmp'),
+ })
+ formats.append(f)
+ self._sort_formats(formats)
+
+ description = json_data.get('description')
+ thumbnail = json_data.get('thumbnail')
+ timestamp = parse_iso8601(json_data.get('published_at'))
+ duration = float_or_none(json_data.get('duration'), 1000)
+ tags = json_data.get('tags', [])
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'uploader_id': account_id,
+ 'formats': formats,
+ 'tags': tags,
+ }
diff --git a/youtube_dl/extractor/buzzfeed.py b/youtube_dl/extractor/buzzfeed.py
index a5d2af174..df503ecc0 100644
--- a/youtube_dl/extractor/buzzfeed.py
+++ b/youtube_dl/extractor/buzzfeed.py
@@ -33,6 +33,7 @@ class BuzzFeedIE(InfoExtractor):
'skip_download': True, # Got enough YouTube download tests
},
'info_dict': {
+ 'id': 'look-at-this-cute-dog-omg',
'description': 're:Munchkin the Teddy Bear is back ?!',
'title': 'You Need To Stop What You\'re Doing And Watching This Dog Walk On A Treadmill',
},
@@ -42,8 +43,8 @@ class BuzzFeedIE(InfoExtractor):
'ext': 'mp4',
'upload_date': '20141124',
'uploader_id': 'CindysMunchkin',
- 'description': 're:© 2014 Munchkin the Shih Tzu',
- 'uploader': 'Munchkin the Shih Tzu',
+ 'description': 're:© 2014 Munchkin the',
+ 'uploader': 're:^Munchkin the',
'title': 're:Munchkin the Teddy Bear gets her exercise',
},
}]
diff --git a/youtube_dl/extractor/byutv.py b/youtube_dl/extractor/byutv.py
index 6252be05b..3b2de517e 100644
--- a/youtube_dl/extractor/byutv.py
+++ b/youtube_dl/extractor/byutv.py
@@ -16,7 +16,7 @@ class BYUtvIE(InfoExtractor):
'ext': 'mp4',
'description': 'md5:5438d33774b6bdc662f9485a340401cc',
'title': 'Season 5 Episode 5',
- 'thumbnail': 're:^https?://.*promo.*'
+ 'thumbnail': 're:^https?://.*\.jpg$'
},
'params': {
'skip_download': True,
diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py
index c4fefefe4..f6a1ff381 100644
--- a/youtube_dl/extractor/canalc2.py
+++ b/youtube_dl/extractor/canalc2.py
@@ -4,38 +4,53 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..utils import parse_duration
class Canalc2IE(InfoExtractor):
IE_NAME = 'canalc2.tv'
- _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?.*?idVideo=(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?canalc2\.tv/video/(?P<id>\d+)'
_TEST = {
- 'url': 'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui',
+ 'url': 'http://www.canalc2.tv/video/12163',
'md5': '060158428b650f896c542dfbb3d6487f',
'info_dict': {
'id': '12163',
- 'ext': 'mp4',
- 'title': 'Terrasses du Numérique'
+ 'ext': 'flv',
+ 'title': 'Terrasses du Numérique',
+ 'duration': 122,
+ },
+ 'params': {
+ 'skip_download': True, # Requires rtmpdump
}
}
def _real_extract(self, url):
- video_id = re.match(self._VALID_URL, url).group('id')
- # We need to set the voir field for getting the file name
- url = 'http://www.canalc2.tv/video.asp?idVideo=%s&voir=oui' % video_id
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- file_name = self._search_regex(
- r"so\.addVariable\('file','(.*?)'\);",
- webpage, 'file name')
- video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name
+ video_url = self._search_regex(
+ r'jwplayer\((["\'])Player\1\)\.setup\({[^}]*file\s*:\s*(["\'])(?P<file>.+?)\2',
+ webpage, 'video_url', group='file')
+ formats = [{'url': video_url}]
+ if video_url.startswith('rtmp://'):
+ rtmp = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+/))(?P<play_path>mp4:.+)$', video_url)
+ formats[0].update({
+ 'url': rtmp.group('url'),
+ 'ext': 'flv',
+ 'app': rtmp.group('app'),
+ 'play_path': rtmp.group('play_path'),
+ 'page_url': url,
+ })
title = self._html_search_regex(
- r'class="evenement8">(.*?)</a>', webpage, 'title')
+ r'(?s)class="[^"]*col_description[^"]*">.*?<h3>(.*?)</h3>', webpage, 'title')
+ duration = parse_duration(self._search_regex(
+ r'id=["\']video_duree["\'][^>]*>([^<]+)',
+ webpage, 'duration', fatal=False))
return {
'id': video_id,
- 'ext': 'mp4',
- 'url': video_url,
'title': title,
+ 'duration': duration,
+ 'formats': formats,
}
diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py
index 1b14471e5..004372f8d 100644
--- a/youtube_dl/extractor/canalplus.py
+++ b/youtube_dl/extractor/canalplus.py
@@ -25,14 +25,14 @@ class CanalplusIE(InfoExtractor):
}
_TESTS = [{
- 'url': 'http://www.canalplus.fr/c-infos-documentaires/pid1830-c-zapping.html?vid=922470',
- 'md5': '3db39fb48b9685438ecf33a1078023e4',
+ 'url': 'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1263092',
+ 'md5': 'b3481d7ca972f61e37420798d0a9d934',
'info_dict': {
- 'id': '922470',
+ 'id': '1263092',
'ext': 'flv',
- 'title': 'Zapping - 26/08/13',
- 'description': 'Le meilleur de toutes les chaînes, tous les jours.\nEmission du 26 août 2013',
- 'upload_date': '20130826',
+ 'title': 'Le Zapping - 13/05/15',
+ 'description': 'md5:09738c0d06be4b5d06a0940edb0da73f',
+ 'upload_date': '20150513',
},
}, {
'url': 'http://www.piwiplus.fr/videos-piwi/pid1405-le-labyrinthe-boing-super-ranger.html?vid=1108190',
@@ -56,7 +56,7 @@ class CanalplusIE(InfoExtractor):
'skip': 'videos get deleted after a while',
}, {
'url': 'http://www.itele.fr/france/video/aubervilliers-un-lycee-en-colere-111559',
- 'md5': '65aa83ad62fe107ce29e564bb8712580',
+ 'md5': 'f3a46edcdf28006598ffaf5b30e6a2d4',
'info_dict': {
'id': '1213714',
'ext': 'flv',
@@ -78,7 +78,8 @@ class CanalplusIE(InfoExtractor):
if video_id is None:
webpage = self._download_webpage(url, display_id)
video_id = self._search_regex(
- r'<canal:player[^>]+?videoId="(\d+)"', webpage, 'video id')
+ [r'<canal:player[^>]+?videoId=(["\'])(?P<id>\d+)', r'id=["\']canal_video_player(?P<id>\d+)'],
+ webpage, 'video id', group='id')
info_url = self._VIDEO_INFO_TEMPLATE % (site_id, video_id)
doc = self._download_xml(info_url, video_id, 'Downloading video XML')
@@ -106,15 +107,11 @@ class CanalplusIE(InfoExtractor):
continue
format_id = fmt.tag
if format_id == 'HLS':
- hls_formats = self._extract_m3u8_formats(format_url, video_id, 'flv')
- for fmt in hls_formats:
- fmt['preference'] = preference(format_id)
- formats.extend(hls_formats)
+ formats.extend(self._extract_m3u8_formats(
+ format_url, video_id, 'mp4', preference=preference(format_id)))
elif format_id == 'HDS':
- hds_formats = self._extract_f4m_formats(format_url + '?hdcore=2.11.3', video_id)
- for fmt in hds_formats:
- fmt['preference'] = preference(format_id)
- formats.extend(hds_formats)
+ formats.extend(self._extract_f4m_formats(
+ format_url + '?hdcore=2.11.3', video_id, preference=preference(format_id)))
else:
formats.append({
'url': format_url,
diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py
index e43756ec6..40d07ab18 100644
--- a/youtube_dl/extractor/cbs.py
+++ b/youtube_dl/extractor/cbs.py
@@ -1,17 +1,20 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
+from ..utils import (
+ sanitized_Request,
+ smuggle_url,
+)
class CBSIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?cbs\.com/shows/[^/]+/(?:video|artist)/(?P<id>[^/]+)/.*'
+ _VALID_URL = r'https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/(?:video|artist)|colbertlateshow\.com/(?:video|podcasts))/[^/]+/(?P<id>[^/]+)'
_TESTS = [{
'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/',
'info_dict': {
'id': '4JUVEwq3wUT7',
+ 'display_id': 'connect-chat-feat-garth-brooks',
'ext': 'flv',
'title': 'Connect Chat feat. Garth Brooks',
'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!',
@@ -26,6 +29,7 @@ class CBSIE(InfoExtractor):
'url': 'http://www.cbs.com/shows/liveonletterman/artist/221752/st-vincent/',
'info_dict': {
'id': 'WWF_5KqY3PK1',
+ 'display_id': 'st-vincent',
'ext': 'flv',
'title': 'Live on Letterman - St. Vincent',
'description': 'Live On Letterman: St. Vincent in concert from New York\'s Ed Sullivan Theater on Tuesday, July 16, 2014.',
@@ -36,13 +40,29 @@ class CBSIE(InfoExtractor):
'skip_download': True,
},
'_skip': 'Blocked outside the US',
+ }, {
+ 'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- webpage = self._download_webpage(url, video_id)
+ display_id = self._match_id(url)
+ request = sanitized_Request(url)
+ # Android UA is served with higher quality (720p) streams (see
+ # https://github.com/rg3/youtube-dl/issues/7490)
+ request.add_header('User-Agent', 'Mozilla/5.0 (Linux; Android 4.4; Nexus 5)')
+ webpage = self._download_webpage(request, display_id)
real_id = self._search_regex(
- r"video\.settings\.pid\s*=\s*'([^']+)';",
+ [r"video\.settings\.pid\s*=\s*'([^']+)';", r"cbsplayer\.pid\s*=\s*'([^']+)';"],
webpage, 'real video ID')
- return self.url_result('theplatform:%s' % real_id)
+ return {
+ '_type': 'url_transparent',
+ 'ie_key': 'ThePlatform',
+ 'url': smuggle_url(
+ 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true&manifest=m3u' % real_id,
+ {'force_smil_url': True}),
+ 'display_id': display_id,
+ }
diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py
index 7e47960ab..f9a64a0a2 100644
--- a/youtube_dl/extractor/cbsnews.py
+++ b/youtube_dl/extractor/cbsnews.py
@@ -32,7 +32,7 @@ class CBSNewsIE(InfoExtractor):
'id': 'fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack',
'ext': 'flv',
'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack',
- 'thumbnail': 'http://cbsnews2.cbsistatic.com/hub/i/r/2014/04/04/0c9fbc66-576b-41ca-8069-02d122060dd2/thumbnail/140x90/6dad7a502f88875ceac38202984b6d58/en-0404-werner-replace-640x360.jpg',
+ 'thumbnail': 're:^https?://.*\.jpg$',
'duration': 205,
},
'params': {
@@ -67,9 +67,12 @@ class CBSNewsIE(InfoExtractor):
'format_id': format_id,
}
if uri.startswith('rtmp'):
+ play_path = re.sub(
+ r'{slistFilePath}', '',
+ uri.split('<break>')[-1].split('{break}')[-1])
fmt.update({
'app': 'ondemand?auth=cbs',
- 'play_path': 'mp4:' + uri.split('<break>')[-1],
+ 'play_path': 'mp4:' + play_path,
'player_url': 'http://www.cbsnews.com/[[IMPORT]]/vidtech.cbsinteractive.com/player/3_3_0/CBSI_PLAYER_HD.swf',
'page_url': 'http://www.cbsnews.com',
'ext': 'flv',
diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py
new file mode 100644
index 000000000..ae47e74cc
--- /dev/null
+++ b/youtube_dl/extractor/cbssports.py
@@ -0,0 +1,30 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class CBSSportsIE(InfoExtractor):
+ _VALID_URL = r'http://www\.cbssports\.com/video/player/(?P<section>[^/]+)/(?P<id>[^/]+)'
+
+ _TEST = {
+ 'url': 'http://www.cbssports.com/video/player/tennis/318462531970/0/us-open-flashbacks-1990s',
+ 'info_dict': {
+ 'id': '_d5_GbO8p1sT',
+ 'ext': 'flv',
+ 'title': 'US Open flashbacks: 1990s',
+ 'description': 'Bill Macatee relives the best moments in US Open history from the 1990s.',
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ section = mobj.group('section')
+ video_id = mobj.group('id')
+ all_videos = self._download_json(
+ 'http://www.cbssports.com/data/video/player/getVideos/%s?as=json' % section,
+ video_id)
+ # The json file contains the info of all the videos in the section
+ video_info = next(v for v in all_videos if v['pcid'] == video_id)
+ return self.url_result('theplatform:%s' % video_info['pid'], 'ThePlatform')
diff --git a/youtube_dl/extractor/ccc.py b/youtube_dl/extractor/ccc.py
index 2a5d4be18..6924eac70 100644
--- a/youtube_dl/extractor/ccc.py
+++ b/youtube_dl/extractor/ccc.py
@@ -16,7 +16,7 @@ class CCCIE(InfoExtractor):
_TEST = {
'url': 'http://media.ccc.de/browse/congress/2013/30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor.html#video',
- 'md5': '205a365d0d57c0b1e43a12c9ffe8f9be',
+ 'md5': '3a1eda8f3a29515d27f5adb967d7e740',
'info_dict': {
'id': '20131228183',
'ext': 'mp4',
@@ -51,7 +51,7 @@ class CCCIE(InfoExtractor):
matches = re.finditer(r'''(?xs)
<(?:span|div)\s+class='label\s+filetype'>(?P<format>.*?)</(?:span|div)>\s*
- <a\s+href='(?P<http_url>[^']+)'>\s*
+ <a\s+download\s+href='(?P<http_url>[^']+)'>\s*
(?:
.*?
<a\s+href='(?P<torrent_url>[^']+\.torrent)'
diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py
index f70e090bb..6f7b2a70d 100644
--- a/youtube_dl/extractor/ceskatelevize.py
+++ b/youtube_dl/extractor/ceskatelevize.py
@@ -3,68 +3,95 @@ from __future__ import unicode_literals
import re
-from .subtitles import SubtitlesInfoExtractor
+from .common import InfoExtractor
from ..compat import (
- compat_urllib_request,
compat_urllib_parse,
+ compat_urllib_parse_unquote,
compat_urllib_parse_urlparse,
)
from ..utils import (
ExtractorError,
float_or_none,
+ sanitized_Request,
)
-class CeskaTelevizeIE(SubtitlesInfoExtractor):
- _VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(.+/)?(?P<id>[^?#]+)'
-
- _TESTS = [
- {
- 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220',
+class CeskaTelevizeIE(InfoExtractor):
+ _VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(?:[^/]+/)*(?P<id>[^/#?]+)/*(?:[#?].*)?$'
+ _TESTS = [{
+ 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220',
+ 'info_dict': {
+ 'id': '61924494876951776',
+ 'ext': 'mp4',
+ 'title': 'Hyde Park Civilizace',
+ 'description': 'md5:fe93f6eda372d150759d11644ebbfb4a',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 3350,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/bonus/14716-zpevacka-z-duparny-bobina',
+ 'info_dict': {
+ 'id': '61924494876844374',
+ 'ext': 'mp4',
+ 'title': 'První republika: Zpěvačka z Dupárny Bobina',
+ 'description': 'Sága mapující atmosféru první republiky od r. 1918 do r. 1945.',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 88.4,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ # video with 18+ caution trailer
+ 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/',
+ 'info_dict': {
+ 'id': '215562210900007-bogotart',
+ 'title': 'Queer: Bogotart',
+ 'description': 'Alternativní průvodce současným queer světem',
+ },
+ 'playlist': [{
'info_dict': {
- 'id': '214411058091220',
+ 'id': '61924494876844842',
'ext': 'mp4',
- 'title': 'Hyde Park Civilizace',
- 'description': 'Věda a současná civilizace. Interaktivní pořad - prostor pro vaše otázky a komentáře',
- 'thumbnail': 're:^https?://.*\.jpg',
- 'duration': 3350,
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
+ 'title': 'Queer: Bogotart (Varování 18+)',
+ 'duration': 10.2,
},
- },
- {
- 'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/bonus/14716-zpevacka-z-duparny-bobina',
+ }, {
'info_dict': {
- 'id': '14716',
+ 'id': '61924494877068022',
'ext': 'mp4',
- 'title': 'První republika: Zpěvačka z Dupárny Bobina',
- 'description': 'Sága mapující atmosféru první republiky od r. 1918 do r. 1945.',
+ 'title': 'Queer: Bogotart (Queer)',
'thumbnail': 're:^https?://.*\.jpg',
- 'duration': 88.4,
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
+ 'duration': 1558.3,
},
+ }],
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
},
- ]
+ }]
def _real_extract(self, url):
url = url.replace('/porady/', '/ivysilani/').replace('/video/', '')
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ playlist_id = mobj.group('id')
- webpage = self._download_webpage(url, video_id)
+ webpage = self._download_webpage(url, playlist_id)
NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.'
if '%s</p>' % NOT_AVAILABLE_STRING in webpage:
raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)
- typ = self._html_search_regex(r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', webpage, 'type')
- episode_id = self._html_search_regex(r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', webpage, 'episode_id')
+ typ = self._html_search_regex(
+ r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', webpage, 'type')
+ episode_id = self._html_search_regex(
+ r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', webpage, 'episode_id')
data = {
'playlist[0][type]': typ,
@@ -73,7 +100,7 @@ class CeskaTelevizeIE(SubtitlesInfoExtractor):
'requestSource': 'iVysilani',
}
- req = compat_urllib_request.Request(
+ req = sanitized_Request(
'http://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist',
data=compat_urllib_parse.urlencode(data))
@@ -82,54 +109,67 @@ class CeskaTelevizeIE(SubtitlesInfoExtractor):
req.add_header('X-Requested-With', 'XMLHttpRequest')
req.add_header('Referer', url)
- playlistpage = self._download_json(req, video_id)
+ playlistpage = self._download_json(req, playlist_id)
playlist_url = playlistpage['url']
if playlist_url == 'error_region':
raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)
- req = compat_urllib_request.Request(compat_urllib_parse.unquote(playlist_url))
+ req = sanitized_Request(compat_urllib_parse_unquote(playlist_url))
req.add_header('Referer', url)
- playlist = self._download_json(req, video_id)
-
- item = playlist['playlist'][0]
- formats = []
- for format_id, stream_url in item['streamUrls'].items():
- formats.extend(self._extract_m3u8_formats(stream_url, video_id, 'mp4'))
- self._sort_formats(formats)
-
- title = self._og_search_title(webpage)
- description = self._og_search_description(webpage)
- duration = float_or_none(item.get('duration'))
- thumbnail = item.get('previewImageUrl')
-
- subtitles = {}
- subs = item.get('subtitles')
- if subs:
- subtitles['cs'] = subs[0]['url']
-
- if self._downloader.params.get('listsubtitles', False):
- self._list_available_subtitles(video_id, subtitles)
- return
-
- subtitles = self._fix_subtitles(self.extract_subtitles(video_id, subtitles))
-
+ playlist_title = self._og_search_title(webpage)
+ playlist_description = self._og_search_description(webpage)
+
+ playlist = self._download_json(req, playlist_id)['playlist']
+ playlist_len = len(playlist)
+
+ entries = []
+ for item in playlist:
+ formats = []
+ for format_id, stream_url in item['streamUrls'].items():
+ formats.extend(self._extract_m3u8_formats(
+ stream_url, playlist_id, 'mp4', entry_protocol='m3u8_native'))
+ self._sort_formats(formats)
+
+ item_id = item.get('id') or item['assetId']
+ title = item['title']
+
+ duration = float_or_none(item.get('duration'))
+ thumbnail = item.get('previewImageUrl')
+
+ subtitles = {}
+ if item.get('type') == 'VOD':
+ subs = item.get('subtitles')
+ if subs:
+ subtitles = self.extract_subtitles(episode_id, subs)
+
+ entries.append({
+ 'id': item_id,
+ 'title': playlist_title if playlist_len == 1 else '%s (%s)' % (playlist_title, title),
+ 'description': playlist_description if playlist_len == 1 else None,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ })
+
+ return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
+
+ def _get_subtitles(self, episode_id, subs):
+ original_subtitles = self._download_webpage(
+ subs[0]['url'], episode_id, 'Downloading subtitles')
+ srt_subs = self._fix_subtitles(original_subtitles)
return {
- 'id': episode_id,
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'duration': duration,
- 'formats': formats,
- 'subtitles': subtitles,
+ 'cs': [{
+ 'ext': 'srt',
+ 'data': srt_subs,
+ }]
}
@staticmethod
def _fix_subtitles(subtitles):
""" Convert millisecond-based subtitles to SRT """
- if subtitles is None:
- return subtitles # subtitles not requested
def _msectotimecode(msec):
""" Helper utility to convert milliseconds to timecode """
@@ -149,7 +189,4 @@ class CeskaTelevizeIE(SubtitlesInfoExtractor):
else:
yield line
- fixed_subtitles = {}
- for k, v in subtitles.items():
- fixed_subtitles[k] = "\r\n".join(_fix_subtitle(v))
- return fixed_subtitles
+ return "\r\n".join(_fix_subtitle(subtitles))
diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py
index 3dfc24f5b..c74553dcf 100644
--- a/youtube_dl/extractor/channel9.py
+++ b/youtube_dl/extractor/channel9.py
@@ -3,7 +3,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (
+ ExtractorError,
+ parse_filesize,
+ qualities,
+)
class Channel9IE(InfoExtractor):
@@ -28,7 +32,7 @@ class Channel9IE(InfoExtractor):
'title': 'Developer Kick-Off Session: Stuff We Love',
'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
'duration': 4576,
- 'thumbnail': 'http://video.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg',
+ 'thumbnail': 're:http://.*\.jpg',
'session_code': 'KOS002',
'session_day': 'Day 1',
'session_room': 'Arena 1A',
@@ -44,31 +48,29 @@ class Channel9IE(InfoExtractor):
'title': 'Self-service BI with Power BI - nuclear testing',
'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
'duration': 1540,
- 'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg',
+ 'thumbnail': 're:http://.*\.jpg',
'authors': ['Mike Wilmot'],
},
+ },
+ {
+ # low quality mp4 is best
+ 'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
+ 'info_dict': {
+ 'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
+ 'ext': 'mp4',
+ 'title': 'Ranges for the Standard Library',
+ 'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d',
+ 'duration': 5646,
+ 'thumbnail': 're:http://.*\.jpg',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}
]
_RSS_URL = 'http://channel9.msdn.com/%s/RSS'
- # Sorted by quality
- _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4']
-
- def _restore_bytes(self, formatted_size):
- if not formatted_size:
- return 0
- m = re.match(r'^(?P<size>\d+(?:\.\d+)?)\s+(?P<units>[a-zA-Z]+)', formatted_size)
- if not m:
- return 0
- units = m.group('units')
- try:
- exponent = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units.upper())
- except ValueError:
- return 0
- size = float(m.group('size'))
- return int(size * (1024 ** exponent))
-
def _formats_from_html(self, html):
FORMAT_REGEX = r'''
(?x)
@@ -78,16 +80,20 @@ class Channel9IE(InfoExtractor):
<h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
</div>)? # File size part may be missing
'''
- # Extract known formats
+ quality = qualities((
+ 'MP3', 'MP4',
+ 'Low Quality WMV', 'Low Quality MP4',
+ 'Mid Quality WMV', 'Mid Quality MP4',
+ 'High Quality WMV', 'High Quality MP4'))
formats = [{
'url': x.group('url'),
'format_id': x.group('quality'),
'format_note': x.group('note'),
'format': '%s (%s)' % (x.group('quality'), x.group('note')),
- 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate
- 'preference': self._known_formats.index(x.group('quality')),
+ 'filesize_approx': parse_filesize(x.group('filesize')),
+ 'quality': quality(x.group('quality')),
'vcodec': 'none' if x.group('note') == 'Audio only' else None,
- } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats]
+ } for x in list(re.finditer(FORMAT_REGEX, html))]
self._sort_formats(formats)
@@ -158,7 +164,7 @@ class Channel9IE(InfoExtractor):
def _extract_session_day(self, html):
m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
- return m.group('day') if m is not None else None
+ return m.group('day').strip() if m is not None else None
def _extract_session_room(self, html):
m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
@@ -224,12 +230,12 @@ class Channel9IE(InfoExtractor):
if contents is None:
return contents
- authors = self._extract_authors(html)
+ if len(contents) > 1:
+ raise ExtractorError('Got more than one entry')
+ result = contents[0]
+ result['authors'] = self._extract_authors(html)
- for content in contents:
- content['authors'] = authors
-
- return contents
+ return result
def _extract_session(self, html, content_path):
contents = self._extract_content(html, content_path)
diff --git a/youtube_dl/extractor/chaturbate.py b/youtube_dl/extractor/chaturbate.py
new file mode 100644
index 000000000..0b67ba67d
--- /dev/null
+++ b/youtube_dl/extractor/chaturbate.py
@@ -0,0 +1,50 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class ChaturbateIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:[^/]+\.)?chaturbate\.com/(?P<id>[^/?#]+)'
+ _TESTS = [{
+ 'url': 'https://www.chaturbate.com/siswet19/',
+ 'info_dict': {
+ 'id': 'siswet19',
+ 'ext': 'mp4',
+ 'title': 're:^siswet19 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'age_limit': 18,
+ 'is_live': True,
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'https://en.chaturbate.com/siswet19/',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ m3u8_url = self._search_regex(
+ r'src=(["\'])(?P<url>http.+?\.m3u8.*?)\1', webpage,
+ 'playlist', default=None, group='url')
+
+ if not m3u8_url:
+ error = self._search_regex(
+ r'<span[^>]+class=(["\'])desc_span\1[^>]*>(?P<error>[^<]+)</span>',
+ webpage, 'error', group='error')
+ raise ExtractorError(error, expected=True)
+
+ formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')
+
+ return {
+ 'id': video_id,
+ 'title': self._live_title(video_id),
+ 'thumbnail': 'https://cdn-s.highwebmedia.com/uHK3McUtGCG3SMFcd4ZJsRv8/roomimage/%s.jpg' % video_id,
+ 'age_limit': self._rta_search(webpage),
+ 'is_live': True,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/chilloutzone.py b/youtube_dl/extractor/chilloutzone.py
index c922f6959..0206d96db 100644
--- a/youtube_dl/extractor/chilloutzone.py
+++ b/youtube_dl/extractor/chilloutzone.py
@@ -57,7 +57,7 @@ class ChilloutzoneIE(InfoExtractor):
base64_video_info = self._html_search_regex(
r'var cozVidData = "(.+?)";', webpage, 'video data')
- decoded_video_info = base64.b64decode(base64_video_info).decode("utf-8")
+ decoded_video_info = base64.b64decode(base64_video_info.encode('utf-8')).decode('utf-8')
video_info_dict = json.loads(decoded_video_info)
# get video information from dict
diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py
new file mode 100644
index 000000000..b1eeaf101
--- /dev/null
+++ b/youtube_dl/extractor/chirbit.py
@@ -0,0 +1,84 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_duration,
+ int_or_none,
+)
+
+
+class ChirbitIE(InfoExtractor):
+ IE_NAME = 'chirbit'
+ _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?:(?:wp|pl)/|fb_chirbit_player\.swf\?key=)?(?P<id>[\da-zA-Z]+)'
+ _TESTS = [{
+ 'url': 'http://chirb.it/PrIPv5',
+ 'md5': '9847b0dad6ac3e074568bf2cfb197de8',
+ 'info_dict': {
+ 'id': 'PrIPv5',
+ 'ext': 'mp3',
+ 'title': 'Фасадстрой',
+ 'duration': 52,
+ 'view_count': int,
+ 'comment_count': int,
+ }
+ }, {
+ 'url': 'https://chirb.it/fb_chirbit_player.swf?key=PrIPv5',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ audio_id = self._match_id(url)
+
+ webpage = self._download_webpage(
+ 'http://chirb.it/%s' % audio_id, audio_id)
+
+ audio_url = self._search_regex(
+ r'"setFile"\s*,\s*"([^"]+)"', webpage, 'audio url')
+
+ title = self._search_regex(
+ r'itemprop="name">([^<]+)', webpage, 'title')
+ duration = parse_duration(self._html_search_meta(
+ 'duration', webpage, 'duration', fatal=False))
+ view_count = int_or_none(self._search_regex(
+ r'itemprop="playCount"\s*>(\d+)', webpage,
+ 'listen count', fatal=False))
+ comment_count = int_or_none(self._search_regex(
+ r'>(\d+) Comments?:', webpage,
+ 'comment count', fatal=False))
+
+ return {
+ 'id': audio_id,
+ 'url': audio_url,
+ 'title': title,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'comment_count': comment_count,
+ }
+
+
+class ChirbitProfileIE(InfoExtractor):
+ IE_NAME = 'chirbit:profile'
+ _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?:rss/)?(?P<id>[^/]+)'
+ _TEST = {
+ 'url': 'http://chirbit.com/ScarletBeauty',
+ 'info_dict': {
+ 'id': 'ScarletBeauty',
+ 'title': 'Chirbits by ScarletBeauty',
+ },
+ 'playlist_mincount': 3,
+ }
+
+ def _real_extract(self, url):
+ profile_id = self._match_id(url)
+
+ rss = self._download_xml(
+ 'http://chirbit.com/rss/%s' % profile_id, profile_id)
+
+ entries = [
+ self.url_result(audio_url.text, 'Chirbit')
+ for audio_url in rss.findall('./channel/item/link')]
+
+ title = rss.find('./channel/title').text
+
+ return self.playlist_result(entries, profile_id, title)
diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py
new file mode 100644
index 000000000..fd1770dac
--- /dev/null
+++ b/youtube_dl/extractor/cinemassacre.py
@@ -0,0 +1,111 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+from .bliptv import BlipTVIE
+from .screenwavemedia import ScreenwaveMediaIE
+
+
+class CinemassacreIE(InfoExtractor):
+ _VALID_URL = 'https?://(?:www\.)?cinemassacre\.com/(?P<date_y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/(?P<display_id>[^?#/]+)'
+ _TESTS = [
+ {
+ 'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/',
+ 'md5': 'fde81fbafaee331785f58cd6c0d46190',
+ 'info_dict': {
+ 'id': 'Cinemassacre-19911',
+ 'ext': 'mp4',
+ 'upload_date': '20121110',
+ 'title': '“Angry Video Game Nerd: The Movie” – Trailer',
+ 'description': 'md5:fb87405fcb42a331742a0dce2708560b',
+ },
+ },
+ {
+ 'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940',
+ 'md5': 'd72f10cd39eac4215048f62ab477a511',
+ 'info_dict': {
+ 'id': 'Cinemassacre-521be8ef82b16',
+ 'ext': 'mp4',
+ 'upload_date': '20131002',
+ 'title': 'The Mummy’s Hand (1940)',
+ },
+ },
+ {
+ # blip.tv embedded video
+ 'url': 'http://cinemassacre.com/2006/12/07/chronologically-confused-about-bad-movie-and-video-game-sequel-titles/',
+ 'md5': 'ca9b3c8dd5a66f9375daeb5135f5a3de',
+ 'info_dict': {
+ 'id': '4065369',
+ 'ext': 'flv',
+ 'title': 'AVGN: Chronologically Confused about Bad Movie and Video Game Sequel Titles',
+ 'upload_date': '20061207',
+ 'uploader': 'cinemassacre',
+ 'uploader_id': '250778',
+ 'timestamp': 1283233867,
+ 'description': 'md5:0a108c78d130676b207d0f6d029ecffd',
+ }
+ },
+ {
+ # Youtube embedded video
+ 'url': 'http://cinemassacre.com/2006/09/01/mckids/',
+ 'md5': '6eb30961fa795fedc750eac4881ad2e1',
+ 'info_dict': {
+ 'id': 'FnxsNhuikpo',
+ 'ext': 'mp4',
+ 'upload_date': '20060901',
+ 'uploader': 'Cinemassacre Extras',
+ 'description': 'md5:de9b751efa9e45fbaafd9c8a1123ed53',
+ 'uploader_id': 'Cinemassacre',
+ 'title': 'AVGN: McKids',
+ }
+ },
+ {
+ 'url': 'http://cinemassacre.com/2015/05/25/mario-kart-64-nintendo-64-james-mike-mondays/',
+ 'md5': '1376908e49572389e7b06251a53cdd08',
+ 'info_dict': {
+ 'id': 'Cinemassacre-555779690c440',
+ 'ext': 'mp4',
+ 'description': 'Let’s Play Mario Kart 64 !! Mario Kart 64 is a classic go-kart racing game released for the Nintendo 64 (N64). Today James & Mike do 4 player Battle Mode with Kyle and Bootsy!',
+ 'title': 'Mario Kart 64 (Nintendo 64) James & Mike Mondays',
+ 'upload_date': '20150525',
+ }
+ }
+ ]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ display_id = mobj.group('display_id')
+ video_date = mobj.group('date_y') + mobj.group('date_m') + mobj.group('date_d')
+
+ webpage = self._download_webpage(url, display_id)
+
+ playerdata_url = self._search_regex(
+ [
+ ScreenwaveMediaIE.EMBED_PATTERN,
+ r'<iframe[^>]+src="(?P<url>(?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"',
+ ],
+ webpage, 'player data URL', default=None, group='url')
+ if not playerdata_url:
+ playerdata_url = BlipTVIE._extract_url(webpage)
+ if not playerdata_url:
+ raise ExtractorError('Unable to find player data')
+
+ video_title = self._html_search_regex(
+ r'<title>(?P<title>.+?)\|', webpage, 'title')
+ video_description = self._html_search_regex(
+ r'<div class="entry-content">(?P<description>.+?)</div>',
+ webpage, 'description', flags=re.DOTALL, fatal=False)
+ video_thumbnail = self._og_search_thumbnail(webpage)
+
+ return {
+ '_type': 'url_transparent',
+ 'display_id': display_id,
+ 'title': video_title,
+ 'description': video_description,
+ 'upload_date': video_date,
+ 'thumbnail': video_thumbnail,
+ 'url': playerdata_url,
+ }
diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py
index a5c3cb7c6..7af903571 100644
--- a/youtube_dl/extractor/clipfish.py
+++ b/youtube_dl/extractor/clipfish.py
@@ -1,53 +1,68 @@
from __future__ import unicode_literals
import re
-import time
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
- ExtractorError,
- parse_duration,
+ determine_ext,
+ int_or_none,
+ js_to_json,
+ parse_iso8601,
+ remove_end,
)
class ClipfishIE(InfoExtractor):
- IE_NAME = 'clipfish'
-
- _VALID_URL = r'^https?://(?:www\.)?clipfish\.de/.*?/video/(?P<id>[0-9]+)/'
+ _VALID_URL = r'https?://(?:www\.)?clipfish\.de/(?:[^/]+/)+video/(?P<id>[0-9]+)'
_TEST = {
'url': 'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/',
- 'md5': '2521cd644e862936cf2e698206e47385',
+ 'md5': '79bc922f3e8a9097b3d68a93780fd475',
'info_dict': {
'id': '3966754',
'ext': 'mp4',
'title': 'FIFA 14 - E3 2013 Trailer',
+ 'timestamp': 1370938118,
+ 'upload_date': '20130611',
'duration': 82,
- },
- 'skip': 'Blocked in the US'
+ }
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group(1)
-
- info_url = ('http://www.clipfish.de/devxml/videoinfo/%s?ts=%d' %
- (video_id, int(time.time())))
- doc = self._download_xml(
- info_url, video_id, note='Downloading info page')
- title = doc.find('title').text
- video_url = doc.find('filename').text
- if video_url is None:
- xml_bytes = xml.etree.ElementTree.tostring(doc)
- raise ExtractorError('Cannot find video URL in document %r' %
- xml_bytes)
- thumbnail = doc.find('imageurl').text
- duration = parse_duration(doc.find('duration').text)
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_info = self._parse_json(
+ js_to_json(self._html_search_regex(
+ '(?s)videoObject\s*=\s*({.+?});', webpage, 'video object')),
+ video_id)
+
+ formats = []
+ for video_url in re.findall(r'var\s+videourl\s*=\s*"([^"]+)"', webpage):
+ ext = determine_ext(video_url)
+ if ext == 'm3u8':
+ formats.append({
+ 'url': video_url.replace('de.hls.fra.clipfish.de', 'hls.fra.clipfish.de'),
+ 'ext': 'mp4',
+ 'format_id': 'hls',
+ })
+ else:
+ formats.append({
+ 'url': video_url,
+ 'format_id': ext,
+ })
+ self._sort_formats(formats)
+
+ title = remove_end(self._og_search_title(webpage), ' - Video')
+ thumbnail = self._og_search_thumbnail(webpage)
+ duration = int_or_none(video_info.get('length'))
+ timestamp = parse_iso8601(self._html_search_meta('uploadDate', webpage, 'upload date'))
return {
'id': video_id,
'title': title,
- 'url': video_url,
+ 'formats': formats,
'thumbnail': thumbnail,
'duration': duration,
+ 'timestamp': timestamp,
}
diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py
index d07d544ea..8306d6fb7 100644
--- a/youtube_dl/extractor/clipsyndicate.py
+++ b/youtube_dl/extractor/clipsyndicate.py
@@ -1,7 +1,5 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..utils import (
find_xpath_attr,
@@ -10,9 +8,9 @@ from ..utils import (
class ClipsyndicateIE(InfoExtractor):
- _VALID_URL = r'http://www\.clipsyndicate\.com/video/play(list/\d+)?/(?P<id>\d+)'
+ _VALID_URL = r'http://(?:chic|www)\.clipsyndicate\.com/video/play(list/\d+)?/(?P<id>\d+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe',
'md5': '4d7d549451bad625e0ff3d7bd56d776c',
'info_dict': {
@@ -22,11 +20,13 @@ class ClipsyndicateIE(InfoExtractor):
'duration': 612,
'thumbnail': 're:^https?://.+\.jpg',
},
- }
+ }, {
+ 'url': 'http://chic.clipsyndicate.com/video/play/5844117/shark_attack',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
js_player = self._download_webpage(
'http://eplayer.clipsyndicate.com/embed/player.js?va_id=%s' % video_id,
video_id, 'Downlaoding player')
diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py
index abf8cc280..0fa720ee8 100644
--- a/youtube_dl/extractor/cloudy.py
+++ b/youtube_dl/extractor/cloudy.py
@@ -105,6 +105,7 @@ class CloudyIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
file_key = self._search_regex(
- r'filekey\s*=\s*"([^"]+)"', webpage, 'file_key')
+ [r'key\s*:\s*"([^"]+)"', r'filekey\s*=\s*"([^"]+)"'],
+ webpage, 'file_key')
return self._extract_video(video_host, video_id, file_key)
diff --git a/youtube_dl/extractor/clubic.py b/youtube_dl/extractor/clubic.py
index 14f215c5c..1dfa7c12e 100644
--- a/youtube_dl/extractor/clubic.py
+++ b/youtube_dl/extractor/clubic.py
@@ -12,9 +12,9 @@ from ..utils import (
class ClubicIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?clubic\.com/video/[^/]+/video.*-(?P<id>[0-9]+)\.html'
+ _VALID_URL = r'http://(?:www\.)?clubic\.com/video/(?:[^/]+/)*video.*-(?P<id>[0-9]+)\.html'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.clubic.com/video/clubic-week/video-clubic-week-2-0-le-fbi-se-lance-dans-la-photo-d-identite-448474.html',
'md5': '1592b694ba586036efac1776b0b43cd3',
'info_dict': {
@@ -24,7 +24,10 @@ class ClubicIE(InfoExtractor):
'description': 're:Gueule de bois chez Nokia. Le constructeur a indiqué cette.*',
'thumbnail': 're:^http://img\.clubic\.com/.*\.jpg$',
}
- }
+ }, {
+ 'url': 'http://www.clubic.com/video/video-clubic-week-2-0-apple-iphone-6s-et-plus-mais-surtout-le-pencil-469792.html',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
diff --git a/youtube_dl/extractor/clyp.py b/youtube_dl/extractor/clyp.py
new file mode 100644
index 000000000..57e643799
--- /dev/null
+++ b/youtube_dl/extractor/clyp.py
@@ -0,0 +1,57 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ float_or_none,
+ parse_iso8601,
+)
+
+
+class ClypIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?clyp\.it/(?P<id>[a-z0-9]+)'
+ _TEST = {
+ 'url': 'https://clyp.it/ojz2wfah',
+ 'md5': '1d4961036c41247ecfdcc439c0cddcbb',
+ 'info_dict': {
+ 'id': 'ojz2wfah',
+ 'ext': 'mp3',
+ 'title': 'Krisson80 - bits wip wip',
+ 'description': '#Krisson80BitsWipWip #chiptune\n#wip',
+ 'duration': 263.21,
+ 'timestamp': 1443515251,
+ 'upload_date': '20150929',
+ },
+ }
+
+ def _real_extract(self, url):
+ audio_id = self._match_id(url)
+
+ metadata = self._download_json(
+ 'https://api.clyp.it/%s' % audio_id, audio_id)
+
+ formats = []
+ for secure in ('', 'Secure'):
+ for ext in ('Ogg', 'Mp3'):
+ format_id = '%s%s' % (secure, ext)
+ format_url = metadata.get('%sUrl' % format_id)
+ if format_url:
+ formats.append({
+ 'url': format_url,
+ 'format_id': format_id,
+ 'vcodec': 'none',
+ })
+ self._sort_formats(formats)
+
+ title = metadata['Title']
+ description = metadata.get('Description')
+ duration = float_or_none(metadata.get('Duration'))
+ timestamp = parse_iso8601(metadata.get('DateCreated'))
+
+ return {
+ 'id': audio_id,
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/cmt.py b/youtube_dl/extractor/cmt.py
index e96c59f71..f1311b14f 100644
--- a/youtube_dl/extractor/cmt.py
+++ b/youtube_dl/extractor/cmt.py
@@ -4,7 +4,7 @@ from .mtv import MTVIE
class CMTIE(MTVIE):
IE_NAME = 'cmt.com'
- _VALID_URL = r'https?://www\.cmt\.com/videos/.+?/(?P<videoid>[^/]+)\.jhtml'
+ _VALID_URL = r'https?://www\.cmt\.com/(?:videos|shows)/(?:[^/]+/)*(?P<videoid>\d+)'
_FEED_URL = 'http://www.cmt.com/sitewide/apps/player/embed/rss/'
_TESTS = [{
@@ -16,4 +16,7 @@ class CMTIE(MTVIE):
'title': 'Garth Brooks - "The Call (featuring Trisha Yearwood)"',
'description': 'Blame It All On My Roots',
},
+ }, {
+ 'url': 'http://www.cmt.com/shows/party-down-south/party-down-south-ep-407-gone-girl/1738172/playlist/#id=1738172',
+ 'only_matching': True,
}]
diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py
index 3145b3051..5dd69bff7 100644
--- a/youtube_dl/extractor/cnet.py
+++ b/youtube_dl/extractor/cnet.py
@@ -11,7 +11,7 @@ from ..utils import (
class CNETIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?cnet\.com/videos/(?P<id>[^/]+)/'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/',
'info_dict': {
'id': '56f4ea68-bd21-4852-b08c-4de5b8354c60',
@@ -25,7 +25,20 @@ class CNETIE(InfoExtractor):
'params': {
'skip_download': 'requires rtmpdump',
}
- }
+ }, {
+ 'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/',
+ 'info_dict': {
+ 'id': '56527b93-d25d-44e3-b738-f989ce2e49ba',
+ 'ext': 'flv',
+ 'description': 'Khail and Ashley wonder what other civic woes can be solved by self-tweeting objects, investigate a new kind of VR camera and watch an origami robot self-assemble, walk, climb, dig and dissolve. #TDPothole',
+ 'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40',
+ 'uploader': 'Ashley Esqueda',
+ 'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)',
+ },
+ 'params': {
+ 'skip_download': True, # requires rtmpdump
+ },
+ }]
def _real_extract(self, url):
display_id = self._match_id(url)
@@ -42,7 +55,7 @@ class CNETIE(InfoExtractor):
raise ExtractorError('Cannot find video data')
mpx_account = data['config']['players']['default']['mpx_account']
- vid = vdata['files']['rtmp']
+ vid = vdata['files'].get('rtmp', vdata['files']['hds'])
tp_link = 'http://link.theplatform.com/s/%s/%s' % (mpx_account, vid)
video_id = vdata['id']
diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py
index 90ea07438..3b1bd4033 100644
--- a/youtube_dl/extractor/cnn.py
+++ b/youtube_dl/extractor/cnn.py
@@ -12,7 +12,7 @@ from ..utils import (
class CNNIE(InfoExtractor):
_VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/
- (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:cnn|hln)(?:-ap)?|(?=&)))'''
+ (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))'''
_TESTS = [{
'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn',
@@ -45,6 +45,12 @@ class CNNIE(InfoExtractor):
'description': 'md5:e7223a503315c9f150acac52e76de086',
'upload_date': '20141222',
}
+ }, {
+ 'url': 'http://cnn.com/video/?/video/politics/2015/03/27/pkg-arizona-senator-church-attendance-mandatory.ktvk',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://cnn.com/video/?/video/us/2015/04/06/dnt-baker-refuses-anti-gay-order.wkmg',
+ 'only_matching': True,
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/collegerama.py b/youtube_dl/extractor/collegerama.py
index fedd48490..40667a0f1 100644
--- a/youtube_dl/extractor/collegerama.py
+++ b/youtube_dl/extractor/collegerama.py
@@ -3,10 +3,10 @@ from __future__ import unicode_literals
import json
from .common import InfoExtractor
-from ..compat import compat_urllib_request
from ..utils import (
float_or_none,
int_or_none,
+ sanitized_Request,
)
@@ -52,7 +52,7 @@ class CollegeRamaIE(InfoExtractor):
}
}
- request = compat_urllib_request.Request(
+ request = sanitized_Request(
'http://collegerama.tudelft.nl/Mediasite/PlayerService/PlayerService.svc/json/GetPlayerOptions',
json.dumps(player_options_request))
request.add_header('Content-Type', 'application/json')
diff --git a/youtube_dl/extractor/comcarcoff.py b/youtube_dl/extractor/comcarcoff.py
index 9c25b2223..81f3d7697 100644
--- a/youtube_dl/extractor/comcarcoff.py
+++ b/youtube_dl/extractor/comcarcoff.py
@@ -36,7 +36,7 @@ class ComCarCoffIE(InfoExtractor):
webpage, 'full data json'))
video_id = full_data['activeVideo']['video']
- video_data = full_data['videos'][video_id]
+ video_data = full_data.get('videos', {}).get(video_id) or full_data['singleshots'][video_id]
thumbnails = [{
'url': video_data['images']['thumb'],
}, {
diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py
index b24538981..3e4bd10b6 100644
--- a/youtube_dl/extractor/comedycentral.py
+++ b/youtube_dl/extractor/comedycentral.py
@@ -151,12 +151,7 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor):
mobj = re.match(self._VALID_URL, url)
if mobj.group('shortname'):
- if mobj.group('shortname') in ('tds', 'thedailyshow'):
- url = 'http://thedailyshow.cc.com/full-episodes/'
- else:
- url = 'http://thecolbertreport.cc.com/full-episodes/'
- mobj = re.match(self._VALID_URL, url, re.VERBOSE)
- assert mobj is not None
+ return self.url_result('http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes')
if mobj.group('clip'):
if mobj.group('videotitle'):
@@ -201,7 +196,7 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor):
uri = mMovieParams[0][1]
# Correct cc.com in uri
- uri = re.sub(r'(episode:[^.]+)(\.cc)?\.com', r'\1.cc.com', uri)
+ uri = re.sub(r'(episode:[^.]+)(\.cc)?\.com', r'\1.com', uri)
index_url = 'http://%s.cc.com/feeds/mrss?%s' % (show_name, compat_urllib_parse.urlencode({'uri': uri}))
idoc = self._download_xml(
@@ -250,6 +245,8 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor):
})
self._sort_formats(formats)
+ subtitles = self._extract_subtitles(cdoc, guid)
+
virtual_id = show_name + ' ' + epTitle + ' part ' + compat_str(part_num + 1)
entries.append({
'id': guid,
@@ -260,6 +257,7 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor):
'duration': duration,
'thumbnail': thumbnail,
'description': description,
+ 'subtitles': subtitles,
})
return {
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 48742189a..eb9bfa3d1 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -10,30 +10,39 @@ import re
import socket
import sys
import time
-import xml.etree.ElementTree
from ..compat import (
compat_cookiejar,
- compat_HTTPError,
+ compat_cookies,
+ compat_getpass,
compat_http_client,
compat_urllib_error,
+ compat_urllib_parse,
compat_urllib_parse_urlparse,
compat_urlparse,
compat_str,
+ compat_etree_fromstring,
)
from ..utils import (
+ NO_DEFAULT,
age_restricted,
+ bug_reports_message,
clean_html,
compiled_regex_type,
+ determine_ext,
ExtractorError,
+ fix_xml_ampersands,
float_or_none,
- HEADRequest,
int_or_none,
RegexNotFoundError,
sanitize_filename,
+ sanitized_Request,
unescapeHTML,
+ unified_strdate,
+ url_basename,
+ xpath_text,
+ xpath_with_ns,
)
-_NO_DEFAULT = object()
class InfoExtractor(object):
@@ -47,7 +56,7 @@ class InfoExtractor(object):
information possibly downloading the video to the file system, among
other possible outcomes.
- The type field determines the the type of the result.
+ The type field determines the type of the result.
By far the most common value (and the default if _type is missing) is
"video", which indicates a single video.
@@ -63,7 +72,7 @@ class InfoExtractor(object):
Potential fields:
* url Mandatory. The URL of the video file
- * ext Will be calculated from url if missing
+ * ext Will be calculated from URL if missing
* format A human-readable description of the format
("mp4 container with h264/opus").
Calculated from the format_id, width, height.
@@ -111,11 +120,8 @@ class InfoExtractor(object):
(quality takes higher priority)
-1 for default (order by other properties),
-2 or smaller for less than default.
- * http_method HTTP method to use for the download.
* http_headers A dictionary of additional HTTP headers
to add to the request.
- * http_post_data Additional data to send with a POST
- request.
* stretched_ratio If given and not 1, indicates that the
video's pixels are not square.
width : height ratio as float.
@@ -146,17 +152,26 @@ class InfoExtractor(object):
description: Full video description.
uploader: Full name of the video uploader.
creator: The main artist who created the video.
+ release_date: The date (YYYYMMDD) when the video was released.
timestamp: UNIX timestamp of the moment the video became available.
upload_date: Video upload date (YYYYMMDD).
If not explicitly set, calculated from timestamp.
uploader_id: Nickname or id of the video uploader.
location: Physical location where the video was filmed.
- subtitles: The subtitle file contents as a dictionary in the format
- {language: subtitles}.
+ subtitles: The available subtitles as a dictionary in the format
+ {language: subformats}. "subformats" is a list sorted from
+ lower to higher preference, each element is a dictionary
+ with the "ext" entry and one of:
+ * "data": The subtitles file contents
+ * "url": A URL pointing to the subtitles file
+ "ext" will be calculated from URL if missing
+ automatic_captions: Like 'subtitles', used by the YoutubeIE for
+ automatically generated captions
duration: Length of the video in seconds, as an integer.
view_count: How many users have watched the video on the platform.
like_count: Number of positive ratings of the video
dislike_count: Number of negative ratings of the video
+ repost_count: Number of reposts of the video
average_rating: Average rating give by users, the scale used depends on the webpage
comment_count: Number of comments on the video
comments: A list of comments, each with one or more of the following
@@ -171,13 +186,18 @@ class InfoExtractor(object):
Set to "root" to indicate that this is a
comment to the original video.
age_limit: Age restriction for the video, as an integer (years)
- webpage_url: The url to the video webpage, if given to youtube-dl it
+ webpage_url: The URL to the video webpage, if given to youtube-dl it
should allow to get the same result again. (It will be set
by YoutubeDL if it's missing)
categories: A list of categories that the video falls in, for example
["Sports", "Berlin"]
+ tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
is_live: True, False, or None (=unknown). Whether this video is a
live stream that goes on instead of a fixed-length video.
+ start_time: Time in seconds where the reproduction should start, as
+ specified in the URL.
+ end_time: Time in seconds where the reproduction should end, as
+ specified in the URL.
Unless mentioned otherwise, the fields should be Unicode strings.
@@ -188,8 +208,8 @@ class InfoExtractor(object):
There must be a key "entries", which is a list, an iterable, or a PagedList
object, each element of which is a valid dictionary by this specification.
- Additionally, playlists can have "title" and "id" attributes with the same
- semantics as videos (see above).
+ Additionally, playlists can have "title", "description" and "id" attributes
+ with the same semantics as videos (see above).
_type "multi_video" indicates that there are multiple videos that
@@ -290,11 +310,11 @@ class InfoExtractor(object):
@classmethod
def ie_key(cls):
"""A string for getting the InfoExtractor with get_info_extractor"""
- return cls.__name__[:-2]
+ return compat_str(cls.__name__[:-2])
@property
def IE_NAME(self):
- return type(self).__name__[:-2]
+ return compat_str(type(self).__name__[:-2])
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
""" Returns the response handle """
@@ -319,7 +339,7 @@ class InfoExtractor(object):
self._downloader.report_warning(errmsg)
return False
- def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
+ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
""" Returns a tuple (page content as string, URL handle) """
# Strip hashes from the URL (#1038)
if isinstance(url_or_request, (compat_str, str)):
@@ -329,14 +349,11 @@ class InfoExtractor(object):
if urlh is False:
assert not fatal
return False
- content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
+ content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
return (content, urlh)
- def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None):
- content_type = urlh.headers.get('Content-Type', '')
- webpage_bytes = urlh.read()
- if prefix is not None:
- webpage_bytes = prefix + webpage_bytes
+ @staticmethod
+ def _guess_encoding_from_content(content_type, webpage_bytes):
m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
if m:
encoding = m.group(1)
@@ -349,6 +366,16 @@ class InfoExtractor(object):
encoding = 'utf-16'
else:
encoding = 'utf-8'
+
+ return encoding
+
+ def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
+ content_type = urlh.headers.get('Content-Type', '')
+ webpage_bytes = urlh.read()
+ if prefix is not None:
+ webpage_bytes = prefix + webpage_bytes
+ if not encoding:
+ encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
if self._downloader.params.get('dump_intermediate_pages', False):
try:
url = url_or_request.get_full_url()
@@ -392,16 +419,26 @@ class InfoExtractor(object):
if blocked_iframe:
msg += ' Visit %s for more details' % blocked_iframe
raise ExtractorError(msg, expected=True)
+ if '<title>The URL you requested has been blocked</title>' in content[:512]:
+ msg = (
+ 'Access to this webpage has been blocked by Indian censorship. '
+ 'Use a VPN or proxy server (with --proxy) to route around it.')
+ block_msg = self._html_search_regex(
+ r'</h1><p>(.*?)</p>',
+ content, 'block message', default=None)
+ if block_msg:
+ msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
+ raise ExtractorError(msg, expected=True)
return content
- def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5):
+ def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
""" Returns the data of the page as a string """
success = False
try_count = 0
while success is False:
try:
- res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
+ res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
success = True
except compat_http_client.IncompleteRead as e:
try_count += 1
@@ -416,23 +453,24 @@ class InfoExtractor(object):
def _download_xml(self, url_or_request, video_id,
note='Downloading XML', errnote='Unable to download XML',
- transform_source=None, fatal=True):
+ transform_source=None, fatal=True, encoding=None):
"""Return the xml as an xml.etree.ElementTree.Element"""
xml_string = self._download_webpage(
- url_or_request, video_id, note, errnote, fatal=fatal)
+ url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
if xml_string is False:
return xml_string
if transform_source:
xml_string = transform_source(xml_string)
- return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
+ return compat_etree_fromstring(xml_string.encode('utf-8'))
def _download_json(self, url_or_request, video_id,
note='Downloading JSON metadata',
errnote='Unable to download JSON metadata',
transform_source=None,
- fatal=True):
+ fatal=True, encoding=None):
json_string = self._download_webpage(
- url_or_request, video_id, note, errnote, fatal=fatal)
+ url_or_request, video_id, note, errnote, fatal=fatal,
+ encoding=encoding)
if (not fatal) and json_string is False:
return None
return self._parse_json(
@@ -475,16 +513,30 @@ class InfoExtractor(object):
"""Report attempt to log in."""
self.to_screen('Logging in')
+ @staticmethod
+ def raise_login_required(msg='This video is only available for registered users'):
+ raise ExtractorError(
+ '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
+ expected=True)
+
+ @staticmethod
+ def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
+ raise ExtractorError(
+ '%s. You might want to use --proxy to workaround.' % msg,
+ expected=True)
+
# Methods for following #608
@staticmethod
- def url_result(url, ie=None, video_id=None):
- """Returns a url that points to a page that should be processed"""
+ def url_result(url, ie=None, video_id=None, video_title=None):
+ """Returns a URL that points to a page that should be processed"""
# TODO: ie should be the class used for getting the info
video_info = {'_type': 'url',
'url': url,
'ie_key': ie}
if video_id is not None:
video_info['id'] = video_id
+ if video_title is not None:
+ video_info['title'] = video_title
return video_info
@staticmethod
@@ -500,7 +552,7 @@ class InfoExtractor(object):
video_info['description'] = playlist_description
return video_info
- def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
+ def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
"""
Perform a regex search on the given string, using a single or a list of
patterns returning the first matching group.
@@ -526,16 +578,15 @@ class InfoExtractor(object):
return next(g for g in mobj.groups() if g is not None)
else:
return mobj.group(group)
- elif default is not _NO_DEFAULT:
+ elif default is not NO_DEFAULT:
return default
elif fatal:
raise RegexNotFoundError('Unable to extract %s' % _name)
else:
- self._downloader.report_warning('unable to extract %s; '
- 'please report this issue on http://yt-dl.org/bug' % _name)
+ self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
return None
- def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
+ def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
"""
Like _search_regex, but strips HTML tags and unescapes entities.
"""
@@ -547,7 +598,7 @@ class InfoExtractor(object):
def _get_login_info(self):
"""
- Get the the login info as (username, password)
+ Get the login info as (username, password)
It will look in the netrc file using the _NETRC_MACHINE value
If there's no info available, return (None, None)
"""
@@ -575,7 +626,7 @@ class InfoExtractor(object):
return (username, password)
- def _get_tfa_info(self):
+ def _get_tfa_info(self, note='two-factor verification code'):
"""
Get the two-factor authentication info
TODO - asking the user will be required for sms/phone verify
@@ -589,19 +640,26 @@ class InfoExtractor(object):
if downloader_params.get('twofactor', None) is not None:
return downloader_params['twofactor']
- return None
+ return compat_getpass('Type %s and press [Return]: ' % note)
# Helper functions for extracting OpenGraph info
@staticmethod
def _og_regexes(prop):
- content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
- property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
+ content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
+ property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
+ % {'prop': re.escape(prop)})
template = r'<meta[^>]+?%s[^>]+?%s'
return [
template % (property_re, content_re),
template % (content_re, property_re),
]
+ @staticmethod
+ def _meta_regex(prop):
+ return r'''(?isx)<meta
+ (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
+ [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
+
def _og_search_property(self, prop, html, name=None, **kargs):
if name is None:
name = 'OpenGraph %s' % prop
@@ -611,7 +669,7 @@ class InfoExtractor(object):
return unescapeHTML(escaped)
def _og_search_thumbnail(self, html, **kargs):
- return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
+ return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
def _og_search_description(self, html, **kargs):
return self._og_search_property('description', html, fatal=False, **kargs)
@@ -632,9 +690,7 @@ class InfoExtractor(object):
if display_name is None:
display_name = name
return self._html_search_regex(
- r'''(?isx)<meta
- (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
- [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name),
+ self._meta_regex(name),
html, display_name, fatal=fatal, group='content', **kwargs)
def _dc_search_uploader(self, html):
@@ -665,7 +721,7 @@ class InfoExtractor(object):
return RATING_TABLE.get(rating.lower(), None)
def _family_friendly_search(self, html):
- # See http://schema.org/VideoObj
+ # See http://schema.org/VideoObject
family_friendly = self._html_search_meta('isFamilyFriendly', html)
if not family_friendly:
@@ -683,7 +739,29 @@ class InfoExtractor(object):
return self._html_search_meta('twitter:player', html,
'twitter card player')
- def _sort_formats(self, formats):
+ @staticmethod
+ def _hidden_inputs(html):
+ html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
+ hidden_inputs = {}
+ for input in re.findall(r'(?i)<input([^>]+)>', html):
+ if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
+ continue
+ name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
+ if not name:
+ continue
+ value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
+ if not value:
+ continue
+ hidden_inputs[name.group('value')] = value.group('value')
+ return hidden_inputs
+
+ def _form_hidden_inputs(self, form_id, html):
+ form = self._search_regex(
+ r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
+ html, '%s form' % form_id, group='form')
+ return self._hidden_inputs(form)
+
+ def _sort_formats(self, formats, field_preference=None):
if not formats:
raise ExtractorError('No video formats found')
@@ -693,6 +771,9 @@ class InfoExtractor(object):
if not f.get('ext') and 'url' in f:
f['ext'] = determine_ext(f['url'])
+ if isinstance(field_preference, (list, tuple)):
+ return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
+
preference = f.get('preference')
if preference is None:
proto = f.get('protocol')
@@ -729,6 +810,7 @@ class InfoExtractor(object):
f.get('language_preference') if f.get('language_preference') is not None else -1,
f.get('quality') if f.get('quality') is not None else -1,
f.get('tbr') if f.get('tbr') is not None else -1,
+ f.get('filesize') if f.get('filesize') is not None else -1,
f.get('vbr') if f.get('vbr') is not None else -1,
f.get('height') if f.get('height') is not None else -1,
f.get('width') if f.get('width') is not None else -1,
@@ -736,10 +818,9 @@ class InfoExtractor(object):
f.get('abr') if f.get('abr') is not None else -1,
audio_ext_preference,
f.get('fps') if f.get('fps') is not None else -1,
- f.get('filesize') if f.get('filesize') is not None else -1,
f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
f.get('source_preference') if f.get('source_preference') is not None else -1,
- f.get('format_id'),
+ f.get('format_id') if f.get('format_id') is not None else '',
)
formats.sort(key=_formats_key)
@@ -752,15 +833,17 @@ class InfoExtractor(object):
formats)
def _is_valid_url(self, url, video_id, item='video'):
+ url = self._proto_relative_url(url, scheme='http:')
+ # For now assume non HTTP(S) URLs always valid
+ if not (url.startswith('http://') or url.startswith('https://')):
+ return True
try:
- self._request_webpage(
- HEADRequest(url), video_id,
- 'Checking %s URL' % item)
+ self._request_webpage(url, video_id, 'Checking %s URL' % item)
return True
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError):
- self.report_warning(
- '%s URL is invalid, skipping' % item, video_id)
+ if isinstance(e.cause, compat_urllib_error.URLError):
+ self.to_screen(
+ '%s: %s URL is invalid, skipping' % (video_id, item))
return False
raise
@@ -788,10 +871,19 @@ class InfoExtractor(object):
self.to_screen(msg)
time.sleep(timeout)
- def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None):
+ def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
+ transform_source=lambda s: fix_xml_ampersands(s).strip(),
+ fatal=True):
manifest = self._download_xml(
manifest_url, video_id, 'Downloading f4m manifest',
- 'Unable to download f4m manifest')
+ 'Unable to download f4m manifest',
+ # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
+ # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
+ transform_source=transform_source,
+ fatal=fatal)
+
+ if manifest is False:
+ return manifest
formats = []
manifest_version = '1.0'
@@ -799,13 +891,32 @@ class InfoExtractor(object):
if not media_nodes:
manifest_version = '2.0'
media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
+ base_url = xpath_text(
+ manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
+ 'base URL', default=None)
+ if base_url:
+ base_url = base_url.strip()
for i, media_el in enumerate(media_nodes):
if manifest_version == '2.0':
- manifest_url = ('/'.join(manifest_url.split('/')[:-1]) + '/'
- + (media_el.attrib.get('href') or media_el.attrib.get('url')))
+ media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
+ if not media_url:
+ continue
+ manifest_url = (
+ media_url if media_url.startswith('http://') or media_url.startswith('https://')
+ else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
+ # If media_url is itself a f4m manifest do the recursive extraction
+ # since bitrates in parent manifest (this one) and media_url manifest
+ # may differ leading to inability to resolve the format by requested
+ # bitrate in f4m downloader
+ if determine_ext(manifest_url) == 'f4m':
+ f4m_formats = self._extract_f4m_formats(
+ manifest_url, video_id, preference, f4m_id, fatal=fatal)
+ if f4m_formats:
+ formats.extend(f4m_formats)
+ continue
tbr = int_or_none(media_el.attrib.get('bitrate'))
formats.append({
- 'format_id': '-'.join(filter(None, [f4m_id, 'f4m-%d' % (i if tbr is None else tbr)])),
+ 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
'url': manifest_url,
'ext': 'flv',
'tbr': tbr,
@@ -819,14 +930,15 @@ class InfoExtractor(object):
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
entry_protocol='m3u8', preference=None,
- m3u8_id=None):
+ m3u8_id=None, note=None, errnote=None,
+ fatal=True):
formats = [{
- 'format_id': '-'.join(filter(None, [m3u8_id, 'm3u8-meta'])),
+ 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
'url': m3u8_url,
'ext': ext,
'protocol': 'm3u8',
- 'preference': -1,
+ 'preference': preference - 1 if preference else -1,
'resolution': 'multiple',
'format_note': 'Quality selection URL',
}]
@@ -836,11 +948,17 @@ class InfoExtractor(object):
if re.match(r'^https?://', u)
else compat_urlparse.urljoin(m3u8_url, u))
- m3u8_doc = self._download_webpage(
+ res = self._download_webpage_handle(
m3u8_url, video_id,
- note='Downloading m3u8 information',
- errnote='Failed to download m3u8 information')
+ note=note or 'Downloading m3u8 information',
+ errnote=errnote or 'Failed to download m3u8 information',
+ fatal=fatal)
+ if res is False:
+ return res
+ m3u8_doc, urlh = res
+ m3u8_url = urlh.geturl()
last_info = None
+ last_media = None
kv_rex = re.compile(
r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
for line in m3u8_doc.splitlines():
@@ -851,6 +969,13 @@ class InfoExtractor(object):
if v.startswith('"'):
v = v[1:-1]
last_info[m.group('key')] = v
+ elif line.startswith('#EXT-X-MEDIA:'):
+ last_media = {}
+ for m in kv_rex.finditer(line):
+ v = m.group('val')
+ if v.startswith('"'):
+ v = v[1:-1]
+ last_media[m.group('key')] = v
elif line.startswith('#') or not line.strip():
continue
else:
@@ -858,8 +983,13 @@ class InfoExtractor(object):
formats.append({'url': format_url(line)})
continue
tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
+ format_id = []
+ if m3u8_id:
+ format_id.append(m3u8_id)
+ last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
+ format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
f = {
- 'format_id': '-'.join(filter(None, [m3u8_id, 'm3u8-%d' % (tbr if tbr else len(formats))])),
+ 'format_id': '-'.join(format_id),
'url': format_url(line.strip()),
'tbr': tbr,
'ext': ext,
@@ -879,57 +1009,246 @@ class InfoExtractor(object):
width_str, height_str = resolution.split('x')
f['width'] = int(width_str)
f['height'] = int(height_str)
+ if last_media is not None:
+ f['m3u8_media'] = last_media
+ last_media = None
formats.append(f)
last_info = {}
self._sort_formats(formats)
return formats
- # TODO: improve extraction
- def _extract_smil_formats(self, smil_url, video_id, fatal=True):
- smil = self._download_xml(
- smil_url, video_id, 'Downloading SMIL file',
- 'Unable to download SMIL file', fatal=fatal)
+ @staticmethod
+ def _xpath_ns(path, namespace=None):
+ if not namespace:
+ return path
+ out = []
+ for c in path.split('/'):
+ if not c or c == '.':
+ out.append(c)
+ else:
+ out.append('{%s}%s' % (namespace, c))
+ return '/'.join(out)
+
+ def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
+ smil = self._download_smil(smil_url, video_id, fatal=fatal)
+
if smil is False:
assert not fatal
return []
- base = smil.find('./head/meta').get('base')
+ namespace = self._parse_smil_namespace(smil)
+
+ return self._parse_smil_formats(
+ smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
+
+ def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
+ smil = self._download_smil(smil_url, video_id, fatal=fatal)
+ if smil is False:
+ return {}
+ return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
+
+ def _download_smil(self, smil_url, video_id, fatal=True):
+ return self._download_xml(
+ smil_url, video_id, 'Downloading SMIL file',
+ 'Unable to download SMIL file', fatal=fatal)
+
+ def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
+ namespace = self._parse_smil_namespace(smil)
+
+ formats = self._parse_smil_formats(
+ smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
+ subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
+
+ video_id = os.path.splitext(url_basename(smil_url))[0]
+ title = None
+ description = None
+ upload_date = None
+ for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
+ name = meta.attrib.get('name')
+ content = meta.attrib.get('content')
+ if not name or not content:
+ continue
+ if not title and name == 'title':
+ title = content
+ elif not description and name in ('description', 'abstract'):
+ description = content
+ elif not upload_date and name == 'date':
+ upload_date = unified_strdate(content)
+
+ thumbnails = [{
+ 'id': image.get('type'),
+ 'url': image.get('src'),
+ 'width': int_or_none(image.get('width')),
+ 'height': int_or_none(image.get('height')),
+ } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
+
+ return {
+ 'id': video_id,
+ 'title': title or video_id,
+ 'description': description,
+ 'upload_date': upload_date,
+ 'thumbnails': thumbnails,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+
+ def _parse_smil_namespace(self, smil):
+ return self._search_regex(
+ r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
+
+ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
+ base = smil_url
+ for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
+ b = meta.get('base') or meta.get('httpBase')
+ if b:
+ base = b
+ break
formats = []
rtmp_count = 0
- for video in smil.findall('./body/switch/video'):
+ http_count = 0
+
+ videos = smil.findall(self._xpath_ns('.//video', namespace))
+ for video in videos:
src = video.get('src')
if not src:
continue
- bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
+
+ bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
+ filesize = int_or_none(video.get('size') or video.get('fileSize'))
width = int_or_none(video.get('width'))
height = int_or_none(video.get('height'))
proto = video.get('proto')
- if not proto:
- if base:
- if base.startswith('rtmp'):
- proto = 'rtmp'
- elif base.startswith('http'):
- proto = 'http'
ext = video.get('ext')
- if proto == 'm3u8':
- formats.extend(self._extract_m3u8_formats(src, video_id, ext))
- elif proto == 'rtmp':
+ src_ext = determine_ext(src)
+ streamer = video.get('streamer') or base
+
+ if proto == 'rtmp' or streamer.startswith('rtmp'):
rtmp_count += 1
- streamer = video.get('streamer') or base
formats.append({
'url': streamer,
'play_path': src,
'ext': 'flv',
'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
'tbr': bitrate,
+ 'filesize': filesize,
+ 'width': width,
+ 'height': height,
+ })
+ if transform_rtmp_url:
+ streamer, src = transform_rtmp_url(streamer, src)
+ formats[-1].update({
+ 'url': streamer,
+ 'play_path': src,
+ })
+ continue
+
+ src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
+
+ if proto == 'm3u8' or src_ext == 'm3u8':
+ m3u8_formats = self._extract_m3u8_formats(
+ src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
+ if m3u8_formats:
+ formats.extend(m3u8_formats)
+ continue
+
+ if src_ext == 'f4m':
+ f4m_url = src_url
+ if not f4m_params:
+ f4m_params = {
+ 'hdcore': '3.2.0',
+ 'plugin': 'flowplayer-3.2.0.1',
+ }
+ f4m_url += '&' if '?' in f4m_url else '?'
+ f4m_url += compat_urllib_parse.urlencode(f4m_params)
+ f4m_formats = self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)
+ if f4m_formats:
+ formats.extend(f4m_formats)
+ continue
+
+ if src_url.startswith('http') and self._is_valid_url(src, video_id):
+ http_count += 1
+ formats.append({
+ 'url': src_url,
+ 'ext': ext or src_ext or 'flv',
+ 'format_id': 'http-%d' % (bitrate or http_count),
+ 'tbr': bitrate,
+ 'filesize': filesize,
'width': width,
'height': height,
})
+ continue
+
self._sort_formats(formats)
return formats
+ def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
+ subtitles = {}
+ for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
+ src = textstream.get('src')
+ if not src:
+ continue
+ ext = textstream.get('ext') or determine_ext(src)
+ if not ext:
+ type_ = textstream.get('type')
+ SUBTITLES_TYPES = {
+ 'text/vtt': 'vtt',
+ 'text/srt': 'srt',
+ 'application/smptett+xml': 'tt',
+ }
+ if type_ in SUBTITLES_TYPES:
+ ext = SUBTITLES_TYPES[type_]
+ lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
+ subtitles.setdefault(lang, []).append({
+ 'url': src,
+ 'ext': ext,
+ })
+ return subtitles
+
+ def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
+ xspf = self._download_xml(
+ playlist_url, playlist_id, 'Downloading xpsf playlist',
+ 'Unable to download xspf manifest', fatal=fatal)
+ if xspf is False:
+ return []
+ return self._parse_xspf(xspf, playlist_id)
+
+ def _parse_xspf(self, playlist, playlist_id):
+ NS_MAP = {
+ 'xspf': 'http://xspf.org/ns/0/',
+ 's1': 'http://static.streamone.nl/player/ns/0',
+ }
+
+ entries = []
+ for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
+ title = xpath_text(
+ track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
+ description = xpath_text(
+ track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
+ thumbnail = xpath_text(
+ track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
+ duration = float_or_none(
+ xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
+
+ formats = [{
+ 'url': location.text,
+ 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
+ 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
+ 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
+ } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
+ self._sort_formats(formats)
+
+ entries.append({
+ 'id': playlist_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'formats': formats,
+ })
+ return entries
+
def _live_title(self, name):
""" Generate the title for a live video """
now = datetime.datetime.now()
@@ -964,6 +1283,12 @@ class InfoExtractor(object):
None, '/', True, False, expire_time, '', None, None, None)
self._downloader.cookiejar.set_cookie(cookie)
+ def _get_cookies(self, url):
+ """ Return a compat_cookies.SimpleCookie with the cookies for the url """
+ req = sanitized_Request(url)
+ self._downloader.cookiejar.add_cookie_header(req)
+ return compat_cookies.SimpleCookie(req.get_header('Cookie'))
+
def get_testcases(self, include_onlymatching=False):
t = getattr(self, '_TEST', None)
if t:
@@ -993,11 +1318,46 @@ class InfoExtractor(object):
any_restricted = any_restricted or is_restricted
return not any_restricted
+ def extract_subtitles(self, *args, **kwargs):
+ if (self._downloader.params.get('writesubtitles', False) or
+ self._downloader.params.get('listsubtitles')):
+ return self._get_subtitles(*args, **kwargs)
+ return {}
+
+ def _get_subtitles(self, *args, **kwargs):
+ raise NotImplementedError("This method must be implemented by subclasses")
+
+ @staticmethod
+ def _merge_subtitle_items(subtitle_list1, subtitle_list2):
+ """ Merge subtitle items for one language. Items with duplicated URLs
+ will be dropped. """
+ list1_urls = set([item['url'] for item in subtitle_list1])
+ ret = list(subtitle_list1)
+ ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
+ return ret
+
+ @classmethod
+ def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
+ """ Merge two subtitle dictionaries, language by language. """
+ ret = dict(subtitle_dict1)
+ for lang in subtitle_dict2:
+ ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
+ return ret
+
+ def extract_automatic_captions(self, *args, **kwargs):
+ if (self._downloader.params.get('writeautomaticsub', False) or
+ self._downloader.params.get('listsubtitles')):
+ return self._get_automatic_captions(*args, **kwargs)
+ return {}
+
+ def _get_automatic_captions(self, *args, **kwargs):
+ raise NotImplementedError("This method must be implemented by subclasses")
+
class SearchInfoExtractor(InfoExtractor):
"""
Base class for paged search queries extractors.
- They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
+ They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
Instances should define _SEARCH_KEY and _MAX_RESULTS.
"""
diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py
index 3db4db4e4..6f92ae2ed 100644
--- a/youtube_dl/extractor/condenast.py
+++ b/youtube_dl/extractor/condenast.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
import re
-import json
from .common import InfoExtractor
from ..compat import (
@@ -12,6 +11,7 @@ from ..compat import (
)
from ..utils import (
orderedSet,
+ remove_end,
)
@@ -24,21 +24,33 @@ class CondeNastIE(InfoExtractor):
# The keys are the supported sites and the values are the name to be shown
# to the user and in the extractor description.
_SITES = {
- 'wired': 'WIRED',
+ 'allure': 'Allure',
+ 'architecturaldigest': 'Architectural Digest',
+ 'arstechnica': 'Ars Technica',
+ 'bonappetit': 'Bon Appétit',
+ 'brides': 'Brides',
+ 'cnevids': 'Condé Nast',
+ 'cntraveler': 'Condé Nast Traveler',
+ 'details': 'Details',
+ 'epicurious': 'Epicurious',
+ 'glamour': 'Glamour',
+ 'golfdigest': 'Golf Digest',
'gq': 'GQ',
+ 'newyorker': 'The New Yorker',
+ 'self': 'SELF',
+ 'teenvogue': 'Teen Vogue',
+ 'vanityfair': 'Vanity Fair',
'vogue': 'Vogue',
- 'glamour': 'Glamour',
+ 'wired': 'WIRED',
'wmagazine': 'W Magazine',
- 'vanityfair': 'Vanity Fair',
- 'cnevids': 'Condé Nast',
}
- _VALID_URL = r'http://(video|www|player)\.(?P<site>%s)\.com/(?P<type>watch|series|video|embed)/(?P<id>[^/?#]+)' % '|'.join(_SITES.keys())
+ _VALID_URL = r'http://(?:video|www|player)\.(?P<site>%s)\.com/(?P<type>watch|series|video|embed(?:js)?)/(?P<id>[^/?#]+)' % '|'.join(_SITES.keys())
IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values()))
- EMBED_URL = r'(?:https?:)?//player\.(?P<site>%s)\.com/(?P<type>embed)/.+?' % '|'.join(_SITES.keys())
+ EMBED_URL = r'(?:https?:)?//player\.(?P<site>%s)\.com/(?P<type>embed(?:js)?)/.+?' % '|'.join(_SITES.keys())
- _TEST = {
+ _TESTS = [{
'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led',
'md5': '1921f713ed48aabd715691f774c451f7',
'info_dict': {
@@ -47,7 +59,16 @@ class CondeNastIE(InfoExtractor):
'title': '3D Printed Speakers Lit With LED',
'description': 'Check out these beautiful 3D printed LED speakers. You can\'t actually buy them, but LumiGeek is working on a board that will let you make you\'re own.',
}
- }
+ }, {
+ # JS embed
+ 'url': 'http://player.cnevids.com/embedjs/55f9cf8b61646d1acf00000c/5511d76261646d5566020000.js',
+ 'md5': 'f1a6f9cafb7083bab74a710f65d08999',
+ 'info_dict': {
+ 'id': '55f9cf8b61646d1acf00000c',
+ 'ext': 'mp4',
+ 'title': '3D printed TSA Travel Sentry keys really do open TSA locks',
+ }
+ }]
def _extract_series(self, url, webpage):
title = self._html_search_regex(r'<div class="cne-series-info">.*?<h1>(.+?)</h1>',
@@ -86,8 +107,8 @@ class CondeNastIE(InfoExtractor):
info_url = base_info_url + data
info_page = self._download_webpage(info_url, video_id,
'Downloading video info')
- video_info = self._search_regex(r'var video = ({.+?});', info_page, 'video info')
- video_info = json.loads(video_info)
+ video_info = self._search_regex(r'var\s+video\s*=\s*({.+?});', info_page, 'video info')
+ video_info = self._parse_json(video_info, video_id)
formats = [{
'format_id': '%s-%s' % (fdata['type'].split('/')[-1], fdata['quality']),
@@ -111,6 +132,13 @@ class CondeNastIE(InfoExtractor):
url_type = mobj.group('type')
item_id = mobj.group('id')
+ # Convert JS embed to regular embed
+ if url_type == 'embedjs':
+ parsed_url = compat_urlparse.urlparse(url)
+ url = compat_urlparse.urlunparse(parsed_url._replace(
+ path=remove_end(parsed_url.path, '.js').replace('/embedjs/', '/embed/')))
+ url_type = 'embed'
+
self.to_screen('Extracting from %s with the Condé Nast extractor' % self._SITES[site])
webpage = self._download_webpage(url, item_id)
diff --git a/youtube_dl/extractor/cracked.py b/youtube_dl/extractor/cracked.py
index cf763ee7e..94d03ce2a 100644
--- a/youtube_dl/extractor/cracked.py
+++ b/youtube_dl/extractor/cracked.py
@@ -11,39 +11,65 @@ from ..utils import (
class CrackedIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?cracked\.com/video_(?P<id>\d+)_[\da-z-]+\.html'
- _TEST = {
+ _TESTS = [{
+ 'url': 'http://www.cracked.com/video_19070_if-animal-actors-got-e21-true-hollywood-stories.html',
+ 'md5': '89b90b9824e3806ca95072c4d78f13f7',
+ 'info_dict': {
+ 'id': '19070',
+ 'ext': 'mp4',
+ 'title': 'If Animal Actors Got E! True Hollywood Stories',
+ 'timestamp': 1404954000,
+ 'upload_date': '20140710',
+ }
+ }, {
+ # youtube embed
'url': 'http://www.cracked.com/video_19006_4-plot-holes-you-didnt-notice-in-your-favorite-movies.html',
- 'md5': '4b29a5eeec292cd5eca6388c7558db9e',
+ 'md5': 'ccd52866b50bde63a6ef3b35016ba8c7',
'info_dict': {
- 'id': '19006',
+ 'id': 'EjI00A3rZD0',
'ext': 'mp4',
- 'title': '4 Plot Holes You Didn\'t Notice in Your Favorite Movies',
- 'description': 'md5:3b909e752661db86007d10e5ec2df769',
- 'timestamp': 1405659600,
- 'upload_date': '20140718',
+ 'title': "4 Plot Holes You Didn't Notice in Your Favorite Movies - The Spit Take",
+ 'description': 'md5:c603708c718b796fe6079e2b3351ffc7',
+ 'upload_date': '20140725',
+ 'uploader_id': 'Cracked',
+ 'uploader': 'Cracked',
}
- }
+ }]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
+ youtube_url = self._search_regex(
+ r'<iframe[^>]+src="((?:https?:)?//www\.youtube\.com/embed/[^"]+)"',
+ webpage, 'youtube url', default=None)
+ if youtube_url:
+ return self.url_result(youtube_url, 'Youtube')
+
video_url = self._html_search_regex(
- [r'var\s+CK_vidSrc\s*=\s*"([^"]+)"', r'<video\s+src="([^"]+)"'], webpage, 'video URL')
+ [r'var\s+CK_vidSrc\s*=\s*"([^"]+)"', r'<video\s+src="([^"]+)"'],
+ webpage, 'video URL')
+
+ title = self._search_regex(
+ [r'property="?og:title"?\s+content="([^"]+)"', r'class="?title"?>([^<]+)'],
+ webpage, 'title')
- title = self._og_search_title(webpage)
- description = self._og_search_description(webpage)
+ description = self._search_regex(
+ r'name="?(?:og:)?description"?\s+content="([^"]+)"',
+ webpage, 'description', default=None)
- timestamp = self._html_search_regex(r'<time datetime="([^"]+)"', webpage, 'upload date', fatal=False)
+ timestamp = self._html_search_regex(
+ r'"date"\s*:\s*"([^"]+)"', webpage, 'upload date', fatal=False)
if timestamp:
timestamp = parse_iso8601(timestamp[:-6])
view_count = str_to_int(self._html_search_regex(
- r'<span class="views" id="viewCounts">([\d,\.]+) Views</span>', webpage, 'view count', fatal=False))
+ r'<span\s+class="?views"? id="?viewCounts"?>([\d,\.]+) Views</span>',
+ webpage, 'view count', fatal=False))
comment_count = str_to_int(self._html_search_regex(
- r'<span id="commentCounts">([\d,\.]+)</span>', webpage, 'comment count', fatal=False))
+ r'<span\s+id="?commentCounts"?>([\d,\.]+)</span>',
+ webpage, 'comment count', fatal=False))
m = re.search(r'_(?P<width>\d+)X(?P<height>\d+)\.mp4$', video_url)
if m:
diff --git a/youtube_dl/extractor/criterion.py b/youtube_dl/extractor/criterion.py
index 4fb178165..dedb810a0 100644
--- a/youtube_dl/extractor/criterion.py
+++ b/youtube_dl/extractor/criterion.py
@@ -27,9 +27,7 @@ class CriterionIE(InfoExtractor):
final_url = self._search_regex(
r'so.addVariable\("videoURL", "(.+?)"\)\;', webpage, 'video url')
title = self._og_search_title(webpage)
- description = self._html_search_regex(
- r'<meta name="description" content="(.+?)" />',
- webpage, 'video description')
+ description = self._html_search_meta('description', webpage)
thumbnail = self._search_regex(
r'so.addVariable\("thumbnailURL", "(.+?)"\)\;',
webpage, 'thumbnail url')
diff --git a/youtube_dl/extractor/crooksandliars.py b/youtube_dl/extractor/crooksandliars.py
new file mode 100644
index 000000000..443eb7691
--- /dev/null
+++ b/youtube_dl/extractor/crooksandliars.py
@@ -0,0 +1,60 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ qualities,
+)
+
+
+class CrooksAndLiarsIE(InfoExtractor):
+ _VALID_URL = r'https?://embed\.crooksandliars\.com/(?:embed|v)/(?P<id>[A-Za-z0-9]+)'
+ _TESTS = [{
+ 'url': 'https://embed.crooksandliars.com/embed/8RUoRhRi',
+ 'info_dict': {
+ 'id': '8RUoRhRi',
+ 'ext': 'mp4',
+ 'title': 'Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!',
+ 'description': 'md5:e1a46ad1650e3a5ec7196d432799127f',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'timestamp': 1428207000,
+ 'upload_date': '20150405',
+ 'uploader': 'Heather',
+ 'duration': 236,
+ }
+ }, {
+ 'url': 'http://embed.crooksandliars.com/v/MTE3MjUtMzQ2MzA',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(
+ 'http://embed.crooksandliars.com/embed/%s' % video_id, video_id)
+
+ manifest = self._parse_json(
+ self._search_regex(
+ r'var\s+manifest\s*=\s*({.+?})\n', webpage, 'manifest JSON'),
+ video_id)
+
+ quality = qualities(('webm_low', 'mp4_low', 'webm_high', 'mp4_high'))
+
+ formats = [{
+ 'url': item['url'],
+ 'format_id': item['type'],
+ 'quality': quality(item['type']),
+ } for item in manifest['flavors'] if item['mime'].startswith('video/')]
+ self._sort_formats(formats)
+
+ return {
+ 'url': url,
+ 'id': video_id,
+ 'title': manifest['title'],
+ 'description': manifest.get('description'),
+ 'thumbnail': self._proto_relative_url(manifest.get('poster')),
+ 'timestamp': int_or_none(manifest.get('created')),
+ 'uploader': manifest.get('author'),
+ 'duration': int_or_none(manifest.get('duration')),
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
index 1680f532f..00d943f77 100644
--- a/youtube_dl/extractor/crunchyroll.py
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -5,31 +5,85 @@ import re
import json
import base64
import zlib
-import xml.etree.ElementTree
from hashlib import sha1
from math import pow, sqrt, floor
-from .subtitles import SubtitlesInfoExtractor
+from .common import InfoExtractor
from ..compat import (
+ compat_etree_fromstring,
compat_urllib_parse,
+ compat_urllib_parse_unquote,
compat_urllib_request,
+ compat_urlparse,
)
from ..utils import (
ExtractorError,
bytes_to_intlist,
intlist_to_bytes,
+ int_or_none,
+ lowercase_escape,
+ remove_end,
+ sanitized_Request,
unified_strdate,
urlencode_postdata,
+ xpath_text,
)
from ..aes import (
aes_cbc_decrypt,
- inc,
)
-from .common import InfoExtractor
-class CrunchyrollIE(SubtitlesInfoExtractor):
- _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)'
+class CrunchyrollBaseIE(InfoExtractor):
+ _NETRC_MACHINE = 'crunchyroll'
+
+ def _login(self):
+ (username, password) = self._get_login_info()
+ if username is None:
+ return
+ self.report_login()
+ login_url = 'https://www.crunchyroll.com/?a=formhandler'
+ data = urlencode_postdata({
+ 'formname': 'RpcApiUser_Login',
+ 'name': username,
+ 'password': password,
+ })
+ login_request = sanitized_Request(login_url, data)
+ login_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+ self._download_webpage(login_request, None, False, 'Wrong login info')
+
+ def _real_initialize(self):
+ self._login()
+
+ def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
+ request = (url_or_request if isinstance(url_or_request, compat_urllib_request.Request)
+ else sanitized_Request(url_or_request))
+ # Accept-Language must be set explicitly to accept any language to avoid issues
+ # similar to https://github.com/rg3/youtube-dl/issues/6797.
+ # Along with IP address Crunchyroll uses Accept-Language to guess whether georestriction
+ # should be imposed or not (from what I can see it just takes the first language
+ # ignoring the priority and requires it to correspond the IP). By the way this causes
+ # Crunchyroll to not work in georestriction cases in some browsers that don't place
+ # the locale lang first in header. However allowing any language seems to workaround the issue.
+ request.add_header('Accept-Language', '*')
+ return super(CrunchyrollBaseIE, self)._download_webpage(
+ request, video_id, note, errnote, fatal, tries, timeout, encoding)
+
+ @staticmethod
+ def _add_skip_wall(url):
+ parsed_url = compat_urlparse.urlparse(url)
+ qs = compat_urlparse.parse_qs(parsed_url.query)
+ # Always force skip_wall to bypass maturity wall, namely 18+ confirmation message:
+ # > This content may be inappropriate for some people.
+ # > Are you sure you want to continue?
+ # since it's not disabled by default in crunchyroll account's settings.
+ # See https://github.com/rg3/youtube-dl/issues/7202.
+ qs['skip_wall'] = ['1']
+ return compat_urlparse.urlunparse(
+ parsed_url._replace(query=compat_urllib_parse.urlencode(qs, True)))
+
+
+class CrunchyrollIE(CrunchyrollBaseIE):
+ _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|[^/]*/[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)'
_TESTS = [{
'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
'info_dict': {
@@ -47,8 +101,27 @@ class CrunchyrollIE(SubtitlesInfoExtractor):
'skip_download': True,
},
}, {
+ 'url': 'http://www.crunchyroll.com/media-589804/culture-japan-1',
+ 'info_dict': {
+ 'id': '589804',
+ 'ext': 'flv',
+ 'title': 'Culture Japan Episode 1 – Rebuilding Japan after the 3.11',
+ 'description': 'md5:2fbc01f90b87e8e9137296f37b461c12',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'uploader': 'Danny Choo Network',
+ 'upload_date': '20120213',
+ },
+ 'params': {
+ # rtmp
+ 'skip_download': True,
+ },
+ }, {
'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697',
'only_matching': True,
+ }, {
+ # geo-restricted (US), 18+ maturity wall, non-premium available
+ 'url': 'http://www.crunchyroll.com/cosplay-complex-ova/episode-1-the-birth-of-the-cosplay-club-565617',
+ 'only_matching': True,
}]
_FORMAT_IDS = {
@@ -58,27 +131,9 @@ class CrunchyrollIE(SubtitlesInfoExtractor):
'1080': ('80', '108'),
}
- def _login(self):
- (username, password) = self._get_login_info()
- if username is None:
- return
- self.report_login()
- login_url = 'https://www.crunchyroll.com/?a=formhandler'
- data = urlencode_postdata({
- 'formname': 'RpcApiUser_Login',
- 'name': username,
- 'password': password,
- })
- login_request = compat_urllib_request.Request(login_url, data)
- login_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
- self._download_webpage(login_request, None, False, 'Wrong login info')
-
- def _real_initialize(self):
- self._login()
-
def _decrypt_subtitles(self, data, iv, id):
- data = bytes_to_intlist(data)
- iv = bytes_to_intlist(iv)
+ data = bytes_to_intlist(base64.b64decode(data.encode('utf-8')))
+ iv = bytes_to_intlist(base64.b64decode(iv.encode('utf-8')))
id = int(id)
def obfuscate_key_aux(count, modulo, start):
@@ -102,13 +157,6 @@ class CrunchyrollIE(SubtitlesInfoExtractor):
key = obfuscate_key(id)
- class Counter:
- __value = iv
-
- def next_value(self):
- temp = self.__value
- self.__value = inc(self.__value)
- return temp
decrypted_data = intlist_to_bytes(aes_cbc_decrypt(data, key, iv))
return zlib.decompress(decrypted_data)
@@ -187,6 +235,34 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
return output
+ def _extract_subtitles(self, subtitle):
+ sub_root = compat_etree_fromstring(subtitle)
+ return [{
+ 'ext': 'srt',
+ 'data': self._convert_subtitles_to_srt(sub_root),
+ }, {
+ 'ext': 'ass',
+ 'data': self._convert_subtitles_to_ass(sub_root),
+ }]
+
+ def _get_subtitles(self, video_id, webpage):
+ subtitles = {}
+ for sub_id, sub_name in re.findall(r'\bssid=([0-9]+)"[^>]+?\btitle="([^"]+)', webpage):
+ sub_page = self._download_webpage(
+ 'http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id=' + sub_id,
+ video_id, note='Downloading subtitles for ' + sub_name)
+ id = self._search_regex(r'id=\'([0-9]+)', sub_page, 'subtitle_id', fatal=False)
+ iv = self._search_regex(r'<iv>([^<]+)', sub_page, 'subtitle_iv', fatal=False)
+ data = self._search_regex(r'<data>([^<]+)', sub_page, 'subtitle_data', fatal=False)
+ if not id or not iv or not data:
+ continue
+ subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8')
+ lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False)
+ if not lang_code:
+ continue
+ subtitles[lang_code] = self._extract_subtitles(subtitle)
+ return subtitles
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('video_id')
@@ -197,8 +273,10 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
else:
webpage_url = 'http://www.' + mobj.group('url')
- webpage = self._download_webpage(webpage_url, video_id, 'Downloading webpage')
- note_m = self._html_search_regex(r'<div class="showmedia-trailer-notice">(.+?)</div>', webpage, 'trailer-notice', default='')
+ webpage = self._download_webpage(self._add_skip_wall(webpage_url), video_id, 'Downloading webpage')
+ note_m = self._html_search_regex(
+ r'<div class="showmedia-trailer-notice">(.+?)</div>',
+ webpage, 'trailer-notice', default='')
if note_m:
raise ExtractorError(note_m)
@@ -208,18 +286,29 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
if msg.get('type') == 'error':
raise ExtractorError('crunchyroll returned error: %s' % msg['message_body'], expected=True)
- video_title = self._html_search_regex(r'<h1[^>]*>(.+?)</h1>', webpage, 'video_title', flags=re.DOTALL)
+ if 'To view this, please log in to verify you are 18 or older.' in webpage:
+ self.raise_login_required()
+
+ video_title = self._html_search_regex(
+ r'(?s)<h1[^>]*>((?:(?!<h1).)*?<span[^>]+itemprop=["\']title["\'][^>]*>(?:(?!<h1).)+?)</h1>',
+ webpage, 'video_title')
video_title = re.sub(r' {2,}', ' ', video_title)
- video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, 'video_description', default='')
- if not video_description:
- video_description = None
- video_upload_date = self._html_search_regex(r'<div>Availability for free users:(.+?)</div>', webpage, 'video_upload_date', fatal=False, flags=re.DOTALL)
+ video_description = self._html_search_regex(
+ r'<script[^>]*>\s*.+?\[media_id=%s\].+?"description"\s*:\s*"([^"]+)' % video_id,
+ webpage, 'description', default=None)
+ if video_description:
+ video_description = lowercase_escape(video_description.replace(r'\r\n', '\n'))
+ video_upload_date = self._html_search_regex(
+ [r'<div>Availability for free users:(.+?)</div>', r'<div>[^<>]+<span>\s*(.+?\d{4})\s*</span></div>'],
+ webpage, 'video_upload_date', fatal=False, flags=re.DOTALL)
if video_upload_date:
video_upload_date = unified_strdate(video_upload_date)
- video_uploader = self._html_search_regex(r'<div>\s*Publisher:(.+?)</div>', webpage, 'video_uploader', fatal=False, flags=re.DOTALL)
+ video_uploader = self._html_search_regex(
+ r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', webpage,
+ 'video_uploader', fatal=False)
- playerdata_url = compat_urllib_parse.unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url'))
- playerdata_req = compat_urllib_request.Request(playerdata_url)
+ playerdata_url = compat_urllib_parse_unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url'))
+ playerdata_req = sanitized_Request(playerdata_url)
playerdata_req.data = compat_urllib_parse.urlencode({'current_page': webpage_url})
playerdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
playerdata = self._download_webpage(playerdata_req, video_id, note='Downloading media info')
@@ -231,52 +320,46 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
for fmt in re.findall(r'showmedia\.([0-9]{3,4})p', webpage):
stream_quality, stream_format = self._FORMAT_IDS[fmt]
video_format = fmt + 'p'
- streamdata_req = compat_urllib_request.Request('http://www.crunchyroll.com/xml/')
- # urlencode doesn't work!
- streamdata_req.data = 'req=RpcApiVideoEncode%5FGetStreamInfo&video%5Fencode%5Fquality=' + stream_quality + '&media%5Fid=' + stream_id + '&video%5Fformat=' + stream_format
+ streamdata_req = sanitized_Request(
+ 'http://www.crunchyroll.com/xml/?req=RpcApiVideoPlayer_GetStandardConfig&media_id=%s&video_format=%s&video_quality=%s'
+ % (stream_id, stream_format, stream_quality),
+ compat_urllib_parse.urlencode({'current_page': url}).encode('utf-8'))
streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
- streamdata_req.add_header('Content-Length', str(len(streamdata_req.data)))
streamdata = self._download_xml(
streamdata_req, video_id,
note='Downloading media info for %s' % video_format)
- video_url = streamdata.find('.//host').text
- video_play_path = streamdata.find('.//file').text
- formats.append({
+ stream_info = streamdata.find('./{default}preload/stream_info')
+ video_url = stream_info.find('./host').text
+ video_play_path = stream_info.find('./file').text
+ metadata = stream_info.find('./metadata')
+ format_info = {
+ 'format': video_format,
+ 'format_id': video_format,
+ 'height': int_or_none(xpath_text(metadata, './height')),
+ 'width': int_or_none(xpath_text(metadata, './width')),
+ }
+
+ if '.fplive.net/' in video_url:
+ video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip())
+ parsed_video_url = compat_urlparse.urlparse(video_url)
+ direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace(
+ netloc='v.lvlt.crcdn.net',
+ path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_play_path.split(':')[-1])))
+ if self._is_valid_url(direct_video_url, video_id, video_format):
+ format_info.update({
+ 'url': direct_video_url,
+ })
+ formats.append(format_info)
+ continue
+
+ format_info.update({
'url': video_url,
'play_path': video_play_path,
'ext': 'flv',
- 'format': video_format,
- 'format_id': video_format,
})
+ formats.append(format_info)
- subtitles = {}
- sub_format = self._downloader.params.get('subtitlesformat', 'srt')
- for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage):
- sub_page = self._download_webpage(
- 'http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id=' + sub_id,
- video_id, note='Downloading subtitles for ' + sub_name)
- id = self._search_regex(r'id=\'([0-9]+)', sub_page, 'subtitle_id', fatal=False)
- iv = self._search_regex(r'<iv>([^<]+)', sub_page, 'subtitle_iv', fatal=False)
- data = self._search_regex(r'<data>([^<]+)', sub_page, 'subtitle_data', fatal=False)
- if not id or not iv or not data:
- continue
- id = int(id)
- iv = base64.b64decode(iv)
- data = base64.b64decode(data)
-
- subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8')
- lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False)
- if not lang_code:
- continue
- sub_root = xml.etree.ElementTree.fromstring(subtitle)
- if sub_format == 'ass':
- subtitles[lang_code] = self._convert_subtitles_to_ass(sub_root)
- else:
- subtitles[lang_code] = self._convert_subtitles_to_srt(sub_root)
-
- if self._downloader.params.get('listsubtitles', False):
- self._list_available_subtitles(video_id, subtitles)
- return
+ subtitles = self.extract_subtitles(video_id, webpage)
return {
'id': video_id,
@@ -290,9 +373,9 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
}
-class CrunchyrollShowPlaylistIE(InfoExtractor):
+class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE):
IE_NAME = "crunchyroll:playlist"
- _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P<id>[\w\-]+))/?$'
+ _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P<id>[\w\-]+))/?(?:\?|$)'
_TESTS = [{
'url': 'http://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi',
@@ -301,12 +384,25 @@ class CrunchyrollShowPlaylistIE(InfoExtractor):
'title': 'A Bridge to the Starry Skies - Hoshizora e Kakaru Hashi'
},
'playlist_count': 13,
+ }, {
+ # geo-restricted (US), 18+ maturity wall, non-premium available
+ 'url': 'http://www.crunchyroll.com/cosplay-complex-ova',
+ 'info_dict': {
+ 'id': 'cosplay-complex-ova',
+ 'title': 'Cosplay Complex OVA'
+ },
+ 'playlist_count': 3,
+ 'skip': 'Georestricted',
+ }, {
+ # geo-restricted (US), 18+ maturity wall, non-premium will be available since 2015.11.14
+ 'url': 'http://www.crunchyroll.com/ladies-versus-butlers?skip_wall=1',
+ 'only_matching': True,
}]
def _real_extract(self, url):
show_id = self._match_id(url)
- webpage = self._download_webpage(url, show_id)
+ webpage = self._download_webpage(self._add_skip_wall(url), show_id)
title = self._html_search_regex(
r'(?s)<h1[^>]*>\s*<span itemprop="name">(.*?)</span>',
webpage, 'title')
diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py
index 955119d40..7b685d157 100644
--- a/youtube_dl/extractor/cspan.py
+++ b/youtube_dl/extractor/cspan.py
@@ -7,7 +7,11 @@ from ..utils import (
int_or_none,
unescapeHTML,
find_xpath_attr,
+ smuggle_url,
+ determine_ext,
+ ExtractorError,
)
+from .senateisvp import SenateISVPIE
class CSpanIE(InfoExtractor):
@@ -15,75 +19,115 @@ class CSpanIE(InfoExtractor):
IE_DESC = 'C-SPAN'
_TESTS = [{
'url': 'http://www.c-span.org/video/?313572-1/HolderonV',
- 'md5': '8e44ce11f0f725527daccc453f553eb0',
+ 'md5': '94b29a4f131ff03d23471dd6f60b6a1d',
'info_dict': {
'id': '315139',
'ext': 'mp4',
'title': 'Attorney General Eric Holder on Voting Rights Act Decision',
- 'description': 'Attorney General Eric Holder spoke to reporters following the Supreme Court decision in Shelby County v. Holder in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced until Congress established new guidelines for review.',
+ 'description': 'Attorney General Eric Holder speaks to reporters following the Supreme Court decision in [Shelby County v. Holder], in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced.',
},
'skip': 'Regularly fails on travis, for unknown reasons',
}, {
'url': 'http://www.c-span.org/video/?c4486943/cspan-international-health-care-models',
- # For whatever reason, the served video alternates between
- # two different ones
+ 'md5': '8e5fbfabe6ad0f89f3012a7943c1287b',
'info_dict': {
- 'id': '340723',
+ 'id': 'c4486943',
'ext': 'mp4',
- 'title': 'International Health Care Models',
+ 'title': 'CSPAN - International Health Care Models',
'description': 'md5:7a985a2d595dba00af3d9c9f0783c967',
}
}, {
'url': 'http://www.c-span.org/video/?318608-1/gm-ignition-switch-recall',
+ 'md5': '2ae5051559169baadba13fc35345ae74',
'info_dict': {
'id': '342759',
+ 'ext': 'mp4',
'title': 'General Motors Ignition Switch Recall',
+ 'duration': 14848,
+ 'description': 'md5:118081aedd24bf1d3b68b3803344e7f3'
},
- 'playlist_duration_sum': 14855,
+ }, {
+ # Video from senate.gov
+ 'url': 'http://www.c-span.org/video/?104517-1/immigration-reforms-needed-protect-skilled-american-workers',
+ 'info_dict': {
+ 'id': 'judiciary031715',
+ 'ext': 'flv',
+ 'title': 'Immigration Reforms Needed to Protect Skilled American Workers',
+ }
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- page_id = mobj.group('id')
- webpage = self._download_webpage(url, page_id)
- video_id = self._search_regex(r'progid=\'?([0-9]+)\'?>', webpage, 'video id')
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ matches = re.search(r'data-(prog|clip)id=\'([0-9]+)\'', webpage)
+ if matches:
+ video_type, video_id = matches.groups()
+ if video_type == 'prog':
+ video_type = 'program'
+ else:
+ senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
+ if senate_isvp_url:
+ title = self._og_search_title(webpage)
+ surl = smuggle_url(senate_isvp_url, {'force_title': title})
+ return self.url_result(surl, 'SenateISVP', video_id, title)
- description = self._html_search_regex(
- [
- # The full description
- r'<div class=\'expandable\'>(.*?)<a href=\'#\'',
- # If the description is small enough the other div is not
- # present, otherwise this is a stripped version
- r'<p class=\'initial\'>(.*?)</p>'
- ],
- webpage, 'description', flags=re.DOTALL)
+ def get_text_attr(d, attr):
+ return d.get(attr, {}).get('#text')
- info_url = 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=program&id=' + video_id
- data = self._download_json(info_url, video_id)
+ data = self._download_json(
+ 'http://www.c-span.org/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id),
+ video_id)['video']
+ if data['@status'] != 'Success':
+ raise ExtractorError('%s said: %s' % (self.IE_NAME, get_text_attr(data, 'error')), expected=True)
doc = self._download_xml(
- 'http://www.c-span.org/common/services/flashXml.php?programid=' + video_id,
+ 'http://www.c-span.org/common/services/flashXml.php?%sid=%s' % (video_type, video_id),
video_id)
+ description = self._html_search_meta('description', webpage)
+
title = find_xpath_attr(doc, './/string', 'name', 'title').text
thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text
- files = data['video']['files']
+ files = data['files']
+ capfile = get_text_attr(data, 'capfile')
- entries = [{
- 'id': '%s_%d' % (video_id, partnum + 1),
- 'title': (
- title if len(files) == 1 else
- '%s part %d' % (title, partnum + 1)),
- 'url': unescapeHTML(f['path']['#text']),
- 'description': description,
- 'thumbnail': thumbnail,
- 'duration': int_or_none(f.get('length', {}).get('#text')),
- } for partnum, f in enumerate(files)]
+ entries = []
+ for partnum, f in enumerate(files):
+ formats = []
+ for quality in f['qualities']:
+ formats.append({
+ 'format_id': '%s-%sp' % (get_text_attr(quality, 'bitrate'), get_text_attr(quality, 'height')),
+ 'url': unescapeHTML(get_text_attr(quality, 'file')),
+ 'height': int_or_none(get_text_attr(quality, 'height')),
+ 'tbr': int_or_none(get_text_attr(quality, 'bitrate')),
+ })
+ self._sort_formats(formats)
+ entries.append({
+ 'id': '%s_%d' % (video_id, partnum + 1),
+ 'title': (
+ title if len(files) == 1 else
+ '%s part %d' % (title, partnum + 1)),
+ 'formats': formats,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': int_or_none(get_text_attr(f, 'length')),
+ 'subtitles': {
+ 'en': [{
+ 'url': capfile,
+ 'ext': determine_ext(capfile, 'dfxp')
+ }],
+ } if capfile else None,
+ })
- return {
- '_type': 'playlist',
- 'entries': entries,
- 'title': title,
- 'id': video_id,
- }
+ if len(entries) == 1:
+ entry = dict(entries[0])
+ entry['id'] = 'c' + video_id if video_type == 'clip' else video_id
+ return entry
+ else:
+ return {
+ '_type': 'playlist',
+ 'entries': entries,
+ 'title': title,
+ 'id': 'c' + video_id if video_type == 'clip' else video_id,
+ }
diff --git a/youtube_dl/extractor/ctsnews.py b/youtube_dl/extractor/ctsnews.py
index 0226f8036..45049bf37 100644
--- a/youtube_dl/extractor/ctsnews.py
+++ b/youtube_dl/extractor/ctsnews.py
@@ -6,6 +6,7 @@ from ..utils import parse_iso8601, ExtractorError
class CtsNewsIE(InfoExtractor):
+ IE_DESC = '華視新聞'
# https connection failed (Connection reset)
_VALID_URL = r'http://news\.cts\.com\.tw/[a-z]+/[a-z]+/\d+/(?P<id>\d+)\.html'
_TESTS = [{
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index cf5841a7c..ab7f3aec4 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -6,16 +6,14 @@ import json
import itertools
from .common import InfoExtractor
-from .subtitles import SubtitlesInfoExtractor
-from ..compat import (
- compat_str,
- compat_urllib_request,
-)
+from ..compat import compat_str
from ..utils import (
ExtractorError,
+ determine_ext,
int_or_none,
- orderedSet,
+ parse_iso8601,
+ sanitized_Request,
str_to_int,
unescapeHTML,
)
@@ -25,15 +23,20 @@ class DailymotionBaseInfoExtractor(InfoExtractor):
@staticmethod
def _build_request(url):
"""Build a request with the family filter disabled"""
- request = compat_urllib_request.Request(url)
- request.add_header('Cookie', 'family_filter=off')
- request.add_header('Cookie', 'ff=off')
+ request = sanitized_Request(url)
+ request.add_header('Cookie', 'family_filter=off; ff=off')
return request
+ def _download_webpage_handle_no_ff(self, url, *args, **kwargs):
+ request = self._build_request(url)
+ return self._download_webpage_handle(request, *args, **kwargs)
+
+ def _download_webpage_no_ff(self, url, *args, **kwargs):
+ request = self._build_request(url)
+ return self._download_webpage(request, *args, **kwargs)
-class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
- """Information Extractor for Dailymotion"""
+class DailymotionIE(DailymotionBaseInfoExtractor):
_VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(embed|#)/)?video/(?P<id>[^/?_]+)'
IE_NAME = 'dailymotion'
@@ -47,13 +50,22 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
_TESTS = [
{
- 'url': 'http://www.dailymotion.com/video/x33vw9_tutoriel-de-youtubeur-dl-des-video_tech',
- 'md5': '392c4b85a60a90dc4792da41ce3144eb',
+ 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames',
+ 'md5': '2137c41a8e78554bb09225b8eb322406',
'info_dict': {
- 'id': 'x33vw9',
+ 'id': 'x2iuewm',
'ext': 'mp4',
- 'uploader': 'Amphora Alex and Van .',
- 'title': 'Tutoriel de Youtubeur"DL DES VIDEO DE YOUTUBE"',
+ 'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News',
+ 'description': 'Several come bundled with the Steam Controller.',
+ 'thumbnail': 're:^https?:.*\.(?:jpg|png)$',
+ 'duration': 74,
+ 'timestamp': 1425657362,
+ 'upload_date': '20150306',
+ 'uploader': 'IGN',
+ 'uploader_id': 'xijv66',
+ 'age_limit': 0,
+ 'view_count': int,
+ 'comment_count': int,
}
},
# Vevo video
@@ -82,45 +94,128 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
'uploader': 'HotWaves1012',
'age_limit': 18,
}
+ },
+ # geo-restricted, player v5
+ {
+ 'url': 'http://www.dailymotion.com/video/xhza0o',
+ 'only_matching': True,
}
]
def _real_extract(self, url):
video_id = self._match_id(url)
- url = 'http://www.dailymotion.com/video/%s' % video_id
- # Retrieve video webpage to extract further information
- request = self._build_request(url)
- webpage = self._download_webpage(request, video_id)
+ webpage = self._download_webpage_no_ff(
+ 'https://www.dailymotion.com/video/%s' % video_id, video_id)
+
+ age_limit = self._rta_search(webpage)
- # Extract URL, uploader and title from webpage
- self.report_extraction(video_id)
+ description = self._og_search_description(webpage) or self._html_search_meta(
+ 'description', webpage, 'description')
+
+ view_count = str_to_int(self._search_regex(
+ [r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:(\d+)"',
+ r'video_views_count[^>]+>\s+([\d\.,]+)'],
+ webpage, 'view count', fatal=False))
+ comment_count = int_or_none(self._search_regex(
+ r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserComments:(\d+)"',
+ webpage, 'comment count', fatal=False))
+
+ player_v5 = self._search_regex(
+ [r'buildPlayer\(({.+?})\);', r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);'],
+ webpage, 'player v5', default=None)
+ if player_v5:
+ player = self._parse_json(player_v5, video_id)
+ metadata = player['metadata']
+
+ self._check_error(metadata)
+
+ formats = []
+ for quality, media_list in metadata['qualities'].items():
+ for media in media_list:
+ media_url = media.get('url')
+ if not media_url:
+ continue
+ type_ = media.get('type')
+ if type_ == 'application/vnd.lumberjack.manifest':
+ continue
+ ext = determine_ext(media_url)
+ if type_ == 'application/x-mpegURL' or ext == 'm3u8':
+ m3u8_formats = self._extract_m3u8_formats(
+ media_url, video_id, 'mp4', m3u8_id='hls', fatal=False)
+ if m3u8_formats:
+ formats.extend(m3u8_formats)
+ elif type_ == 'application/f4m' or ext == 'f4m':
+ f4m_formats = self._extract_f4m_formats(
+ media_url, video_id, preference=-1, f4m_id='hds', fatal=False)
+ if f4m_formats:
+ formats.extend(f4m_formats)
+ else:
+ f = {
+ 'url': media_url,
+ 'format_id': quality,
+ }
+ m = re.search(r'H264-(?P<width>\d+)x(?P<height>\d+)', media_url)
+ if m:
+ f.update({
+ 'width': int(m.group('width')),
+ 'height': int(m.group('height')),
+ })
+ formats.append(f)
+ self._sort_formats(formats)
+
+ title = metadata['title']
+ duration = int_or_none(metadata.get('duration'))
+ timestamp = int_or_none(metadata.get('created_time'))
+ thumbnail = metadata.get('poster_url')
+ uploader = metadata.get('owner', {}).get('screenname')
+ uploader_id = metadata.get('owner', {}).get('id')
+
+ subtitles = {}
+ for subtitle_lang, subtitle in metadata.get('subtitles', {}).get('data', {}).items():
+ subtitles[subtitle_lang] = [{
+ 'ext': determine_ext(subtitle_url),
+ 'url': subtitle_url,
+ } for subtitle_url in subtitle.get('urls', [])]
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'age_limit': age_limit,
+ 'view_count': view_count,
+ 'comment_count': comment_count,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
- # It may just embed a vevo video:
- m_vevo = re.search(
+ # vevo embed
+ vevo_id = self._search_regex(
r'<link rel="video_src" href="[^"]*?vevo.com[^"]*?video=(?P<id>[\w]*)',
- webpage)
- if m_vevo is not None:
- vevo_id = m_vevo.group('id')
- self.to_screen('Vevo video detected: %s' % vevo_id)
- return self.url_result('vevo:%s' % vevo_id, ie='Vevo')
+ webpage, 'vevo embed', default=None)
+ if vevo_id:
+ return self.url_result('vevo:%s' % vevo_id, 'Vevo')
- age_limit = self._rta_search(webpage)
+ # fallback old player
+ embed_page = self._download_webpage_no_ff(
+ 'https://www.dailymotion.com/embed/video/%s' % video_id,
+ video_id, 'Downloading embed page')
- video_upload_date = None
- mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
- if mobj is not None:
- video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
-
- embed_url = 'http://www.dailymotion.com/embed/video/%s' % video_id
- embed_page = self._download_webpage(embed_url, video_id,
- 'Downloading embed page')
- info = self._search_regex(r'var info = ({.*?}),$', embed_page,
- 'video info', flags=re.MULTILINE)
- info = json.loads(info)
- if info.get('error') is not None:
- msg = 'Couldn\'t get video, Dailymotion says: %s' % info['error']['title']
- raise ExtractorError(msg, expected=True)
+ timestamp = parse_iso8601(self._html_search_meta(
+ 'video:release_date', webpage, 'upload date'))
+
+ info = self._parse_json(
+ self._search_regex(
+ r'var info = ({.*?}),$', embed_page,
+ 'video info', flags=re.MULTILINE),
+ video_id)
+
+ self._check_error(info)
formats = []
for (key, format_id) in self._FORMATS:
@@ -138,18 +233,10 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
'width': width,
'height': height,
})
- if not formats:
- raise ExtractorError('Unable to extract video URL')
+ self._sort_formats(formats)
# subtitles
video_subtitles = self.extract_subtitles(video_id, webpage)
- if self._downloader.params.get('listsubtitles', False):
- self._list_available_subtitles(video_id, webpage)
- return
-
- view_count = str_to_int(self._search_regex(
- r'video_views_count[^>]+>\s+([\d\.,]+)',
- webpage, 'view count', fatal=False))
title = self._og_search_title(webpage, default=None)
if title is None:
@@ -161,15 +248,22 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
'id': video_id,
'formats': formats,
'uploader': info['owner.screenname'],
- 'upload_date': video_upload_date,
+ 'timestamp': timestamp,
'title': title,
+ 'description': description,
'subtitles': video_subtitles,
'thumbnail': info['thumbnail_url'],
'age_limit': age_limit,
'view_count': view_count,
+ 'duration': info['duration']
}
- def _get_available_subtitles(self, video_id, webpage):
+ def _check_error(self, info):
+ if info.get('error') is not None:
+ raise ExtractorError(
+ '%s said: %s' % (self.IE_NAME, info['error']['title']), expected=True)
+
+ def _get_subtitles(self, video_id, webpage):
try:
sub_list = self._download_webpage(
'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id,
@@ -179,7 +273,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
return {}
info = json.loads(sub_list)
if (info['total'] > 0):
- sub_lang_list = dict((l['language'], l['url']) for l in info['list'])
+ sub_lang_list = dict((l['language'], [{'url': l['url'], 'ext': 'srt'}]) for l in info['list'])
return sub_lang_list
self._downloader.report_warning('video doesn\'t have subtitles')
return {}
@@ -194,23 +288,32 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
'url': 'http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q',
'info_dict': {
'title': 'SPORT',
+ 'id': 'xv4bw_nqtv_sport',
},
'playlist_mincount': 20,
}]
def _extract_entries(self, id):
- video_ids = []
+ video_ids = set()
+ processed_urls = set()
for pagenum in itertools.count(1):
- request = self._build_request(self._PAGE_TEMPLATE % (id, pagenum))
- webpage = self._download_webpage(request,
- id, 'Downloading page %s' % pagenum)
+ page_url = self._PAGE_TEMPLATE % (id, pagenum)
+ webpage, urlh = self._download_webpage_handle_no_ff(
+ page_url, id, 'Downloading page %s' % pagenum)
+ if urlh.geturl() in processed_urls:
+ self.report_warning('Stopped at duplicated page %s, which is the same as %s' % (
+ page_url, urlh.geturl()), id)
+ break
- video_ids.extend(re.findall(r'data-xid="(.+?)"', webpage))
+ processed_urls.add(urlh.geturl())
+
+ for video_id in re.findall(r'data-xid="(.+?)"', webpage):
+ if video_id not in video_ids:
+ yield self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion')
+ video_ids.add(video_id)
if re.search(self._MORE_PAGES_INDICATOR, webpage) is None:
break
- return [self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion')
- for video_id in orderedSet(video_ids)]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -227,7 +330,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
class DailymotionUserIE(DailymotionPlaylistIE):
IE_NAME = 'dailymotion:user'
- _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/user/(?P<user>[^/]+)'
+ _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)'
_PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s'
_TESTS = [{
'url': 'https://www.dailymotion.com/user/nqtv',
@@ -236,12 +339,24 @@ class DailymotionUserIE(DailymotionPlaylistIE):
'title': 'Rémi Gaillard',
},
'playlist_mincount': 100,
+ }, {
+ 'url': 'http://www.dailymotion.com/user/UnderProject',
+ 'info_dict': {
+ 'id': 'UnderProject',
+ 'title': 'UnderProject',
+ },
+ 'playlist_mincount': 1800,
+ 'expected_warnings': [
+ 'Stopped at duplicated page',
+ ],
+ 'skip': 'Takes too long time',
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
user = mobj.group('user')
- webpage = self._download_webpage(url, user)
+ webpage = self._download_webpage(
+ 'https://www.dailymotion.com/user/%s' % user, user)
full_user = unescapeHTML(self._html_search_regex(
r'<a class="nav-image" title="([^"]+)" href="/%s">' % re.escape(user),
webpage, 'user'))
@@ -252,3 +367,52 @@ class DailymotionUserIE(DailymotionPlaylistIE):
'title': full_user,
'entries': self._extract_entries(user),
}
+
+
+class DailymotionCloudIE(DailymotionBaseInfoExtractor):
+ _VALID_URL_PREFIX = r'http://api\.dmcloud\.net/(?:player/)?embed/'
+ _VALID_URL = r'%s[^/]+/(?P<id>[^/?]+)' % _VALID_URL_PREFIX
+ _VALID_EMBED_URL = r'%s[^/]+/[^\'"]+' % _VALID_URL_PREFIX
+
+ _TESTS = [{
+ # From http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html
+ # Tested at FranceTvInfo_2
+ 'url': 'http://api.dmcloud.net/embed/4e7343f894a6f677b10006b4/556e03339473995ee145930c?auth=1464865870-0-jyhsm84b-ead4c701fb750cf9367bf4447167a3db&autoplay=1',
+ 'only_matching': True,
+ }, {
+ # http://www.francetvinfo.fr/societe/larguez-les-amarres-le-cobaturage-se-developpe_980101.html
+ 'url': 'http://api.dmcloud.net/player/embed/4e7343f894a6f677b10006b4/559545469473996d31429f06?auth=1467430263-0-90tglw2l-a3a4b64ed41efe48d7fccad85b8b8fda&autoplay=1',
+ 'only_matching': True,
+ }]
+
+ @classmethod
+ def _extract_dmcloud_url(self, webpage):
+ mobj = re.search(r'<iframe[^>]+src=[\'"](%s)[\'"]' % self._VALID_EMBED_URL, webpage)
+ if mobj:
+ return mobj.group(1)
+
+ mobj = re.search(
+ r'<input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=[\'"](%s)[\'"]' % self._VALID_EMBED_URL,
+ webpage)
+ if mobj:
+ return mobj.group(1)
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage_no_ff(url, video_id)
+
+ title = self._html_search_regex(r'<title>([^>]+)</title>', webpage, 'title')
+
+ video_info = self._parse_json(self._search_regex(
+ r'var\s+info\s*=\s*([^;]+);', webpage, 'video info'), video_id)
+
+ # TODO: parse ios_url, which is in fact a manifest
+ video_url = video_info['mp4_url']
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': title,
+ 'thumbnail': video_info.get('thumbnail_url'),
+ }
diff --git a/youtube_dl/extractor/dbtv.py b/youtube_dl/extractor/dbtv.py
index 212217625..133cdc50b 100644
--- a/youtube_dl/extractor/dbtv.py
+++ b/youtube_dl/extractor/dbtv.py
@@ -13,8 +13,8 @@ from ..utils import (
class DBTVIE(InfoExtractor):
- _VALID_URL = r'http://dbtv\.no/(?P<id>[0-9]+)#(?P<display_id>.+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?dbtv\.no/(?:(?:lazyplayer|player)/)?(?P<id>[0-9]+)(?:#(?P<display_id>.+))?'
+ _TESTS = [{
'url': 'http://dbtv.no/3649835190001#Skulle_teste_ut_fornøyelsespark,_men_kollegaen_var_bare_opptatt_av_bikinikroppen',
'md5': 'b89953ed25dacb6edb3ef6c6f430f8bc',
'info_dict': {
@@ -30,12 +30,18 @@ class DBTVIE(InfoExtractor):
'view_count': int,
'categories': list,
}
- }
+ }, {
+ 'url': 'http://dbtv.no/3649835190001',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.dbtv.no/lazyplayer/4631135248001',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
- display_id = mobj.group('display_id')
+ display_id = mobj.group('display_id') or video_id
data = self._download_json(
'http://api.dbtv.no/discovery/%s' % video_id, display_id)
diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py
new file mode 100644
index 000000000..9737cff14
--- /dev/null
+++ b/youtube_dl/extractor/dcn.py
@@ -0,0 +1,82 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse
+from ..utils import (
+ int_or_none,
+ parse_iso8601,
+ sanitized_Request,
+)
+
+
+class DCNIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/.+|show/\d+/.+?)/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://www.dcndigital.ae/#/show/199074/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375/6887',
+ 'info_dict':
+ {
+ 'id': '17375',
+ 'ext': 'mp4',
+ 'title': 'رحلة العمر : الحلقة 1',
+ 'description': 'md5:0156e935d870acb8ef0a66d24070c6d6',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 2041,
+ 'timestamp': 1227504126,
+ 'upload_date': '20081124',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ request = sanitized_Request(
+ 'http://admin.mangomolo.com/analytics/index.php/plus/video?id=%s' % video_id,
+ headers={'Origin': 'http://www.dcndigital.ae'})
+
+ video = self._download_json(request, video_id)
+ title = video.get('title_en') or video['title_ar']
+
+ webpage = self._download_webpage(
+ 'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?' +
+ compat_urllib_parse.urlencode({
+ 'id': video['id'],
+ 'user_id': video['user_id'],
+ 'signature': video['signature'],
+ 'countries': 'Q0M=',
+ 'filter': 'DENY',
+ }), video_id)
+
+ m3u8_url = self._html_search_regex(r'file:\s*"([^"]+)', webpage, 'm3u8 url')
+ formats = self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')
+
+ rtsp_url = self._search_regex(
+ r'<a[^>]+href="(rtsp://[^"]+)"', webpage, 'rtsp url', fatal=False)
+ if rtsp_url:
+ formats.append({
+ 'url': rtsp_url,
+ 'format_id': 'rtsp',
+ })
+
+ self._sort_formats(formats)
+
+ img = video.get('img')
+ thumbnail = 'http://admin.mangomolo.com/analytics/%s' % img if img else None
+ duration = int_or_none(video.get('duration'))
+ description = video.get('description_en') or video.get('description_ar')
+ timestamp = parse_iso8601(video.get('create_time') or video.get('update_time'), ' ')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/defense.py b/youtube_dl/extractor/defense.py
index 2b90bf4fc..98e3aedfd 100644
--- a/youtube_dl/extractor/defense.py
+++ b/youtube_dl/extractor/defense.py
@@ -25,8 +25,9 @@ class DefenseGouvFrIE(InfoExtractor):
r"flashvars.pvg_id=\"(\d+)\";",
webpage, 'ID')
- json_url = ('http://static.videos.gouv.fr/brightcovehub/export/json/'
- + video_id)
+ json_url = (
+ 'http://static.videos.gouv.fr/brightcovehub/export/json/%s' %
+ video_id)
info = self._download_json(json_url, title, 'Downloading JSON config')
video_url = info['renditions'][0]['url']
diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py
new file mode 100644
index 000000000..6cd395e11
--- /dev/null
+++ b/youtube_dl/extractor/democracynow.py
@@ -0,0 +1,88 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import os.path
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+ url_basename,
+ remove_start,
+)
+
+
+class DemocracynowIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?democracynow.org/(?P<id>[^\?]*)'
+ IE_NAME = 'democracynow'
+ _TESTS = [{
+ 'url': 'http://www.democracynow.org/shows/2015/7/3',
+ 'md5': 'fbb8fe3d7a56a5e12431ce2f9b2fab0d',
+ 'info_dict': {
+ 'id': '2015-0703-001',
+ 'ext': 'mp4',
+ 'title': 'July 03, 2015 - Democracy Now!',
+ 'description': 'A daily independent global news hour with Amy Goodman & Juan González "What to the Slave is 4th of July?": James Earl Jones Reads Frederick Douglass\u2019 Historic Speech : "This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag : "We Shall Overcome": Remembering Folk Icon, Activist Pete Seeger in His Own Words & Songs',
+ },
+ }, {
+ 'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree',
+ 'md5': 'fbb8fe3d7a56a5e12431ce2f9b2fab0d',
+ 'info_dict': {
+ 'id': '2015-0703-001',
+ 'ext': 'mp4',
+ 'title': '"This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag',
+ 'description': 'md5:4d2bc4f0d29f5553c2210a4bc7761a21',
+ },
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ description = self._og_search_description(webpage)
+
+ json_data = self._parse_json(self._search_regex(
+ r'<script[^>]+type="text/json"[^>]*>\s*({[^>]+})', webpage, 'json'),
+ display_id)
+ video_id = None
+ formats = []
+
+ default_lang = 'en'
+
+ subtitles = {}
+
+ def add_subtitle_item(lang, info_dict):
+ if lang not in subtitles:
+ subtitles[lang] = []
+ subtitles[lang].append(info_dict)
+
+ # chapter_file are not subtitles
+ if 'caption_file' in json_data:
+ add_subtitle_item(default_lang, {
+ 'url': compat_urlparse.urljoin(url, json_data['caption_file']),
+ })
+
+ for subtitle_item in json_data.get('captions', []):
+ lang = subtitle_item.get('language', '').lower() or default_lang
+ add_subtitle_item(lang, {
+ 'url': compat_urlparse.urljoin(url, subtitle_item['url']),
+ })
+
+ for key in ('file', 'audio', 'video'):
+ media_url = json_data.get(key, '')
+ if not media_url:
+ continue
+ media_url = re.sub(r'\?.*', '', compat_urlparse.urljoin(url, media_url))
+ video_id = video_id or remove_start(os.path.splitext(url_basename(media_url))[0], 'dn')
+ formats.append({
+ 'url': media_url,
+ })
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id or display_id,
+ 'title': json_data['title'],
+ 'description': description,
+ 'subtitles': subtitles,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/dfb.py b/youtube_dl/extractor/dfb.py
index 8049779b0..263532cc6 100644
--- a/youtube_dl/extractor/dfb.py
+++ b/youtube_dl/extractor/dfb.py
@@ -3,42 +3,47 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..utils import unified_strdate
class DFBIE(InfoExtractor):
IE_NAME = 'tv.dfb.de'
- _VALID_URL = r'https?://tv\.dfb\.de/video/[^/]+/(?P<id>\d+)'
+ _VALID_URL = r'https?://tv\.dfb\.de/video/(?P<display_id>[^/]+)/(?P<id>\d+)'
_TEST = {
- 'url': 'http://tv.dfb.de/video/highlights-des-empfangs-in-berlin/9070/',
+ 'url': 'http://tv.dfb.de/video/u-19-em-stimmen-zum-spiel-gegen-russland/11633/',
# The md5 is different each time
'info_dict': {
- 'id': '9070',
+ 'id': '11633',
+ 'display_id': 'u-19-em-stimmen-zum-spiel-gegen-russland',
'ext': 'flv',
- 'title': 'Highlights des Empfangs in Berlin',
- 'upload_date': '20140716',
+ 'title': 'U 19-EM: Stimmen zum Spiel gegen Russland',
+ 'upload_date': '20150714',
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
+ display_id = mobj.group('display_id')
- webpage = self._download_webpage(url, video_id)
+ webpage = self._download_webpage(url, display_id)
player_info = self._download_xml(
'http://tv.dfb.de/server/hd_video.php?play=%s' % video_id,
- video_id)
+ display_id)
video_info = player_info.find('video')
- f4m_info = self._download_xml(self._proto_relative_url(video_info.find('url').text.strip()), video_id)
+ f4m_info = self._download_xml(
+ self._proto_relative_url(video_info.find('url').text.strip()), display_id)
token_el = f4m_info.find('token')
manifest_url = token_el.attrib['url'] + '?' + 'hdnea=' + token_el.attrib['auth'] + '&hdcore=3.2.0'
+ formats = self._extract_f4m_formats(manifest_url, display_id)
return {
'id': video_id,
+ 'display_id': display_id,
'title': video_info.find('title').text,
- 'url': manifest_url,
- 'ext': 'flv',
'thumbnail': self._og_search_thumbnail(webpage),
- 'upload_date': ''.join(video_info.find('time_date').text.split('.')[::-1]),
+ 'upload_date': unified_strdate(video_info.find('time_date').text),
+ 'formats': formats,
}
diff --git a/youtube_dl/extractor/dhm.py b/youtube_dl/extractor/dhm.py
new file mode 100644
index 000000000..44e0c5d4d
--- /dev/null
+++ b/youtube_dl/extractor/dhm.py
@@ -0,0 +1,59 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import parse_duration
+
+
+class DHMIE(InfoExtractor):
+ IE_DESC = 'Filmarchiv - Deutsches Historisches Museum'
+ _VALID_URL = r'https?://(?:www\.)?dhm\.de/filmarchiv/(?:[^/]+/)+(?P<id>[^/]+)'
+
+ _TESTS = [{
+ 'url': 'http://www.dhm.de/filmarchiv/die-filme/the-marshallplan-at-work-in-west-germany/',
+ 'md5': '11c475f670209bf6acca0b2b7ef51827',
+ 'info_dict': {
+ 'id': 'the-marshallplan-at-work-in-west-germany',
+ 'ext': 'flv',
+ 'title': 'MARSHALL PLAN AT WORK IN WESTERN GERMANY, THE',
+ 'description': 'md5:1fabd480c153f97b07add61c44407c82',
+ 'duration': 660,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ }, {
+ 'url': 'http://www.dhm.de/filmarchiv/02-mapping-the-wall/peter-g/rolle-1/',
+ 'md5': '09890226332476a3e3f6f2cb74734aa5',
+ 'info_dict': {
+ 'id': 'rolle-1',
+ 'ext': 'flv',
+ 'title': 'ROLLE 1',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ }]
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, playlist_id)
+
+ playlist_url = self._search_regex(
+ r"file\s*:\s*'([^']+)'", webpage, 'playlist url')
+
+ entries = self._extract_xspf_playlist(playlist_url, playlist_id)
+
+ title = self._search_regex(
+ [r'dc:title="([^"]+)"', r'<title> &raquo;([^<]+)</title>'],
+ webpage, 'title').strip()
+ description = self._html_search_regex(
+ r'<p><strong>Description:</strong>(.+?)</p>',
+ webpage, 'description', default=None)
+ duration = parse_duration(self._search_regex(
+ r'<em>Length\s*</em>\s*:\s*</strong>([^<]+)',
+ webpage, 'duration', default=None))
+
+ entries[0].update({
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ })
+
+ return self.playlist_result(entries, playlist_id)
diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py
index d3e667528..d6723ecf2 100644
--- a/youtube_dl/extractor/discovery.py
+++ b/youtube_dl/extractor/discovery.py
@@ -2,19 +2,19 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
+ parse_duration,
parse_iso8601,
- int_or_none,
)
+from ..compat import compat_str
class DiscoveryIE(InfoExtractor):
_VALID_URL = r'http://www\.discovery\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9_\-]*)(?:\.htm)?'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mission-impossible-outtakes.htm',
- 'md5': '3c69d77d9b0d82bfd5e5932a60f26504',
'info_dict': {
- 'id': 'mission-impossible-outtakes',
- 'ext': 'flv',
+ 'id': '20769',
+ 'ext': 'mp4',
'title': 'Mission Impossible Outtakes',
'description': ('Watch Jamie Hyneman and Adam Savage practice being'
' each other -- to the point of confusing Jamie\'s dog -- and '
@@ -24,22 +24,36 @@ class DiscoveryIE(InfoExtractor):
'timestamp': 1303099200,
'upload_date': '20110418',
},
- }
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ }
+ }, {
+ 'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mythbusters-the-simpsons',
+ 'info_dict': {
+ 'id': 'mythbusters-the-simpsons',
+ 'title': 'MythBusters: The Simpsons',
+ },
+ 'playlist_count': 9,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
+ info = self._download_json(url + '?flat=1', video_id)
- info = self._parse_json(self._search_regex(
- r'(?s)<script type="application/ld\+json">(.*?)</script>',
- webpage, 'video info'), video_id)
+ video_title = info.get('playlist_title') or info.get('video_title')
- return {
- 'id': video_id,
- 'title': info['name'],
- 'url': info['contentURL'],
- 'description': info.get('description'),
- 'thumbnail': info.get('thumbnailUrl'),
- 'timestamp': parse_iso8601(info.get('uploadDate')),
- 'duration': int_or_none(info.get('duration')),
- }
+ entries = [{
+ 'id': compat_str(video_info['id']),
+ 'formats': self._extract_m3u8_formats(
+ video_info['src'], video_id, ext='mp4',
+ note='Download m3u8 information for video %d' % (idx + 1)),
+ 'title': video_info['title'],
+ 'description': video_info.get('description'),
+ 'duration': parse_duration(video_info.get('video_length')),
+ 'webpage_url': video_info.get('href'),
+ 'thumbnail': video_info.get('thumbnailURL'),
+ 'alt_title': video_info.get('secondary_title'),
+ 'timestamp': parse_iso8601(video_info.get('publishedDate')),
+ } for idx, video_info in enumerate(info['playlist'])]
+
+ return self.playlist_result(entries, video_id, video_title)
diff --git a/youtube_dl/extractor/divxstage.py b/youtube_dl/extractor/divxstage.py
deleted file mode 100644
index b88379e06..000000000
--- a/youtube_dl/extractor/divxstage.py
+++ /dev/null
@@ -1,27 +0,0 @@
-from __future__ import unicode_literals
-
-from .novamov import NovaMovIE
-
-
-class DivxStageIE(NovaMovIE):
- IE_NAME = 'divxstage'
- IE_DESC = 'DivxStage'
-
- _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'divxstage\.(?:eu|net|ch|co|at|ag|to)'}
-
- _HOST = 'www.divxstage.eu'
-
- _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<'
- _TITLE_REGEX = r'<div class="video_det">\s*<strong>([^<]+)</strong>'
- _DESCRIPTION_REGEX = r'<div class="video_det">\s*<strong>[^<]+</strong>\s*<p>([^<]+)</p>'
-
- _TEST = {
- 'url': 'http://www.divxstage.eu/video/57f238e2e5e01',
- 'md5': '63969f6eb26533a1968c4d325be63e72',
- 'info_dict': {
- 'id': '57f238e2e5e01',
- 'ext': 'flv',
- 'title': 'youtubedl test video',
- 'description': 'This is a test video for youtubedl.',
- }
- }
diff --git a/youtube_dl/extractor/dotsub.py b/youtube_dl/extractor/dotsub.py
index f51d88a98..e9ca236d4 100644
--- a/youtube_dl/extractor/dotsub.py
+++ b/youtube_dl/extractor/dotsub.py
@@ -36,7 +36,8 @@ class DotsubIE(InfoExtractor):
if not video_url:
webpage = self._download_webpage(url, video_id)
video_url = self._search_regex(
- r'"file"\s*:\s*\'([^\']+)', webpage, 'video url')
+ [r'<source[^>]+src="([^"]+)"', r'"file"\s*:\s*\'([^\']+)'],
+ webpage, 'video url')
return {
'id': video_id,
diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py
new file mode 100644
index 000000000..373b3b4b4
--- /dev/null
+++ b/youtube_dl/extractor/douyutv.py
@@ -0,0 +1,113 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import hashlib
+import time
+from .common import InfoExtractor
+from ..utils import (ExtractorError, unescapeHTML)
+from ..compat import (compat_str, compat_basestring)
+
+
+class DouyuTVIE(InfoExtractor):
+ IE_DESC = '斗鱼'
+ _VALID_URL = r'http://(?:www\.)?douyutv\.com/(?P<id>[A-Za-z0-9]+)'
+ _TESTS = [{
+ 'url': 'http://www.douyutv.com/iseven',
+ 'info_dict': {
+ 'id': '17732',
+ 'display_id': 'iseven',
+ 'ext': 'flv',
+ 'title': 're:^清晨醒脑!T-ara根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'description': 'md5:c93d6692dde6fe33809a46edcbecca44',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'uploader': '7师傅',
+ 'uploader_id': '431925',
+ 'is_live': True,
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://www.douyutv.com/85982',
+ 'info_dict': {
+ 'id': '85982',
+ 'display_id': '85982',
+ 'ext': 'flv',
+ 'title': 're:^小漠从零单排记!——CSOL2躲猫猫 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'description': 'md5:746a2f7a253966a06755a912f0acc0d2',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'uploader': 'douyu小漠',
+ 'uploader_id': '3769985',
+ 'is_live': True,
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ if video_id.isdigit():
+ room_id = video_id
+ else:
+ page = self._download_webpage(url, video_id)
+ room_id = self._html_search_regex(
+ r'"room_id"\s*:\s*(\d+),', page, 'room id')
+
+ prefix = 'room/%s?aid=android&client_sys=android&time=%d' % (
+ room_id, int(time.time()))
+
+ auth = hashlib.md5((prefix + '1231').encode('ascii')).hexdigest()
+ config = self._download_json(
+ 'http://www.douyutv.com/api/v1/%s&auth=%s' % (prefix, auth),
+ video_id)
+
+ data = config['data']
+
+ error_code = config.get('error', 0)
+ if error_code is not 0:
+ error_desc = 'Server reported error %i' % error_code
+ if isinstance(data, (compat_str, compat_basestring)):
+ error_desc += ': ' + data
+ raise ExtractorError(error_desc, expected=True)
+
+ show_status = data.get('show_status')
+ # 1 = live, 2 = offline
+ if show_status == '2':
+ raise ExtractorError(
+ 'Live stream is offline', expected=True)
+
+ base_url = data['rtmp_url']
+ live_path = data['rtmp_live']
+
+ title = self._live_title(unescapeHTML(data['room_name']))
+ description = data.get('show_details')
+ thumbnail = data.get('room_src')
+
+ uploader = data.get('nickname')
+ uploader_id = data.get('owner_uid')
+
+ multi_formats = data.get('rtmp_multi_bitrate')
+ if not isinstance(multi_formats, dict):
+ multi_formats = {}
+ multi_formats['live'] = live_path
+
+ formats = [{
+ 'url': '%s/%s' % (base_url, format_path),
+ 'format_id': format_id,
+ 'preference': 1 if format_id == 'live' else 0,
+ } for format_id, format_path in multi_formats.items()]
+ self._sort_formats(formats)
+
+ return {
+ 'id': room_id,
+ 'display_id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'formats': formats,
+ 'is_live': True,
+ }
diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py
new file mode 100644
index 000000000..6cda56a7f
--- /dev/null
+++ b/youtube_dl/extractor/dplay.py
@@ -0,0 +1,51 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import time
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class DPlayIE(InfoExtractor):
+ _VALID_URL = r'http://www\.dplay\.se/[^/]+/(?P<id>[^/?#]+)'
+
+ _TEST = {
+ 'url': 'http://www.dplay.se/nugammalt-77-handelser-som-format-sverige/season-1-svensken-lar-sig-njuta-av-livet/',
+ 'info_dict': {
+ 'id': '3172',
+ 'ext': 'mp4',
+ 'display_id': 'season-1-svensken-lar-sig-njuta-av-livet',
+ 'title': 'Svensken lär sig njuta av livet',
+ 'duration': 2650,
+ },
+ }
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ video_id = self._search_regex(
+ r'data-video-id="(\d+)"', webpage, 'video id')
+
+ info = self._download_json(
+ 'http://www.dplay.se/api/v2/ajax/videos?video_id=' + video_id,
+ video_id)['data'][0]
+
+ self._set_cookie(
+ 'secure.dplay.se', 'dsc-geo',
+ '{"countryCode":"NL","expiry":%d}' % ((time.time() + 20 * 60) * 1000))
+ # TODO: consider adding support for 'stream_type=hds', it seems to
+ # require setting some cookies
+ manifest_url = self._download_json(
+ 'https://secure.dplay.se/secure/api/v2/user/authorization/stream/%s?stream_type=hls' % video_id,
+ video_id, 'Getting manifest url for hls stream')['hls']
+ formats = self._extract_m3u8_formats(
+ manifest_url, video_id, ext='mp4', entry_protocol='m3u8_native')
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': info['title'],
+ 'formats': formats,
+ 'duration': int_or_none(info.get('video_metadata_length'), scale=1000),
+ }
diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py
new file mode 100644
index 000000000..d836c1a6c
--- /dev/null
+++ b/youtube_dl/extractor/dramafever.py
@@ -0,0 +1,216 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_HTTPError,
+ compat_urllib_parse,
+ compat_urlparse,
+)
+from ..utils import (
+ ExtractorError,
+ clean_html,
+ determine_ext,
+ int_or_none,
+ parse_iso8601,
+ sanitized_Request,
+)
+
+
+class DramaFeverBaseIE(InfoExtractor):
+ _LOGIN_URL = 'https://www.dramafever.com/accounts/login/'
+ _NETRC_MACHINE = 'dramafever'
+
+ _CONSUMER_SECRET = 'DA59dtVXYLxajktV'
+
+ _consumer_secret = None
+
+ def _get_consumer_secret(self):
+ mainjs = self._download_webpage(
+ 'http://www.dramafever.com/static/51afe95/df2014/scripts/main.js',
+ None, 'Downloading main.js', fatal=False)
+ if not mainjs:
+ return self._CONSUMER_SECRET
+ return self._search_regex(
+ r"var\s+cs\s*=\s*'([^']+)'", mainjs,
+ 'consumer secret', default=self._CONSUMER_SECRET)
+
+ def _real_initialize(self):
+ self._login()
+ self._consumer_secret = self._get_consumer_secret()
+
+ def _login(self):
+ (username, password) = self._get_login_info()
+ if username is None:
+ return
+
+ login_form = {
+ 'username': username,
+ 'password': password,
+ }
+
+ request = sanitized_Request(
+ self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
+ response = self._download_webpage(
+ request, None, 'Logging in as %s' % username)
+
+ if all(logout_pattern not in response
+ for logout_pattern in ['href="/accounts/logout/"', '>Log out<']):
+ error = self._html_search_regex(
+ r'(?s)class="hidden-xs prompt"[^>]*>(.+?)<',
+ response, 'error message', default=None)
+ if error:
+ raise ExtractorError('Unable to login: %s' % error, expected=True)
+ raise ExtractorError('Unable to log in')
+
+
+class DramaFeverIE(DramaFeverBaseIE):
+ IE_NAME = 'dramafever'
+ _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P<id>[0-9]+/[0-9]+)(?:/|$)'
+ _TEST = {
+ 'url': 'http://www.dramafever.com/drama/4512/1/Cooking_with_Shin/',
+ 'info_dict': {
+ 'id': '4512.1',
+ 'ext': 'flv',
+ 'title': 'Cooking with Shin 4512.1',
+ 'description': 'md5:a8eec7942e1664a6896fcd5e1287bfd0',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'timestamp': 1404336058,
+ 'upload_date': '20140702',
+ 'duration': 343,
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url).replace('/', '.')
+
+ try:
+ feed = self._download_json(
+ 'http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id,
+ video_id, 'Downloading episode JSON')['channel']['item']
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError):
+ raise ExtractorError(
+ 'Currently unavailable in your country.', expected=True)
+ raise
+
+ media_group = feed.get('media-group', {})
+
+ formats = []
+ for media_content in media_group['media-content']:
+ src = media_content.get('@attributes', {}).get('url')
+ if not src:
+ continue
+ ext = determine_ext(src)
+ if ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ src, video_id, f4m_id='hds'))
+ elif ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ src, video_id, 'mp4', m3u8_id='hls'))
+ else:
+ formats.append({
+ 'url': src,
+ })
+ self._sort_formats(formats)
+
+ title = media_group.get('media-title')
+ description = media_group.get('media-description')
+ duration = int_or_none(media_group['media-content'][0].get('@attributes', {}).get('duration'))
+ thumbnail = self._proto_relative_url(
+ media_group.get('media-thumbnail', {}).get('@attributes', {}).get('url'))
+ timestamp = parse_iso8601(feed.get('pubDate'), ' ')
+
+ subtitles = {}
+ for media_subtitle in media_group.get('media-subTitle', []):
+ lang = media_subtitle.get('@attributes', {}).get('lang')
+ href = media_subtitle.get('@attributes', {}).get('href')
+ if not lang or not href:
+ continue
+ subtitles[lang] = [{
+ 'ext': 'ttml',
+ 'url': href,
+ }]
+
+ series_id, episode_number = video_id.split('.')
+ episode_info = self._download_json(
+ # We only need a single episode info, so restricting page size to one episode
+ # and dealing with page number as with episode number
+ r'http://www.dramafever.com/api/4/episode/series/?cs=%s&series_id=%s&page_number=%s&page_size=1'
+ % (self._consumer_secret, series_id, episode_number),
+ video_id, 'Downloading episode info JSON', fatal=False)
+ if episode_info:
+ value = episode_info.get('value')
+ if value:
+ subfile = value[0].get('subfile') or value[0].get('new_subfile')
+ if subfile and subfile != 'http://www.dramafever.com/st/':
+ subtitles.setdefault('English', []).append({
+ 'ext': 'srt',
+ 'url': subfile,
+ })
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+
+
+class DramaFeverSeriesIE(DramaFeverBaseIE):
+ IE_NAME = 'dramafever:series'
+ _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P<id>[0-9]+)(?:/(?:(?!\d+(?:/|$)).+)?)?$'
+ _TESTS = [{
+ 'url': 'http://www.dramafever.com/drama/4512/Cooking_with_Shin/',
+ 'info_dict': {
+ 'id': '4512',
+ 'title': 'Cooking with Shin',
+ 'description': 'md5:84a3f26e3cdc3fb7f500211b3593b5c1',
+ },
+ 'playlist_count': 4,
+ }, {
+ 'url': 'http://www.dramafever.com/drama/124/IRIS/',
+ 'info_dict': {
+ 'id': '124',
+ 'title': 'IRIS',
+ 'description': 'md5:b3a30e587cf20c59bd1c01ec0ee1b862',
+ },
+ 'playlist_count': 20,
+ }]
+
+ _PAGE_SIZE = 60 # max is 60 (see http://api.drama9.com/#get--api-4-episode-series-)
+
+ def _real_extract(self, url):
+ series_id = self._match_id(url)
+
+ series = self._download_json(
+ 'http://www.dramafever.com/api/4/series/query/?cs=%s&series_id=%s'
+ % (self._consumer_secret, series_id),
+ series_id, 'Downloading series JSON')['series'][series_id]
+
+ title = clean_html(series['name'])
+ description = clean_html(series.get('description') or series.get('description_short'))
+
+ entries = []
+ for page_num in itertools.count(1):
+ episodes = self._download_json(
+ 'http://www.dramafever.com/api/4/episode/series/?cs=%s&series_id=%s&page_size=%d&page_number=%d'
+ % (self._consumer_secret, series_id, self._PAGE_SIZE, page_num),
+ series_id, 'Downloading episodes JSON page #%d' % page_num)
+ for episode in episodes.get('value', []):
+ episode_url = episode.get('episode_url')
+ if not episode_url:
+ continue
+ entries.append(self.url_result(
+ compat_urlparse.urljoin(url, episode_url),
+ 'DramaFever', episode.get('guid')))
+ if page_num == episodes['num_pages']:
+ break
+
+ return self.playlist_result(entries, series_id, title, description)
diff --git a/youtube_dl/extractor/drbonanza.py b/youtube_dl/extractor/drbonanza.py
index 7626219ba..8b98b013a 100644
--- a/youtube_dl/extractor/drbonanza.py
+++ b/youtube_dl/extractor/drbonanza.py
@@ -15,7 +15,6 @@ class DRBonanzaIE(InfoExtractor):
_TESTS = [{
'url': 'http://www.dr.dk/bonanza/serie/portraetter/Talkshowet.htm?assetId=65517',
- 'md5': 'fe330252ddea607635cf2eb2c99a0af3',
'info_dict': {
'id': '65517',
'ext': 'mp4',
@@ -26,6 +25,9 @@ class DRBonanzaIE(InfoExtractor):
'upload_date': '20110120',
'duration': 3664,
},
+ 'params': {
+ 'skip_download': True, # requires rtmp
+ },
}, {
'url': 'http://www.dr.dk/bonanza/radio/serie/sport/fodbold.htm?assetId=59410',
'md5': '6dfe039417e76795fb783c52da3de11d',
@@ -93,6 +95,11 @@ class DRBonanzaIE(InfoExtractor):
'format_id': file['Type'].replace('Video', ''),
'preference': preferencemap.get(file['Type'], -10),
})
+ if format['url'].startswith('rtmp'):
+ rtmp_url = format['url']
+ format['rtmp_live'] = True # --resume does not work
+ if '/bonanza/' in rtmp_url:
+ format['play_path'] = rtmp_url.split('/bonanza/')[1]
formats.append(format)
elif file['Type'] == "Thumb":
thumbnail = file['Location']
@@ -111,9 +118,6 @@ class DRBonanzaIE(InfoExtractor):
description = '%s\n%s\n%s\n' % (
info['Description'], info['Actors'], info['Colophon'])
- for f in formats:
- f['url'] = f['url'].replace('rtmp://vod-bonanza.gss.dr.dk/bonanza/', 'http://vodfiles.dr.dk/')
- f['url'] = f['url'].replace('mp4:bonanza', 'bonanza')
self._sort_formats(formats)
display_id = re.sub(r'[^\w\d-]', '', re.sub(r' ', '-', title.lower())) + '-' + asset_id
diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py
index 69ca75423..8ac8587be 100644
--- a/youtube_dl/extractor/dreisat.py
+++ b/youtube_dl/extractor/dreisat.py
@@ -3,24 +3,33 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import unified_strdate
+from ..utils import (
+ ExtractorError,
+ unified_strdate,
+)
class DreiSatIE(InfoExtractor):
IE_NAME = '3sat'
- _VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$'
- _TEST = {
- 'url': 'http://www.3sat.de/mediathek/index.php?obj=36983',
- 'md5': '9dcfe344732808dbfcc901537973c922',
- 'info_dict': {
- 'id': '36983',
- 'ext': 'mp4',
- 'title': 'Kaffeeland Schweiz',
- 'description': 'md5:cc4424b18b75ae9948b13929a0814033',
- 'uploader': '3sat',
- 'upload_date': '20130622'
- }
- }
+ _VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php|mediathek\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$'
+ _TESTS = [
+ {
+ 'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918',
+ 'md5': 'be37228896d30a88f315b638900a026e',
+ 'info_dict': {
+ 'id': '45918',
+ 'ext': 'mp4',
+ 'title': 'Waidmannsheil',
+ 'description': 'md5:cce00ca1d70e21425e72c86a98a56817',
+ 'uploader': '3sat',
+ 'upload_date': '20140913'
+ }
+ },
+ {
+ 'url': 'http://www.3sat.de/mediathek/mediathek.php?mode=play&obj=51066',
+ 'only_matching': True,
+ },
+ ]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -28,6 +37,15 @@ class DreiSatIE(InfoExtractor):
details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id
details_doc = self._download_xml(details_url, video_id, 'Downloading video details')
+ status_code = details_doc.find('./status/statuscode')
+ if status_code is not None and status_code.text != 'ok':
+ code = status_code.text
+ if code == 'notVisibleAnymore':
+ message = 'Video %s is not available' % video_id
+ else:
+ message = '%s returned error: %s' % (self.IE_NAME, code)
+ raise ExtractorError(message, expected=True)
+
thumbnail_els = details_doc.findall('.//teaserimage')
thumbnails = [{
'width': int(te.attrib['key'].partition('x')[0]),
diff --git a/youtube_dl/extractor/drtuber.py b/youtube_dl/extractor/drtuber.py
index ca274dff6..639f9182c 100644
--- a/youtube_dl/extractor/drtuber.py
+++ b/youtube_dl/extractor/drtuber.py
@@ -15,7 +15,7 @@ class DrTuberIE(InfoExtractor):
'id': '1740434',
'display_id': 'hot-perky-blonde-naked-golf',
'ext': 'mp4',
- 'title': 'Hot Perky Blonde Naked Golf',
+ 'title': 'hot perky blonde naked golf',
'like_count': int,
'dislike_count': int,
'comment_count': int,
@@ -36,24 +36,24 @@ class DrTuberIE(InfoExtractor):
r'<source src="([^"]+)"', webpage, 'video URL')
title = self._html_search_regex(
- r'<title>([^<]+)\s*-\s*Free', webpage, 'title')
+ [r'<p[^>]+class="title_substrate">([^<]+)</p>', r'<title>([^<]+) - \d+'],
+ webpage, 'title')
thumbnail = self._html_search_regex(
r'poster="([^"]+)"',
webpage, 'thumbnail', fatal=False)
- like_count = str_to_int(self._html_search_regex(
- r'<span id="rate_likes">\s*<img[^>]+>\s*<span>([\d,\.]+)</span>',
- webpage, 'like count', fatal=False))
- dislike_count = str_to_int(self._html_search_regex(
- r'<span id="rate_dislikes">\s*<img[^>]+>\s*<span>([\d,\.]+)</span>',
- webpage, 'like count', fatal=False))
- comment_count = str_to_int(self._html_search_regex(
- r'<span class="comments_count">([\d,\.]+)</span>',
- webpage, 'comment count', fatal=False))
+ def extract_count(id_, name):
+ return str_to_int(self._html_search_regex(
+ r'<span[^>]+(?:class|id)="%s"[^>]*>([\d,\.]+)</span>' % id_,
+ webpage, '%s count' % name, fatal=False))
+
+ like_count = extract_count('rate_likes', 'like')
+ dislike_count = extract_count('rate_dislikes', 'dislike')
+ comment_count = extract_count('comments_count', 'comment')
cats_str = self._search_regex(
- r'<span>Categories:</span><div>(.+?)</div>', webpage, 'categories', fatal=False)
+ r'<div[^>]+class="categories_list">(.+?)</div>', webpage, 'categories', fatal=False)
categories = [] if not cats_str else re.findall(r'<a title="([^"]+)"', cats_str)
return {
diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py
index d5df18d7c..baa24c6d1 100644
--- a/youtube_dl/extractor/drtv.py
+++ b/youtube_dl/extractor/drtv.py
@@ -1,24 +1,27 @@
+# coding: utf-8
from __future__ import unicode_literals
-from .subtitles import SubtitlesInfoExtractor
-from .common import ExtractorError
-from ..utils import parse_iso8601
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ parse_iso8601,
+)
-class DRTVIE(SubtitlesInfoExtractor):
+class DRTVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?dr\.dk/tv/se/(?:[^/]+/)*(?P<id>[\da-z-]+)(?:[/#?]|$)'
_TEST = {
- 'url': 'http://www.dr.dk/tv/se/partiets-mand/partiets-mand-7-8',
- 'md5': '4a7e1dd65cdb2643500a3f753c942f25',
+ 'url': 'https://www.dr.dk/tv/se/boern/ultra/panisk-paske/panisk-paske-5',
+ 'md5': 'dc515a9ab50577fa14cc4e4b0265168f',
'info_dict': {
- 'id': 'partiets-mand-7-8',
+ 'id': 'panisk-paske-5',
'ext': 'mp4',
- 'title': 'Partiets mand (7:8)',
- 'description': 'md5:a684b90a8f9336cd4aab94b7647d7862',
- 'timestamp': 1403047940,
- 'upload_date': '20140617',
- 'duration': 1299.040,
+ 'title': 'Panisk Påske (5)',
+ 'description': 'md5:ca14173c5ab24cd26b0fcc074dff391c',
+ 'timestamp': 1426984612,
+ 'upload_date': '20150322',
+ 'duration': 1455,
},
}
@@ -27,6 +30,10 @@ class DRTVIE(SubtitlesInfoExtractor):
webpage = self._download_webpage(url, video_id)
+ if '>Programmet er ikke længere tilgængeligt' in webpage:
+ raise ExtractorError(
+ 'Video %s is not available' % video_id, expected=True)
+
video_id = self._search_regex(
r'data-(?:material-identifier|episode-slug)="([^"]+)"',
webpage, 'video id')
@@ -56,19 +63,31 @@ class DRTVIE(SubtitlesInfoExtractor):
restricted_to_denmark = asset['RestrictedToDenmark']
spoken_subtitles = asset['Target'] == 'SpokenSubtitles'
for link in asset['Links']:
- target = link['Target']
uri = link['Uri']
+ target = link['Target']
format_id = target
- preference = -1 if target == 'HDS' else -2
+ preference = None
if spoken_subtitles:
- preference -= 2
+ preference = -1
format_id += '-spoken-subtitles'
- formats.append({
- 'url': uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43' if target == 'HDS' else uri,
- 'format_id': format_id,
- 'ext': link['FileFormat'],
- 'preference': preference,
- })
+ if target == 'HDS':
+ formats.extend(self._extract_f4m_formats(
+ uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43',
+ video_id, preference, f4m_id=format_id))
+ elif target == 'HLS':
+ formats.extend(self._extract_m3u8_formats(
+ uri, video_id, 'mp4', preference=preference,
+ m3u8_id=format_id))
+ else:
+ bitrate = link.get('Bitrate')
+ if bitrate:
+ format_id += '-%s' % bitrate
+ formats.append({
+ 'url': uri,
+ 'format_id': format_id,
+ 'tbr': bitrate,
+ 'ext': link.get('FileFormat'),
+ })
subtitles_list = asset.get('SubtitlesList')
if isinstance(subtitles_list, list):
LANGS = {
@@ -76,7 +95,7 @@ class DRTVIE(SubtitlesInfoExtractor):
}
for subs in subtitles_list:
lang = subs['Language']
- subtitles[LANGS.get(lang, lang)] = subs['Uri']
+ subtitles[LANGS.get(lang, lang)] = [{'url': subs['Uri'], 'ext': 'vtt'}]
if not formats and restricted_to_denmark:
raise ExtractorError(
@@ -84,10 +103,6 @@ class DRTVIE(SubtitlesInfoExtractor):
self._sort_formats(formats)
- if self._downloader.params.get('listsubtitles', False):
- self._list_available_subtitles(video_id, subtitles)
- return
-
return {
'id': video_id,
'title': title,
@@ -96,5 +111,5 @@ class DRTVIE(SubtitlesInfoExtractor):
'timestamp': timestamp,
'duration': duration,
'formats': formats,
- 'subtitles': self.extract_subtitles(video_id, subtitles),
+ 'subtitles': subtitles,
}
diff --git a/youtube_dl/extractor/dump.py b/youtube_dl/extractor/dump.py
index 6b651778a..ff78d4fd2 100644
--- a/youtube_dl/extractor/dump.py
+++ b/youtube_dl/extractor/dump.py
@@ -28,12 +28,12 @@ class DumpIE(InfoExtractor):
video_url = self._search_regex(
r's1.addVariable\("file",\s*"([^"]+)"', webpage, 'video URL')
- thumb = self._og_search_thumbnail(webpage)
- title = self._search_regex(r'<b>([^"]+)</b>', webpage, 'title')
+ title = self._og_search_title(webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
return {
'id': video_id,
'title': title,
'url': video_url,
- 'thumbnail': thumb,
+ 'thumbnail': thumbnail,
}
diff --git a/youtube_dl/extractor/dumpert.py b/youtube_dl/extractor/dumpert.py
new file mode 100644
index 000000000..e5aadcd25
--- /dev/null
+++ b/youtube_dl/extractor/dumpert.py
@@ -0,0 +1,69 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import base64
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ qualities,
+ sanitized_Request,
+)
+
+
+class DumpertIE(InfoExtractor):
+    # Extractor for dumpert.nl videos; accepts both /mediabase/ and /embed/
+    # URLs and keeps the "<number>/<hash>" pair as the video id.
+    _VALID_URL = r'(?P<protocol>https?)://(?:www\.)?dumpert\.nl/(?:mediabase|embed)/(?P<id>[0-9]+/[0-9a-zA-Z]+)'
+    _TESTS = [{
+        'url': 'http://www.dumpert.nl/mediabase/6646981/951bc60f/',
+        'md5': '1b9318d7d5054e7dcb9dc7654f21d643',
+        'info_dict': {
+            'id': '6646981/951bc60f',
+            'ext': 'mp4',
+            'title': 'Ik heb nieuws voor je',
+            'description': 'Niet schrikken hoor',
+            'thumbnail': 're:^https?://.*\.jpg$',
+        }
+    }, {
+        'url': 'http://www.dumpert.nl/embed/6675421/dc440fe7/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        """Extract formats and metadata from the video's mediabase page.
+
+        The page embeds a base64-encoded JSON object in a ``data-files``
+        attribute, mapping format ids to URLs; its ``still`` entry is used as
+        the thumbnail rather than as a format.
+        """
+        match = re.match(self._VALID_URL, url)
+        protocol, video_id = match.group('protocol'), match.group('id')
+
+        # Always request the mediabase page, even when given an embed URL.
+        page_request = sanitized_Request(
+            '%s://www.dumpert.nl/mediabase/%s' % (protocol, video_id))
+        # NOTE(review): presumably these cookies opt in to NSFW content and
+        # dismiss a consent prompt -- confirm against the site.
+        page_request.add_header('Cookie', 'nsfw=1; cpc=10')
+        webpage = self._download_webpage(page_request, video_id)
+
+        encoded_files = self._search_regex(
+            r'data-files="([^"]+)"', webpage, 'data files')
+        decoded_json = base64.b64decode(
+            encoded_files.encode('utf-8')).decode('utf-8')
+        files = self._parse_json(decoded_json, video_id)
+
+        # Ranks the known format ids in the listed order.
+        rank = qualities(['flv', 'mobile', 'tablet', '720p'])
+
+        formats = []
+        for format_id, video_url in files.items():
+            if format_id == 'still':
+                continue
+            formats.append({
+                'url': video_url,
+                'format_id': format_id,
+                'quality': rank(format_id),
+            })
+        self._sort_formats(formats)
+
+        # Prefer the plain <meta> tags and fall back to Open Graph data.
+        title = self._html_search_meta(
+            'title', webpage) or self._og_search_title(webpage)
+        description = self._html_search_meta(
+            'description', webpage) or self._og_search_description(webpage)
+        thumbnail = files.get('still') or self._og_search_thumbnail(webpage)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'formats': formats
+        }
diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py
new file mode 100644
index 000000000..7bbf617d4
--- /dev/null
+++ b/youtube_dl/extractor/eagleplatform.py
@@ -0,0 +1,110 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+)
+
+
+class EaglePlatformIE(InfoExtractor):
+    # Extractor for Eagle Platform players.  Accepts either a player URL on a
+    # *.media.eagleplatform.com host or the internal
+    # "eagleplatform:<host>:<id>" form for custom hosts.
+    _VALID_URL = r'''(?x)
+                    (?:
+                        eagleplatform:(?P<custom_host>[^/]+):|
+                        https?://(?P<host>.+?\.media\.eagleplatform\.com)/index/player\?.*\brecord_id=
+                    )
+                    (?P<id>\d+)
+                '''
+    _TESTS = [{
+        # http://lenta.ru/news/2015/03/06/navalny/
+        'url': 'http://lentaru.media.eagleplatform.com/index/player?player=new&record_id=227304&player_template_id=5201',
+        'md5': '70f5187fb620f2c1d503b3b22fd4efe3',
+        'info_dict': {
+            'id': '227304',
+            'ext': 'mp4',
+            'title': 'Навальный вышел на свободу',
+            'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 87,
+            'view_count': int,
+            'age_limit': 0,
+        },
+    }, {
+        # http://muz-tv.ru/play/7129/
+        # http://media.clipyou.ru/index/player?record_id=12820&width=730&height=415&autoplay=true
+        'url': 'eagleplatform:media.clipyou.ru:12820',
+        'md5': '90b26344ba442c8e44aa4cf8f301164a',
+        'info_dict': {
+            'id': '12820',
+            'ext': 'mp4',
+            'title': "'O Sole Mio",
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 216,
+            'view_count': int,
+        },
+        'skip': 'Georestricted',
+    }]
+
+    @staticmethod
+    def _handle_error(response):
+        """Raise an expected ExtractorError if the API response is a failure.
+
+        A response without a ``status`` key is treated as successful (200).
+        """
+        status = int_or_none(response.get('status', 200))
+        if status != 200:
+            # A failed response without an ``errors`` list must still surface
+            # as an extractor error, not as a KeyError.
+            errors = response.get('errors') or ['API returned status %s' % status]
+            raise ExtractorError(' '.join(errors), expected=True)
+
+    def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', *args, **kwargs):
+        """Download JSON via the base class and check it for an API error.
+
+        Extra positional/keyword arguments are forwarded unchanged so the
+        override stays call-compatible with the method it replaces.
+        """
+        response = super(EaglePlatformIE, self)._download_json(
+            url_or_request, video_id, note, *args, **kwargs)
+        # Only dict responses can carry a status; anything else (presumably a
+        # falsy value from a non-fatal download -- confirm) is passed through.
+        if isinstance(response, dict):
+            self._handle_error(response)
+        return response
+
+    def _get_video_url(self, url_or_request, video_id, note='Downloading JSON metadata'):
+        """Return the first item of the ``data`` list in the JSON response."""
+        return self._download_json(url_or_request, video_id, note)['data'][0]
+
+    def _real_extract(self, url):
+        """Extract metadata plus HLS and MP4 formats for one record."""
+        mobj = re.match(self._VALID_URL, url)
+        host, video_id = mobj.group('custom_host') or mobj.group('host'), mobj.group('id')
+
+        player_data = self._download_json(
+            'http://%s/api/player_data?id=%s' % (host, video_id), video_id)
+
+        # Only the first media item of the first viewport is used.
+        media = player_data['data']['playlist']['viewports'][0]['medialist'][0]
+
+        title = media['title']
+        description = media.get('description')
+        thumbnail = self._proto_relative_url(media.get('snapshot'), 'http:')
+        duration = int_or_none(media.get('duration'))
+        view_count = int_or_none(media.get('views'))
+
+        # No age_restriction value -> unknown (None); 'allow_all' -> 0;
+        # any other value is treated as 18+.
+        age_restriction = media.get('age_restriction')
+        age_limit = None
+        if age_restriction:
+            age_limit = 0 if age_restriction == 'allow_all' else 18
+
+        secure_m3u8 = self._proto_relative_url(media['sources']['secure_m3u8']['auto'], 'http:')
+
+        # The "secure" URLs return JSON whose first data item is the real
+        # media URL, hence the extra request per format family.
+        m3u8_url = self._get_video_url(secure_m3u8, video_id, 'Downloading m3u8 JSON')
+        formats = self._extract_m3u8_formats(
+            m3u8_url, video_id,
+            'mp4', entry_protocol='m3u8_native', m3u8_id='hls')
+
+        mp4_url = self._get_video_url(
+            # Secure mp4 URL is constructed according to Player.prototype.mp4 from
+            # http://lentaru.media.eagleplatform.com/player/player.js
+            re.sub(r'm3u8|hlsvod|hls|f4m', 'mp4', secure_m3u8),
+            video_id, 'Downloading mp4 JSON')
+        formats.append({'url': mp4_url, 'format_id': 'mp4'})
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'view_count': view_count,
+            'age_limit': age_limit,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/ehow.py b/youtube_dl/extractor/ehow.py
index 9cb1bf301..b1cd4f5d4 100644
--- a/youtube_dl/extractor/ehow.py
+++ b/youtube_dl/extractor/ehow.py
@@ -1,9 +1,7 @@
from __future__ import unicode_literals
-from ..compat import (
- compat_urllib_parse,
-)
from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
class EHowIE(InfoExtractor):
@@ -26,7 +24,7 @@ class EHowIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
video_url = self._search_regex(
r'(?:file|source)=(http[^\'"&]*)', webpage, 'video URL')
- final_url = compat_urllib_parse.unquote(video_url)
+ final_url = compat_urllib_parse_unquote(video_url)
uploader = self._html_search_meta('uploader', webpage)
title = self._og_search_title(webpage).replace(' | eHow', '')
diff --git a/youtube_dl/extractor/eighttracks.py b/youtube_dl/extractor/eighttracks.py
index fb5dbbe2b..0b61ea0ba 100644
--- a/youtube_dl/extractor/eighttracks.py
+++ b/youtube_dl/extractor/eighttracks.py
@@ -3,7 +3,6 @@ from __future__ import unicode_literals
import json
import random
-import re
from .common import InfoExtractor
from ..compat import (
@@ -103,20 +102,23 @@ class EightTracksIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- playlist_id = mobj.group('id')
+ playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
- json_like = self._search_regex(
- r"(?s)PAGE.mix = (.*?);\n", webpage, 'trax information')
- data = json.loads(json_like)
+ data = self._parse_json(
+ self._search_regex(
+ r"(?s)PAGE\.mix\s*=\s*({.+?});\n", webpage, 'trax information'),
+ playlist_id)
session = str(random.randint(0, 1000000000))
mix_id = data['id']
track_count = data['tracks_count']
duration = data['duration']
avg_song_duration = float(duration) / track_count
+ # duration is sometimes negative, use predefined avg duration
+ if avg_song_duration <= 0:
+ avg_song_duration = 300
first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
next_url = first_url
entries = []
diff --git a/youtube_dl/extractor/eitb.py b/youtube_dl/extractor/eitb.py
index 2cba82532..c83845fc2 100644
--- a/youtube_dl/extractor/eitb.py
+++ b/youtube_dl/extractor/eitb.py
@@ -1,39 +1,92 @@
# encoding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from .brightcove import BrightcoveIE
-from ..utils import ExtractorError
+from ..utils import (
+ float_or_none,
+ int_or_none,
+ parse_iso8601,
+ sanitized_Request,
+)
class EitbIE(InfoExtractor):
IE_NAME = 'eitb.tv'
- _VALID_URL = r'https?://www\.eitb\.tv/(eu/bideoa|es/video)/[^/]+/(?P<playlist_id>\d+)/(?P<chapter_id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?eitb\.tv/(?:eu/bideoa|es/video)/[^/]+/\d+/(?P<id>\d+)'
_TEST = {
- 'add_ie': ['Brightcove'],
- 'url': 'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/2677100210001/2743577154001/lasa-y-zabala-30-anos/',
+ 'url': 'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/4104995148001/4090227752001/lasa-y-zabala-30-anos/',
'md5': 'edf4436247185adee3ea18ce64c47998',
'info_dict': {
- 'id': '2743577154001',
+ 'id': '4090227752001',
'ext': 'mp4',
'title': '60 minutos (Lasa y Zabala, 30 años)',
- # All videos from eitb has this description in the brightcove info
- 'description': '.',
- 'uploader': 'Euskal Telebista',
+ 'description': 'Programa de reportajes de actualidad.',
+ 'duration': 3996.76,
+ 'timestamp': 1381789200,
+ 'upload_date': '20131014',
+ 'tags': list,
},
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- chapter_id = mobj.group('chapter_id')
- webpage = self._download_webpage(url, chapter_id)
- bc_url = BrightcoveIE._extract_brightcove_url(webpage)
- if bc_url is None:
- raise ExtractorError('Could not extract the Brightcove url')
- # The BrightcoveExperience object doesn't contain the video id, we set
- # it manually
- bc_url += '&%40videoPlayer={0}'.format(chapter_id)
- return self.url_result(bc_url, BrightcoveIE.ie_key())
+ video_id = self._match_id(url)
+
+ video = self._download_json(
+ 'http://mam.eitb.eus/mam/REST/ServiceMultiweb/Video/MULTIWEBTV/%s/' % video_id,
+ video_id, 'Downloading video JSON')
+
+ media = video['web_media'][0]
+
+ formats = []
+ for rendition in media['RENDITIONS']:
+ video_url = rendition.get('PMD_URL')
+ if not video_url:
+ continue
+ tbr = float_or_none(rendition.get('ENCODING_RATE'), 1000)
+ format_id = 'http'
+ if tbr:
+ format_id += '-%d' % int(tbr)
+ formats.append({
+ 'url': rendition['PMD_URL'],
+ 'format_id': format_id,
+ 'width': int_or_none(rendition.get('FRAME_WIDTH')),
+ 'height': int_or_none(rendition.get('FRAME_HEIGHT')),
+ 'tbr': tbr,
+ })
+
+ hls_url = media.get('HLS_SURL')
+ if hls_url:
+ request = sanitized_Request(
+ 'http://mam.eitb.eus/mam/REST/ServiceMultiweb/DomainRestrictedSecurity/TokenAuth/',
+ headers={'Referer': url})
+ token_data = self._download_json(
+ request, video_id, 'Downloading auth token', fatal=False)
+ if token_data:
+ token = token_data.get('token')
+ if token:
+ m3u8_formats = self._extract_m3u8_formats(
+ '%s?hdnts=%s' % (hls_url, token), video_id, m3u8_id='hls', fatal=False)
+ if m3u8_formats:
+ formats.extend(m3u8_formats)
+
+ hds_url = media.get('HDS_SURL')
+ if hds_url:
+ f4m_formats = self._extract_f4m_formats(
+ '%s?hdcore=3.7.0' % hds_url.replace('euskalsvod', 'euskalvod'),
+ video_id, f4m_id='hds', fatal=False)
+ if f4m_formats:
+ formats.extend(f4m_formats)
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': media.get('NAME_ES') or media.get('name') or media['NAME_EU'],
+ 'description': media.get('SHORT_DESC_ES') or video.get('desc_group') or media.get('SHORT_DESC_EU'),
+ 'thumbnail': media.get('STILL_URL') or media.get('THUMBNAIL_URL'),
+ 'duration': float_or_none(media.get('LENGTH'), 1000),
+ 'timestamp': parse_iso8601(media.get('BROADCST_DATE'), ' '),
+ 'tags': media.get('TAGS'),
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/ellentv.py b/youtube_dl/extractor/ellentv.py
index fc92ff825..02c6a4615 100644
--- a/youtube_dl/extractor/ellentv.py
+++ b/youtube_dl/extractor/ellentv.py
@@ -6,56 +6,42 @@ import json
from .common import InfoExtractor
from ..utils import (
ExtractorError,
- parse_iso8601,
)
class EllenTVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?(?:ellentv|ellentube)\.com/videos/(?P<id>[a-z0-9_-]+)'
- _TESTS = [{
- 'url': 'http://www.ellentv.com/videos/0-7jqrsr18/',
- 'md5': 'e4af06f3bf0d5f471921a18db5764642',
- 'info_dict': {
- 'id': '0-7jqrsr18',
- 'ext': 'mp4',
- 'title': 'What\'s Wrong with These Photos? A Whole Lot',
- 'description': 'md5:35f152dc66b587cf13e6d2cf4fa467f6',
- 'timestamp': 1406876400,
- 'upload_date': '20140801',
- }
- }, {
- 'url': 'http://ellentube.com/videos/0-dvzmabd5/',
- 'md5': '98238118eaa2bbdf6ad7f708e3e4f4eb',
+ _TEST = {
+ 'url': 'http://www.ellentv.com/videos/0-ipq1gsai/',
+ 'md5': '8e3c576bf2e9bfff4d76565f56f94c9c',
'info_dict': {
- 'id': '0-dvzmabd5',
+ 'id': '0_ipq1gsai',
'ext': 'mp4',
- 'title': '1 year old twin sister makes her brother laugh',
- 'description': '1 year old twin sister makes her brother laugh',
- 'timestamp': 1419542075,
- 'upload_date': '20141225',
+ 'title': 'Fast Fingers of Fate',
+ 'description': 'md5:587e79fbbd0d73b148bc596d99ce48e6',
+ 'timestamp': 1428035648,
+ 'upload_date': '20150403',
+ 'uploader_id': 'batchUser',
}
- }]
+ }
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
- video_url = self._html_search_meta('VideoURL', webpage, 'url')
- title = self._og_search_title(webpage, default=None) or self._search_regex(
- r'pageName\s*=\s*"([^"]+)"', webpage, 'title')
- description = self._html_search_meta(
- 'description', webpage, 'description') or self._og_search_description(webpage)
- timestamp = parse_iso8601(self._search_regex(
- r'<span class="publish-date"><time datetime="([^"]+)">',
- webpage, 'timestamp'))
+ webpage = self._download_webpage(
+ 'http://widgets.ellentube.com/videos/%s' % video_id,
+ video_id)
- return {
- 'id': video_id,
- 'url': video_url,
- 'title': title,
- 'description': description,
- 'timestamp': timestamp,
- }
+ partner_id = self._search_regex(
+ r"var\s+partnerId\s*=\s*'([^']+)", webpage, 'partner id')
+
+ kaltura_id = self._search_regex(
+ [r'id="kaltura_player_([^"]+)"',
+ r"_wb_entry_id\s*:\s*'([^']+)",
+ r'data-kaltura-entry-id="([^"]+)'],
+ webpage, 'kaltura id')
+
+ return self.url_result('kaltura:%s:%s' % (partner_id, kaltura_id), 'Kaltura')
class EllenTVClipsIE(InfoExtractor):
@@ -67,7 +53,7 @@ class EllenTVClipsIE(InfoExtractor):
'id': 'meryl-streep-vanessa-hudgens',
'title': 'Meryl Streep, Vanessa Hudgens',
},
- 'playlist_mincount': 9,
+ 'playlist_mincount': 7,
}
def _real_extract(self, url):
@@ -91,4 +77,8 @@ class EllenTVClipsIE(InfoExtractor):
raise ExtractorError('Failed to download JSON', cause=ve)
def _extract_entries(self, playlist):
- return [self.url_result(item['url'], 'EllenTV') for item in playlist]
+ return [
+ self.url_result(
+ 'kaltura:%s:%s' % (item['kaltura_partner_id'], item['kaltura_entry_id']),
+ 'Kaltura')
+ for item in playlist]
diff --git a/youtube_dl/extractor/embedly.py b/youtube_dl/extractor/embedly.py
new file mode 100644
index 000000000..1cdb11e34
--- /dev/null
+++ b/youtube_dl/extractor/embedly.py
@@ -0,0 +1,16 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
+
+
+class EmbedlyIE(InfoExtractor):
+    # Unwraps embedly media widgets: the wrapped page is carried,
+    # percent-encoded, in the widget URL's ``url`` query parameter.
+    #
+    # The optional host prefix is "(www|cdn)" followed by a dot.  The earlier
+    # form "(?:www|cdn\.)?" left the dot off the "www" branch, so it rejected
+    # www.embedly.com while accepting the unrelated host "wwwembedly.com".
+    _VALID_URL = r'https?://(?:(?:www|cdn)\.)?embedly\.com/widgets/media\.html\?(?:[^#]*?&)?url=(?P<id>[^#&]+)'
+    _TESTS = [{
+        'url': 'https://cdn.embedly.com/widgets/media.html?src=http%3A%2F%2Fwww.youtube.com%2Fembed%2Fvideoseries%3Flist%3DUUGLim4T2loE5rwCMdpCIPVg&url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DSU4fj_aEMVw%26list%3DUUGLim4T2loE5rwCMdpCIPVg&image=http%3A%2F%2Fi.ytimg.com%2Fvi%2FSU4fj_aEMVw%2Fhqdefault.jpg&key=8ee8a2e6a8cc47aab1a5ee67f9a178e0&type=text%2Fhtml&schema=youtube&autoplay=1',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        """Hand the percent-decoded ``url`` parameter to url_result()."""
+        return self.url_result(compat_urllib_parse_unquote(self._match_id(url)))
diff --git a/youtube_dl/extractor/empflix.py b/youtube_dl/extractor/empflix.py
deleted file mode 100644
index 70f8efe27..000000000
--- a/youtube_dl/extractor/empflix.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from __future__ import unicode_literals
-
-from .tnaflix import TNAFlixIE
-
-
-class EMPFlixIE(TNAFlixIE):
- _VALID_URL = r'^https?://www\.empflix\.com/videos/(?P<display_id>[0-9a-zA-Z-]+)-(?P<id>[0-9]+)\.html'
-
- _TITLE_REGEX = r'name="title" value="(?P<title>[^"]*)"'
- _DESCRIPTION_REGEX = r'name="description" value="([^"]*)"'
- _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"'
-
- _TEST = {
- 'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html',
- 'md5': 'b1bc15b6412d33902d6e5952035fcabc',
- 'info_dict': {
- 'id': '33051',
- 'display_id': 'Amateur-Finger-Fuck',
- 'ext': 'mp4',
- 'title': 'Amateur Finger Fuck',
- 'description': 'Amateur solo finger fucking.',
- 'thumbnail': 're:https?://.*\.jpg$',
- 'age_limit': 18,
- }
- }
diff --git a/youtube_dl/extractor/engadget.py b/youtube_dl/extractor/engadget.py
index 4ea37ebd9..e4180701d 100644
--- a/youtube_dl/extractor/engadget.py
+++ b/youtube_dl/extractor/engadget.py
@@ -10,7 +10,7 @@ from ..utils import (
class EngadgetIE(InfoExtractor):
_VALID_URL = r'''(?x)https?://www.engadget.com/
- (?:video/5min/(?P<id>\d+)|
+ (?:video(?:/5min)?/(?P<id>\d+)|
[\d/]+/.*?)
'''
diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py
index 4de8d4bc5..e006921ec 100644
--- a/youtube_dl/extractor/eporner.py
+++ b/youtube_dl/extractor/eporner.py
@@ -35,10 +35,7 @@ class EpornerIE(InfoExtractor):
title = self._html_search_regex(
r'<title>(.*?) - EPORNER', webpage, 'title')
- redirect_code = self._html_search_regex(
- r'<script type="text/javascript" src="/config5/%s/([a-f\d]+)/">' % video_id,
- webpage, 'redirect_code')
- redirect_url = 'http://www.eporner.com/config5/%s/%s' % (video_id, redirect_code)
+ redirect_url = 'http://www.eporner.com/config5/%s' % video_id
player_code = self._download_webpage(
redirect_url, display_id, note='Downloading player config')
@@ -69,5 +66,5 @@ class EpornerIE(InfoExtractor):
'duration': duration,
'view_count': view_count,
'formats': formats,
- 'age_limit': self._rta_search(webpage),
+ 'age_limit': 18,
}
diff --git a/youtube_dl/extractor/eroprofile.py b/youtube_dl/extractor/eroprofile.py
index 79e2fbd39..7fcd0151d 100644
--- a/youtube_dl/extractor/eroprofile.py
+++ b/youtube_dl/extractor/eroprofile.py
@@ -1,11 +1,20 @@
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
+from ..compat import compat_urllib_parse
+from ..utils import (
+ ExtractorError,
+ unescapeHTML
+)
class EroProfileIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?eroprofile\.com/m/videos/view/(?P<id>[^/]+)'
- _TEST = {
+ _LOGIN_URL = 'http://www.eroprofile.com/auth/auth.php?'
+ _NETRC_MACHINE = 'eroprofile'
+ _TESTS = [{
'url': 'http://www.eroprofile.com/m/videos/view/sexy-babe-softcore',
'md5': 'c26f351332edf23e1ea28ce9ec9de32f',
'info_dict': {
@@ -16,19 +25,60 @@ class EroProfileIE(InfoExtractor):
'thumbnail': 're:https?://.*\.jpg',
'age_limit': 18,
}
- }
+ }, {
+ 'url': 'http://www.eroprofile.com/m/videos/view/Try-It-On-Pee_cut_2-wmv-4shared-com-file-sharing-download-movie-file',
+ 'md5': '1baa9602ede46ce904c431f5418d8916',
+ 'info_dict': {
+ 'id': '1133519',
+ 'ext': 'm4v',
+ 'title': 'Try It On Pee_cut_2.wmv - 4shared.com - file sharing - download movie file',
+ 'thumbnail': 're:https?://.*\.jpg',
+ 'age_limit': 18,
+ },
+ 'skip': 'Requires login',
+ }]
+
+    def _login(self):
+        """Log in with the configured credentials, if there are any.
+
+        Returns without doing anything when no username is available.  Raises
+        an expected ExtractorError when the site reports bad credentials.
+        """
+        username, password = self._get_login_info()
+        if username is None:
+            return
+
+        credentials = {
+            'username': username,
+            'password': password,
+            'url': 'http://www.eroprofile.com/',
+        }
+        # The credentials are sent as the query string of the auth URL.
+        login_page = self._download_webpage(
+            self._LOGIN_URL + compat_urllib_parse.urlencode(credentials),
+            None, False)
+
+        if re.search(r'Your username or password was incorrect\.', login_page) is not None:
+            raise ExtractorError(
+                'Wrong username and/or password.', expected=True)
+
+        self.report_login()
+        # The login page references a script URL that is fetched next;
+        # NOTE(review): presumably this request completes the session -- confirm.
+        redirect_url = self._search_regex(
+            r'<script[^>]+?src="([^"]+)"', login_page, 'login redirect url')
+        self._download_webpage(redirect_url, None, False)
+
+ def _real_initialize(self):
+ # Initialization only attempts a login; _login() is a no-op without a username.
+ self._login()
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
+ m = re.search(r'You must be logged in to view this video\.', webpage)
+ if m:
+ self.raise_login_required('This video requires login')
+
video_id = self._search_regex(
[r"glbUpdViews\s*\('\d*','(\d+)'", r'p/report/video/(\d+)'],
webpage, 'video id', default=None)
- video_url = self._search_regex(
- r'<source src="([^"]+)', webpage, 'video url')
+ video_url = unescapeHTML(self._search_regex(
+ r'<source src="([^"]+)', webpage, 'video url'))
title = self._html_search_regex(
r'Title:</th><td>([^<]+)</td>', webpage, 'title')
thumbnail = self._search_regex(
diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py
index 4303feccd..a3d7bbbcb 100644
--- a/youtube_dl/extractor/escapist.py
+++ b/youtube_dl/extractor/escapist.py
@@ -1,85 +1,106 @@
from __future__ import unicode_literals
+import json
+
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
-)
from ..utils import (
- ExtractorError,
- js_to_json,
+ determine_ext,
+ clean_html,
+ int_or_none,
+ float_or_none,
+ sanitized_Request,
)
+def _decrypt_config(key, string):
+    """XOR-decode the hex-encoded ``string`` with a repeating ``key``.
+
+    ``string`` is read as pairs of hex digits, one character code per pair.
+    ``key`` is repeated and cut to the number of pairs, then each key
+    character is XORed with the decoded character at the same position.
+    """
+    pair_count = int(len(string) / 2)
+
+    # Repeat the key until it covers every decoded character, then trim.
+    pad = ''
+    while len(pad) < (len(string) / 2):
+        pad += key
+    pad = pad[0:pair_count]
+
+    # Hex-decode two digits at a time.
+    decoded = ''.join(
+        chr(int(string[pos] + string[pos + 1], 16))
+        for pos in range(0, len(string), 2))
+
+    return ''.join(
+        chr(ord(key_char) ^ ord(decoded[idx]))
+        for idx, key_char in enumerate(pad))
+
+
class EscapistIE(InfoExtractor):
- _VALID_URL = r'https?://?(www\.)?escapistmagazine\.com/videos/view/[^/?#]+/(?P<id>[0-9]+)-[^/?#]*(?:$|[?#])'
- _TEST = {
+ _VALID_URL = r'https?://?(?:www\.)?escapistmagazine\.com/videos/view/[^/?#]+/(?P<id>[0-9]+)-[^/?#]*(?:$|[?#])'
+ _TESTS = [{
'url': 'http://www.escapistmagazine.com/videos/view/the-escapist-presents/6618-Breaking-Down-Baldurs-Gate',
'md5': 'ab3a706c681efca53f0a35f1415cf0d1',
'info_dict': {
'id': '6618',
'ext': 'mp4',
'description': "Baldur's Gate: Original, Modded or Enhanced Edition? I'll break down what you can expect from the new Baldur's Gate: Enhanced Edition.",
- 'uploader_id': 'the-escapist-presents',
- 'uploader': 'The Escapist Presents',
'title': "Breaking Down Baldur's Gate",
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 264,
+ 'uploader': 'The Escapist',
}
- }
+ }, {
+ 'url': 'http://www.escapistmagazine.com/videos/view/zero-punctuation/10044-Evolve-One-vs-Multiplayer',
+ 'md5': '9e8c437b0dbb0387d3bd3255ca77f6bf',
+ 'info_dict': {
+ 'id': '10044',
+ 'ext': 'mp4',
+ 'description': 'This week, Zero Punctuation reviews Evolve.',
+ 'title': 'Evolve - One vs Multiplayer',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 304,
+ 'uploader': 'The Escapist',
+ }
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- uploader_id = self._html_search_regex(
- r"<h1 class='headline'><a href='/videos/view/(.*?)'",
- webpage, 'uploader ID', fatal=False)
- uploader = self._html_search_regex(
- r"<h1 class='headline'>(.*?)</a>",
- webpage, 'uploader', fatal=False)
- description = self._html_search_meta('description', webpage)
-
- raw_title = self._html_search_meta('title', webpage, fatal=True)
- title = raw_title.partition(' : ')[2]
-
- player_url = self._og_search_video_url(webpage, name='player URL')
- config_url = compat_urllib_parse.unquote(self._search_regex(
- r'config=(.*)$', player_url, 'config URL'))
-
- formats = []
-
- def _add_format(name, cfgurl, quality):
- config = self._download_json(
- cfgurl, video_id,
- 'Downloading ' + name + ' configuration',
- 'Unable to download ' + name + ' configuration',
- transform_source=js_to_json)
-
- playlist = config['playlist']
- video_url = next(
- p['url'] for p in playlist
- if p.get('eventCategory') == 'Video')
- formats.append({
- 'url': video_url,
- 'format_id': name,
- 'quality': quality,
- })
-
- _add_format('normal', config_url, quality=0)
- hq_url = (config_url +
- ('&hq=1' if '?' in config_url else config_url + '?hq=1'))
- try:
- _add_format('hq', hq_url, quality=1)
- except ExtractorError:
- pass # That's fine, we'll just use normal quality
+ ims_video = self._parse_json(
+ self._search_regex(
+ r'imsVideo\.play\(({.+?})\);', webpage, 'imsVideo'),
+ video_id)
+ video_id = ims_video['videoID']
+ key = ims_video['hash']
+
+ config_req = sanitized_Request(
+ 'http://www.escapistmagazine.com/videos/'
+ 'vidconfig.php?videoID=%s&hash=%s' % (video_id, key))
+ config_req.add_header('Referer', url)
+ config = self._download_webpage(config_req, video_id, 'Downloading video config')
+
+ data = json.loads(_decrypt_config(key, config))
+ video_data = data['videoData']
+
+ title = clean_html(video_data['title'])
+ duration = float_or_none(video_data.get('duration'), 1000)
+ uploader = video_data.get('publisher')
+
+ formats = [{
+ 'url': video['src'],
+ 'format_id': '%s-%sp' % (determine_ext(video['src']), video['res']),
+ 'height': int_or_none(video.get('res')),
+ } for video in data['files']['videos']]
self._sort_formats(formats)
return {
'id': video_id,
'formats': formats,
- 'uploader': uploader,
- 'uploader_id': uploader_id,
'title': title,
'thumbnail': self._og_search_thumbnail(webpage),
- 'description': description,
- 'player_url': player_url,
+ 'description': self._og_search_description(webpage),
+ 'duration': duration,
+ 'uploader': uploader,
}
diff --git a/youtube_dl/extractor/espn.py b/youtube_dl/extractor/espn.py
new file mode 100644
index 000000000..e6f8f0337
--- /dev/null
+++ b/youtube_dl/extractor/espn.py
@@ -0,0 +1,55 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class ESPNIE(InfoExtractor):
+ _VALID_URL = r'https?://espn\.go\.com/(?:[^/]+/)*(?P<id>[^/]+)'
+ _WORKING = False
+ _TESTS = [{
+ 'url': 'http://espn.go.com/video/clip?id=10365079',
+ 'info_dict': {
+ 'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG',
+ 'ext': 'mp4',
+ 'title': 'dm_140128_30for30Shorts___JudgingJewellv2',
+ 'description': '',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://espn.go.com/video/iframe/twitter/?cms=espn&id=10365079',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://espn.go.com/nba/recap?gameId=400793786',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://espn.go.com/blog/golden-state-warriors/post/_/id/593/how-warriors-rapidly-regained-a-winning-edge',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://espn.go.com/sports/endurance/story/_/id/12893522/dzhokhar-tsarnaev-sentenced-role-boston-marathon-bombings',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://espn.go.com/nba/playoffs/2015/story/_/id/12887571/john-wall-washington-wizards-no-swelling-left-hand-wrist-game-5-return',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_id = self._search_regex(
+ r'class="video-play-button"[^>]+data-id="(\d+)',
+ webpage, 'video id')
+
+ player = self._download_webpage(
+ 'https://espn.go.com/video/iframe/twitter/?id=%s' % video_id, video_id)
+
+ pcode = self._search_regex(
+ r'["\']pcode=([^"\']+)["\']', player, 'pcode')
+
+ return self.url_result(
+ 'ooyalaexternal:espn:%s:%s' % (video_id, pcode),
+ 'OoyalaExternal')
diff --git a/youtube_dl/extractor/esri.py b/youtube_dl/extractor/esri.py
new file mode 100644
index 000000000..bf5d2019f
--- /dev/null
+++ b/youtube_dl/extractor/esri.py
@@ -0,0 +1,74 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+ int_or_none,
+ parse_filesize,
+ unified_strdate,
+)
+
+
+class EsriVideoIE(InfoExtractor):
+ _VALID_URL = r'https?://video\.esri\.com/watch/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'https://video.esri.com/watch/1124/arcgis-online-_dash_-developing-applications',
+ 'md5': 'd4aaf1408b221f1b38227a9bbaeb95bc',
+ 'info_dict': {
+ 'id': '1124',
+ 'ext': 'mp4',
+ 'title': 'ArcGIS Online - Developing Applications',
+ 'description': 'Jeremy Bartley demonstrates how to develop applications with ArcGIS Online.',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 185,
+ 'upload_date': '20120419',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ formats = []
+ for width, height, content in re.findall(
+ r'(?s)<li><strong>(\d+)x(\d+):</strong>(.+?)</li>', webpage):
+ for video_url, ext, filesize in re.findall(
+ r'<a[^>]+href="([^"]+)">([^<]+)&nbsp;\(([^<]+)\)</a>', content):
+ formats.append({
+ 'url': compat_urlparse.urljoin(url, video_url),
+ 'ext': ext.lower(),
+ 'format_id': '%s-%s' % (ext.lower(), height),
+ 'width': int(width),
+ 'height': int(height),
+ 'filesize_approx': parse_filesize(filesize),
+ })
+ self._sort_formats(formats)
+
+ title = self._html_search_meta('title', webpage, 'title')
+ description = self._html_search_meta(
+ 'description', webpage, 'description', fatal=False)
+
+ thumbnail = self._html_search_meta('thumbnail', webpage, 'thumbnail', fatal=False)
+ if thumbnail:
+ thumbnail = re.sub(r'_[st]\.jpg$', '_x.jpg', thumbnail)
+
+ duration = int_or_none(self._search_regex(
+ [r'var\s+videoSeconds\s*=\s*(\d+)', r"'duration'\s*:\s*(\d+)"],
+ webpage, 'duration', fatal=False))
+
+ upload_date = unified_strdate(self._html_search_meta(
+ 'last-modified', webpage, 'upload date', fatal=None))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'upload_date': upload_date,
+ 'formats': formats
+ }
diff --git a/youtube_dl/extractor/europa.py b/youtube_dl/extractor/europa.py
new file mode 100644
index 000000000..adc43919e
--- /dev/null
+++ b/youtube_dl/extractor/europa.py
@@ -0,0 +1,93 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+ int_or_none,
+ orderedSet,
+ parse_duration,
+ qualities,
+ unified_strdate,
+ xpath_text
+)
+
+
+class EuropaIE(InfoExtractor):
+ _VALID_URL = r'https?://ec\.europa\.eu/avservices/(?:video/player|audio/audioDetails)\.cfm\?.*?\bref=(?P<id>[A-Za-z0-9-]+)'
+ _TESTS = [{
+ 'url': 'http://ec.europa.eu/avservices/video/player.cfm?ref=I107758',
+ 'md5': '574f080699ddd1e19a675b0ddf010371',
+ 'info_dict': {
+ 'id': 'I107758',
+ 'ext': 'mp4',
+ 'title': 'TRADE - Wikileaks on TTIP',
+ 'description': 'NEW LIVE EC Midday press briefing of 11/08/2015',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'upload_date': '20150811',
+ 'duration': 34,
+ 'view_count': int,
+ 'formats': 'mincount:3',
+ }
+ }, {
+ 'url': 'http://ec.europa.eu/avservices/video/player.cfm?sitelang=en&ref=I107786',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://ec.europa.eu/avservices/audio/audioDetails.cfm?ref=I-109295&sitelang=en',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ playlist = self._download_xml(
+ 'http://ec.europa.eu/avservices/video/player/playlist.cfm?ID=%s' % video_id, video_id)
+
+ def get_item(type_, preference):
+ items = {}
+ for item in playlist.findall('./info/%s/item' % type_):
+ lang, label = xpath_text(item, 'lg', default=None), xpath_text(item, 'label', default=None)
+ if lang and label:
+ items[lang] = label.strip()
+ for p in preference:
+ if items.get(p):
+ return items[p]
+
+ query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+ preferred_lang = query.get('sitelang', ('en', ))[0]
+
+ preferred_langs = orderedSet((preferred_lang, 'en', 'int'))
+
+ title = get_item('title', preferred_langs) or video_id
+ description = get_item('description', preferred_langs)
+ thumbnmail = xpath_text(playlist, './info/thumburl', 'thumbnail')
+ upload_date = unified_strdate(xpath_text(playlist, './info/date', 'upload date'))
+ duration = parse_duration(xpath_text(playlist, './info/duration', 'duration'))
+ view_count = int_or_none(xpath_text(playlist, './info/views', 'views'))
+
+ language_preference = qualities(preferred_langs[::-1])
+
+ formats = []
+ for file_ in playlist.findall('./files/file'):
+ video_url = xpath_text(file_, './url')
+ if not video_url:
+ continue
+ lang = xpath_text(file_, './lg')
+ formats.append({
+ 'url': video_url,
+ 'format_id': lang,
+ 'format_note': xpath_text(file_, './lglabel'),
+ 'language_preference': language_preference(lang)
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnmail,
+ 'upload_date': upload_date,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'formats': formats
+ }
diff --git a/youtube_dl/extractor/everyonesmixtape.py b/youtube_dl/extractor/everyonesmixtape.py
index d872d828f..493d38af8 100644
--- a/youtube_dl/extractor/everyonesmixtape.py
+++ b/youtube_dl/extractor/everyonesmixtape.py
@@ -3,11 +3,9 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_request,
-)
from ..utils import (
ExtractorError,
+ sanitized_Request,
)
@@ -42,7 +40,7 @@ class EveryonesMixtapeIE(InfoExtractor):
playlist_id = mobj.group('id')
pllist_url = 'http://everyonesmixtape.com/mixtape.php?a=getMixes&u=-1&linked=%s&explore=' % playlist_id
- pllist_req = compat_urllib_request.Request(pllist_url)
+ pllist_req = sanitized_Request(pllist_url)
pllist_req.add_header('X-Requested-With', 'XMLHttpRequest')
playlist_list = self._download_json(
@@ -55,7 +53,7 @@ class EveryonesMixtapeIE(InfoExtractor):
raise ExtractorError('Playlist id not found')
pl_url = 'http://everyonesmixtape.com/mixtape.php?a=getMix&id=%s&userId=null&code=' % playlist_no
- pl_req = compat_urllib_request.Request(pl_url)
+ pl_req = sanitized_Request(pl_url)
pl_req.add_header('X-Requested-With', 'XMLHttpRequest')
playlist = self._download_json(
pl_req, playlist_id, note='Downloading playlist info')
diff --git a/youtube_dl/extractor/expotv.py b/youtube_dl/extractor/expotv.py
index a38b773e8..1585a03bb 100644
--- a/youtube_dl/extractor/expotv.py
+++ b/youtube_dl/extractor/expotv.py
@@ -33,20 +33,27 @@ class ExpoTVIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
player_key = self._search_regex(
r'<param name="playerKey" value="([^"]+)"', webpage, 'player key')
- config_url = 'http://client.expotv.com/video/config/%s/%s' % (
- video_id, player_key)
config = self._download_json(
- config_url, video_id,
- note='Downloading video configuration')
+ 'http://client.expotv.com/video/config/%s/%s' % (video_id, player_key),
+ video_id, 'Downloading video configuration')
- formats = [{
- 'url': fcfg['file'],
- 'height': int_or_none(fcfg.get('height')),
- 'format_note': fcfg.get('label'),
- 'ext': self._search_regex(
- r'filename=.*\.([a-z0-9_A-Z]+)&', fcfg['file'],
- 'file extension', default=None),
- } for fcfg in config['sources']]
+ formats = []
+ for fcfg in config['sources']:
+ media_url = fcfg.get('file')
+ if not media_url:
+ continue
+ if fcfg.get('type') == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ media_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls'))
+ else:
+ formats.append({
+ 'url': media_url,
+ 'height': int_or_none(fcfg.get('height')),
+ 'format_id': fcfg.get('label'),
+ 'ext': self._search_regex(
+ r'filename=.*\.([a-z0-9_A-Z]+)&', media_url,
+ 'file extension', default=None) or fcfg.get('type'),
+ })
self._sort_formats(formats)
title = self._og_search_title(webpage)
diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py
index 36ba33128..3403581fd 100644
--- a/youtube_dl/extractor/extremetube.py
+++ b/youtube_dl/extractor/extremetube.py
@@ -3,23 +3,20 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse_urlparse,
- compat_urllib_request,
- compat_urllib_parse,
-)
from ..utils import (
+ int_or_none,
+ sanitized_Request,
str_to_int,
)
class ExtremeTubeIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?(?P<url>extremetube\.com/.*?video/.+?(?P<id>[0-9]+))(?:[/?&]|$)'
+ _VALID_URL = r'https?://(?:www\.)?extremetube\.com/(?:[^/]+/)?video/(?P<id>[^/#?&]+)'
_TESTS = [{
'url': 'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431',
- 'md5': '1fb9228f5e3332ec8c057d6ac36f33e0',
+ 'md5': '344d0c6d50e2f16b06e49ca011d8ac69',
'info_dict': {
- 'id': '652431',
+ 'id': 'music-video-14-british-euro-brit-european-cumshots-swallow-652431',
'ext': 'mp4',
'title': 'Music Video 14 british euro brit european cumshots swallow',
'uploader': 'unknown',
@@ -29,14 +26,18 @@ class ExtremeTubeIE(InfoExtractor):
}, {
'url': 'http://www.extremetube.com/gay/video/abcde-1234',
'only_matching': True,
+ }, {
+ 'url': 'http://www.extremetube.com/video/latina-slut-fucked-by-fat-black-dick',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.extremetube.com/video/652431',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- url = 'http://www.' + mobj.group('url')
+ video_id = self._match_id(url)
- req = compat_urllib_request.Request(url)
+ req = sanitized_Request(url)
req.add_header('Cookie', 'age_verified=1')
webpage = self._download_webpage(req, video_id)
@@ -49,19 +50,43 @@ class ExtremeTubeIE(InfoExtractor):
r'Views:\s*</strong>\s*<span>([\d,\.]+)</span>',
webpage, 'view count', fatal=False))
- video_url = compat_urllib_parse.unquote(self._html_search_regex(
- r'video_url=(.+?)&amp;', webpage, 'video_url'))
- path = compat_urllib_parse_urlparse(video_url).path
- format = path.split('/')[5].split('_')[:2]
- format = "-".join(format)
+ flash_vars = self._parse_json(
+ self._search_regex(
+ r'var\s+flashvars\s*=\s*({.+?});', webpage, 'flash vars'),
+ video_id)
+
+ formats = []
+ for quality_key, video_url in flash_vars.items():
+ height = int_or_none(self._search_regex(
+ r'quality_(\d+)[pP]$', quality_key, 'height', default=None))
+ if not height:
+ continue
+ f = {
+ 'url': video_url,
+ }
+ mobj = re.search(
+ r'/(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+', video_url)
+ if mobj:
+ height = int(mobj.group('height'))
+ bitrate = int(mobj.group('bitrate'))
+ f.update({
+ 'format_id': '%dp-%dk' % (height, bitrate),
+ 'height': height,
+ 'tbr': bitrate,
+ })
+ else:
+ f.update({
+ 'format_id': '%dp' % height,
+ 'height': height,
+ })
+ formats.append(f)
+ self._sort_formats(formats)
return {
'id': video_id,
'title': video_title,
+ 'formats': formats,
'uploader': uploader,
'view_count': view_count,
- 'url': video_url,
- 'format': format,
- 'format_id': format,
'age_limit': 18,
}
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index 1ad4e77a8..fd854411b 100644
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -9,14 +9,15 @@ from ..compat import (
compat_http_client,
compat_str,
compat_urllib_error,
- compat_urllib_parse,
- compat_urllib_request,
+ compat_urllib_parse_unquote,
)
from ..utils import (
ExtractorError,
- int_or_none,
limit_length,
+ sanitized_Request,
urlencode_postdata,
+ get_element_by_id,
+ clean_html,
)
@@ -24,8 +25,12 @@ class FacebookIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://(?:\w+\.)?facebook\.com/
(?:[^#]*?\#!/)?
- (?:video/video\.php|photo\.php|video\.php|video/embed)\?(?:.*?)
- (?:v|video_id)=(?P<id>[0-9]+)
+ (?:
+ (?:video/video\.php|photo\.php|video\.php|video/embed)\?(?:.*?)
+ (?:v|video_id)=|
+ [^/]+/videos/(?:[^/]+/)?
+ )
+ (?P<id>[0-9]+)
(?:.*)'''
_LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1'
_CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1'
@@ -38,6 +43,7 @@ class FacebookIE(InfoExtractor):
'id': '637842556329505',
'ext': 'mp4',
'title': 're:Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam',
+ 'uploader': 'Tennis on Facebook',
}
}, {
'note': 'Video without discernible title',
@@ -46,10 +52,20 @@ class FacebookIE(InfoExtractor):
'id': '274175099429670',
'ext': 'mp4',
'title': 'Facebook video #274175099429670',
- }
+ 'uploader': 'Asif Nawab Butt',
+ },
+ 'expected_warnings': [
+ 'title'
+ ]
}, {
'url': 'https://www.facebook.com/video.php?v=10204634152394104',
'only_matching': True,
+ }, {
+ 'url': 'https://www.facebook.com/amogood/videos/1618742068337349/?fref=nf',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.facebook.com/ChristyClarkForBC/videos/vb.22819070941/10153870694020942/?type=2&theater',
+ 'only_matching': True,
}]
def _login(self):
@@ -57,7 +73,7 @@ class FacebookIE(InfoExtractor):
if useremail is None:
return
- login_page_req = compat_urllib_request.Request(self._LOGIN_URL)
+ login_page_req = sanitized_Request(self._LOGIN_URL)
login_page_req.add_header('Cookie', 'locale=en_US')
login_page = self._download_webpage(login_page_req, None,
note='Downloading login page',
@@ -78,7 +94,7 @@ class FacebookIE(InfoExtractor):
'timezone': '-60',
'trynum': '1',
}
- request = compat_urllib_request.Request(self._LOGIN_URL, urlencode_postdata(login_form))
+ request = sanitized_Request(self._LOGIN_URL, urlencode_postdata(login_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
try:
login_results = self._download_webpage(request, None,
@@ -93,7 +109,7 @@ class FacebookIE(InfoExtractor):
r'name="h"\s+(?:\w+="[^"]+"\s+)*?value="([^"]+)"', login_results, 'h'),
'name_action_selected': 'dont_save',
}
- check_req = compat_urllib_request.Request(self._CHECKPOINT_URL, urlencode_postdata(check_form))
+ check_req = sanitized_Request(self._CHECKPOINT_URL, urlencode_postdata(check_form))
check_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
check_response = self._download_webpage(check_req, None,
note='Confirming login')
@@ -123,30 +139,40 @@ class FacebookIE(InfoExtractor):
else:
raise ExtractorError('Cannot parse data')
data = dict(json.loads(m.group(1)))
- params_raw = compat_urllib_parse.unquote(data['params'])
+ params_raw = compat_urllib_parse_unquote(data['params'])
params = json.loads(params_raw)
- video_data = params['video_data'][0]
- video_url = video_data.get('hd_src')
- if not video_url:
- video_url = video_data['sd_src']
- if not video_url:
- raise ExtractorError('Cannot find video URL')
+
+ formats = []
+ for format_id, f in params['video_data'].items():
+ if not f or not isinstance(f, list):
+ continue
+ for quality in ('sd', 'hd'):
+ for src_type in ('src', 'src_no_ratelimit'):
+ src = f[0].get('%s_%s' % (quality, src_type))
+ if src:
+ formats.append({
+ 'format_id': '%s_%s_%s' % (format_id, quality, src_type),
+ 'url': src,
+ 'preference': -10 if format_id == 'progressive' else 0,
+ })
+ if not formats:
+ raise ExtractorError('Cannot find video formats')
video_title = self._html_search_regex(
- r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, 'title',
- fatal=False)
+ r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, 'title',
+ default=None)
if not video_title:
video_title = self._html_search_regex(
r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>',
- webpage, 'alternative title', default=None)
+ webpage, 'alternative title', fatal=False)
video_title = limit_length(video_title, 80)
if not video_title:
video_title = 'Facebook video #%s' % video_id
+ uploader = clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage))
return {
'id': video_id,
'title': video_title,
- 'url': video_url,
- 'duration': int_or_none(video_data.get('video_duration')),
- 'thumbnail': video_data.get('thumbnail_src'),
+ 'formats': formats,
+ 'uploader': uploader,
}
diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py
index 3c39ca451..cebdd0193 100644
--- a/youtube_dl/extractor/faz.py
+++ b/youtube_dl/extractor/faz.py
@@ -6,9 +6,9 @@ from .common import InfoExtractor
class FazIE(InfoExtractor):
IE_NAME = 'faz.net'
- _VALID_URL = r'https?://www\.faz\.net/multimedia/videos/.*?-(?P<id>\d+)\.html'
+ _VALID_URL = r'https?://(?:www\.)?faz\.net/(?:[^/]+/)*.*?-(?P<id>\d+)\.html'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.faz.net/multimedia/videos/stockholm-chemie-nobelpreis-fuer-drei-amerikanische-forscher-12610585.html',
'info_dict': {
'id': '12610585',
@@ -16,7 +16,22 @@ class FazIE(InfoExtractor):
'title': 'Stockholm: Chemie-Nobelpreis für drei amerikanische Forscher',
'description': 'md5:1453fbf9a0d041d985a47306192ea253',
},
- }
+ }, {
+ 'url': 'http://www.faz.net/aktuell/politik/berlin-gabriel-besteht-zerreissprobe-ueber-datenspeicherung-13659345.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.faz.net/berlin-gabriel-besteht-zerreissprobe-ueber-datenspeicherung-13659345.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.faz.net/-13659345.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.faz.net/aktuell/politik/-13659345.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.faz.net/foobarblafasel-13659345.html',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py
index 1ccc1a964..92e8c571f 100644
--- a/youtube_dl/extractor/fc2.py
+++ b/youtube_dl/extractor/fc2.py
@@ -10,12 +10,14 @@ from ..compat import (
compat_urlparse,
)
from ..utils import (
+ encode_dict,
ExtractorError,
+ sanitized_Request,
)
class FC2IE(InfoExtractor):
- _VALID_URL = r'^http://video\.fc2\.com/(?:[^/]+/)?content/(?P<id>[^/]+)'
+ _VALID_URL = r'^http://video\.fc2\.com/(?:[^/]+/)*content/(?P<id>[^/]+)'
IE_NAME = 'fc2'
_NETRC_MACHINE = 'fc2'
_TESTS = [{
@@ -37,6 +39,9 @@ class FC2IE(InfoExtractor):
'password': '(snip)',
'skip': 'requires actual password'
}
+ }, {
+ 'url': 'http://video.fc2.com/en/a/content/20130926eZpARwsF',
+ 'only_matching': True,
}]
def _login(self):
@@ -52,11 +57,8 @@ class FC2IE(InfoExtractor):
'Submit': ' Login ',
}
- # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
- # chokes on unicode
- login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
- login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8')
- request = compat_urllib_request.Request(
+ login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('utf-8')
+ request = sanitized_Request(
'https://secure.id.fc2.com/index.php?mode=login&switch_language=en', login_data)
login_results = self._download_webpage(request, None, note='Logging in', errnote='Unable to log in')
@@ -65,7 +67,7 @@ class FC2IE(InfoExtractor):
return False
# this is also needed
- login_redir = compat_urllib_request.Request('http://id.fc2.com/?mode=redirect&login=done')
+ login_redir = sanitized_Request('http://id.fc2.com/?mode=redirect&login=done')
self._download_webpage(
login_redir, None, note='Login redirect', errnote='Login redirect failed')
@@ -80,13 +82,13 @@ class FC2IE(InfoExtractor):
title = self._og_search_title(webpage)
thumbnail = self._og_search_thumbnail(webpage)
- refer = url.replace('/content/', '/a/content/')
+ refer = url.replace('/content/', '/a/content/') if '/a/content/' not in url else url
mimi = hashlib.md5((video_id + '_gGddgPfeaf_gzyr').encode('utf-8')).hexdigest()
info_url = (
"http://video.fc2.com/ginfo.php?mimi={1:s}&href={2:s}&v={0:s}&fversion=WIN%2011%2C6%2C602%2C180&from=2&otag=0&upid={0:s}&tk=null&".
- format(video_id, mimi, compat_urllib_request.quote(refer, safe='').replace('.', '%2E')))
+ format(video_id, mimi, compat_urllib_request.quote(refer, safe=b'').replace('.', '%2E')))
info_webpage = self._download_webpage(
info_url, video_id, note='Downloading info page')
diff --git a/youtube_dl/extractor/fczenit.py b/youtube_dl/extractor/fczenit.py
new file mode 100644
index 000000000..f1f150ef2
--- /dev/null
+++ b/youtube_dl/extractor/fczenit.py
@@ -0,0 +1,41 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class FczenitIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/gl(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://fc-zenit.ru/video/gl6785/',
+ 'md5': '458bacc24549173fe5a5aa29174a5606',
+ 'info_dict': {
+ 'id': '6785',
+ 'ext': 'mp4',
+ 'title': '«Зенит-ТВ»: как Олег Шатов играл против «Урала»',
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ video_title = self._html_search_regex(r'<div class=\"photoalbum__title\">([^<]+)', webpage, 'title')
+
+ bitrates_raw = self._html_search_regex(r'bitrates:.*\n(.*)\]', webpage, 'video URL')
+ bitrates = re.findall(r'url:.?\'(.+?)\'.*?bitrate:.?([0-9]{3}?)', bitrates_raw)
+
+ formats = [{
+ 'url': furl,
+ 'tbr': tbr,
+ } for furl, tbr in bitrates]
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': video_title,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/firedrive.py b/youtube_dl/extractor/firedrive.py
deleted file mode 100644
index 3191116d9..000000000
--- a/youtube_dl/extractor/firedrive.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
- compat_urllib_request,
-)
-from ..utils import (
- ExtractorError,
-)
-
-
-class FiredriveIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?firedrive\.com/' + \
- '(?:file|embed)/(?P<id>[0-9a-zA-Z]+)'
- _FILE_DELETED_REGEX = r'<div class="removed_file_image">'
-
- _TESTS = [{
- 'url': 'https://www.firedrive.com/file/FEB892FA160EBD01',
- 'md5': 'd5d4252f80ebeab4dc2d5ceaed1b7970',
- 'info_dict': {
- 'id': 'FEB892FA160EBD01',
- 'ext': 'flv',
- 'title': 'bbb_theora_486kbit.flv',
- 'thumbnail': 're:^http://.*\.jpg$',
- },
- }]
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- url = 'http://firedrive.com/file/%s' % video_id
- webpage = self._download_webpage(url, video_id)
-
- if re.search(self._FILE_DELETED_REGEX, webpage) is not None:
- raise ExtractorError('Video %s does not exist' % video_id,
- expected=True)
-
- fields = dict(re.findall(r'''(?x)<input\s+
- type="hidden"\s+
- name="([^"]+)"\s+
- value="([^"]*)"
- ''', webpage))
-
- post = compat_urllib_parse.urlencode(fields)
- req = compat_urllib_request.Request(url, post)
- req.add_header('Content-type', 'application/x-www-form-urlencoded')
-
- # Apparently, this header is required for confirmation to work.
- req.add_header('Host', 'www.firedrive.com')
-
- webpage = self._download_webpage(req, video_id,
- 'Downloading video page')
-
- title = self._search_regex(r'class="external_title_left">(.+)</div>',
- webpage, 'title')
- thumbnail = self._search_regex(r'image:\s?"(//[^\"]+)', webpage,
- 'thumbnail', fatal=False)
- if thumbnail is not None:
- thumbnail = 'http:' + thumbnail
-
- ext = self._search_regex(r'type:\s?\'([^\']+)\',',
- webpage, 'extension', fatal=False)
- video_url = self._search_regex(
- r'file:\s?loadURL\(\'(http[^\']+)\'\),', webpage, 'file url')
-
- formats = [{
- 'format_id': 'sd',
- 'url': video_url,
- 'ext': ext,
- }]
-
- return {
- 'id': video_id,
- 'title': title,
- 'thumbnail': thumbnail,
- 'formats': formats,
- }
diff --git a/youtube_dl/extractor/firsttv.py b/youtube_dl/extractor/firsttv.py
index 6e015ca16..510d4b108 100644
--- a/youtube_dl/extractor/firsttv.py
+++ b/youtube_dl/extractor/firsttv.py
@@ -1,8 +1,6 @@
# encoding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..utils import int_or_none
diff --git a/youtube_dl/extractor/fivemin.py b/youtube_dl/extractor/fivemin.py
index 5b24b921c..2955965d9 100644
--- a/youtube_dl/extractor/fivemin.py
+++ b/youtube_dl/extractor/fivemin.py
@@ -2,11 +2,15 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import (
- compat_str,
compat_urllib_parse,
+ compat_parse_qs,
+ compat_urllib_parse_urlparse,
+ compat_urlparse,
)
from ..utils import (
ExtractorError,
+ parse_duration,
+ replace_extension,
)
@@ -14,6 +18,7 @@ class FiveMinIE(InfoExtractor):
IE_NAME = '5min'
_VALID_URL = r'''(?x)
(?:https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(?:.*?&)?playList=|
+ https?://(?:(?:massively|www)\.)?joystiq\.com/video/|
5min:)
(?P<id>\d+)
'''
@@ -27,6 +32,7 @@ class FiveMinIE(InfoExtractor):
'id': '518013791',
'ext': 'mp4',
'title': 'iPad Mini with Retina Display Review',
+ 'duration': 177,
},
},
{
@@ -37,9 +43,52 @@ class FiveMinIE(InfoExtractor):
'id': '518086247',
'ext': 'mp4',
'title': 'How to Make a Next-Level Fruit Salad',
+ 'duration': 184,
},
},
]
+ _ERRORS = {
+ 'ErrorVideoNotExist': 'We\'re sorry, but the video you are trying to watch does not exist.',
+ 'ErrorVideoNoLongerAvailable': 'We\'re sorry, but the video you are trying to watch is no longer available.',
+ 'ErrorVideoRejected': 'We\'re sorry, but the video you are trying to watch has been removed.',
+ 'ErrorVideoUserNotGeo': 'We\'re sorry, but the video you are trying to watch cannot be viewed from your current location.',
+ 'ErrorVideoLibraryRestriction': 'We\'re sorry, but the video you are trying to watch is currently unavailable for viewing at this domain.',
+ 'ErrorExposurePermission': 'We\'re sorry, but the video you are trying to watch is currently unavailable for viewing at this domain.',
+ }
+ _QUALITIES = {
+ 1: {
+ 'width': 640,
+ 'height': 360,
+ },
+ 2: {
+ 'width': 854,
+ 'height': 480,
+ },
+ 4: {
+ 'width': 1280,
+ 'height': 720,
+ },
+ 8: {
+ 'width': 1920,
+ 'height': 1080,
+ },
+ 16: {
+ 'width': 640,
+ 'height': 360,
+ },
+ 32: {
+ 'width': 854,
+ 'height': 480,
+ },
+ 64: {
+ 'width': 1280,
+ 'height': 720,
+ },
+ 128: {
+ 'width': 640,
+ 'height': 360,
+ },
+ }
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -58,26 +107,36 @@ class FiveMinIE(InfoExtractor):
'https://syn.5min.com/handlers/SenseHandler.ashx?' + query,
video_id)
if not response['success']:
- err_msg = response['errorMessage']
- if err_msg == 'ErrorVideoUserNotGeo':
- msg = 'Video not available from your location'
- else:
- msg = 'Aol said: %s' % err_msg
- raise ExtractorError(msg, expected=True, video_id=video_id)
+ raise ExtractorError(
+ '%s said: %s' % (
+ self.IE_NAME,
+ self._ERRORS.get(response['errorMessage'], response['errorMessage'])),
+ expected=True)
info = response['binding'][0]
- second_id = compat_str(int(video_id[:-2]) + 1)
formats = []
- for quality, height in [(1, 320), (2, 480), (4, 720), (8, 1080)]:
- if any(r['ID'] == quality for r in info['Renditions']):
+ parsed_video_url = compat_urllib_parse_urlparse(compat_parse_qs(
+ compat_urllib_parse_urlparse(info['EmbededURL']).query)['videoUrl'][0])
+ for rendition in info['Renditions']:
+ if rendition['RenditionType'] == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(rendition['Url'], video_id, m3u8_id='hls'))
+ elif rendition['RenditionType'] == 'aac':
+ continue
+ else:
+ rendition_url = compat_urlparse.urlunparse(parsed_video_url._replace(path=replace_extension(parsed_video_url.path.replace('//', '/%s/' % rendition['ID']), rendition['RenditionType'])))
+ quality = self._QUALITIES.get(rendition['ID'], {})
formats.append({
- 'format_id': compat_str(quality),
- 'url': 'http://avideos.5min.com/%s/%s/%s_%s.mp4' % (second_id[-3:], second_id, video_id, quality),
- 'height': height,
+ 'format_id': '%s-%d' % (rendition['RenditionType'], rendition['ID']),
+ 'url': rendition_url,
+ 'width': quality.get('width'),
+ 'height': quality.get('height'),
})
+ self._sort_formats(formats)
return {
'id': video_id,
'title': info['Title'],
+ 'thumbnail': info.get('ThumbURL'),
+ 'duration': parse_duration(info.get('Duration')),
'formats': formats,
}
diff --git a/youtube_dl/extractor/fivetv.py b/youtube_dl/extractor/fivetv.py
new file mode 100644
index 000000000..13fbc4da2
--- /dev/null
+++ b/youtube_dl/extractor/fivetv.py
@@ -0,0 +1,88 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class FiveTVIE(InfoExtractor):
+ _VALID_URL = r'''(?x)
+ http://
+ (?:www\.)?5-tv\.ru/
+ (?:
+ (?:[^/]+/)+(?P<id>\d+)|
+ (?P<path>[^/?#]+)(?:[/?#])?
+ )
+ '''
+
+ _TESTS = [{
+ 'url': 'http://5-tv.ru/news/96814/',
+ 'md5': 'bbff554ad415ecf5416a2f48c22d9283',
+ 'info_dict': {
+ 'id': '96814',
+ 'ext': 'mp4',
+ 'title': 'Россияне выбрали имя для общенациональной платежной системы',
+ 'description': 'md5:a8aa13e2b7ad36789e9f77a74b6de660',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 180,
+ },
+ }, {
+ 'url': 'http://5-tv.ru/video/1021729/',
+ 'info_dict': {
+ 'id': '1021729',
+ 'ext': 'mp4',
+ 'title': '3D принтер',
+ 'description': 'md5:d76c736d29ef7ec5c0cf7d7c65ffcb41',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 180,
+ },
+ }, {
+ 'url': 'http://www.5-tv.ru/glavnoe/#itemDetails',
+ 'info_dict': {
+ 'id': 'glavnoe',
+ 'ext': 'mp4',
+ 'title': 'Итоги недели с 8 по 14 июня 2015 года',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ }, {
+ 'url': 'http://www.5-tv.ru/glavnoe/broadcasts/508645/',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://5-tv.ru/films/1507502/',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://5-tv.ru/programs/broadcast/508713/',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://5-tv.ru/angel/',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.5-tv.ru/schedule/?iframe=true&width=900&height=450',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id') or mobj.group('path')
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_url = self._search_regex(
+ r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"',
+ webpage, 'video url')
+
+ title = self._og_search_title(webpage, default=None) or self._search_regex(
+ r'<title>([^<]+)</title>', webpage, 'title')
+ duration = int_or_none(self._og_search_property(
+ 'video:duration', webpage, 'duration', default=None))
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': title,
+ 'description': self._og_search_description(webpage, default=None),
+ 'thumbnail': self._og_search_thumbnail(webpage, default=None),
+ 'duration': duration,
+ }
diff --git a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py
index 190d9f9ad..40ea27895 100644
--- a/youtube_dl/extractor/fktv.py
+++ b/youtube_dl/extractor/fktv.py
@@ -1,13 +1,12 @@
from __future__ import unicode_literals
import re
-import random
-import json
from .common import InfoExtractor
from ..utils import (
- get_element_by_id,
clean_html,
+ determine_ext,
+ ExtractorError,
)
@@ -17,66 +16,40 @@ class FKTVIE(InfoExtractor):
_TEST = {
'url': 'http://fernsehkritik.tv/folge-1',
+ 'md5': '21f0b0c99bce7d5b524eb1b17b1c6d79',
'info_dict': {
- 'id': '00011',
- 'ext': 'flv',
+ 'id': '1',
+ 'ext': 'mp4',
'title': 'Folge 1 vom 10. April 2007',
- 'description': 'md5:fb4818139c7cfe6907d4b83412a6864f',
+ 'thumbnail': 're:^https?://.*\.jpg$',
},
}
def _real_extract(self, url):
- episode = int(self._match_id(url))
-
- video_thumbnail = 'http://fernsehkritik.tv/images/magazin/folge%s.jpg' % episode
- start_webpage = self._download_webpage('http://fernsehkritik.tv/folge-%s/Start' % episode,
- episode)
- playlist = self._search_regex(r'playlist = (\[.*?\]);', start_webpage,
- 'playlist', flags=re.DOTALL)
- files = json.loads(re.sub('{[^{}]*?}', '{}', playlist))
-
- videos = []
- for i, _ in enumerate(files, 1):
- video_id = '%04d%d' % (episode, i)
- video_url = 'http://fernsehkritik.tv/js/directme.php?file=%s%s.flv' % (episode, '' if i == 1 else '-%d' % i)
- videos.append({
- 'ext': 'flv',
- 'id': video_id,
- 'url': video_url,
- 'title': clean_html(get_element_by_id('eptitle', start_webpage)),
- 'description': clean_html(get_element_by_id('contentlist', start_webpage)),
- 'thumbnail': video_thumbnail
- })
- return {
- '_type': 'multi_video',
- 'entries': videos,
- 'id': 'folge-%s' % episode,
- }
-
-
-class FKTVPosteckeIE(InfoExtractor):
- IE_NAME = 'fernsehkritik.tv:postecke'
- _VALID_URL = r'http://(?:www\.)?fernsehkritik\.tv/inline-video/postecke\.php\?(.*&)?ep=(?P<ep>[0-9]+)(&|$)'
- _TEST = {
- 'url': 'http://fernsehkritik.tv/inline-video/postecke.php?iframe=true&width=625&height=440&ep=120',
- 'md5': '262f0adbac80317412f7e57b4808e5c4',
- 'info_dict': {
- 'id': '0120',
- 'ext': 'flv',
- 'title': 'Postecke 120',
- }
- }
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- episode = int(mobj.group('ep'))
-
- server = random.randint(2, 4)
- video_id = '%04d' % episode
- video_url = 'http://dl%d.fernsehkritik.tv/postecke/postecke%d.flv' % (server, episode)
- video_title = 'Postecke %d' % episode
+ episode = self._match_id(url)
+
+ webpage = self._download_webpage(
+ 'http://fernsehkritik.tv/folge-%s/play' % episode, episode)
+ title = clean_html(self._html_search_regex(
+ '<h3>([^<]+)</h3>', webpage, 'title'))
+ matches = re.search(
+ r'(?s)<video(?:(?!poster)[^>])+(?:poster="([^"]+)")?[^>]*>(.*)</video>',
+ webpage)
+ if matches is None:
+ raise ExtractorError('Unable to extract the video')
+
+ poster, sources = matches.groups()
+ if poster is None:
+ self.report_warning('unable to extract thumbnail')
+
+ urls = re.findall(r'<source[^>]+src="([^"]+)"', sources)
+ formats = [{
+ 'url': furl,
+ 'format_id': determine_ext(furl),
+ } for furl in urls]
return {
- 'id': video_id,
- 'url': video_url,
- 'title': video_title,
+ 'id': episode,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': poster,
}
diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py
index 0c858b654..91cd46e76 100644
--- a/youtube_dl/extractor/flickr.py
+++ b/youtube_dl/extractor/flickr.py
@@ -5,7 +5,8 @@ import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
- unescapeHTML,
+ find_xpath_attr,
+ sanitized_Request,
)
@@ -29,25 +30,31 @@ class FlickrIE(InfoExtractor):
video_id = mobj.group('id')
video_uploader_id = mobj.group('uploader_id')
webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
- webpage = self._download_webpage(webpage_url, video_id)
+ req = sanitized_Request(webpage_url)
+ req.add_header(
+ 'User-Agent',
+ # it needs a more recent version
+ 'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20150101 Firefox/38.0 (Chrome)')
+ webpage = self._download_webpage(req, video_id)
- secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, 'secret')
+ secret = self._search_regex(r'secret"\s*:\s*"(\w+)"', webpage, 'secret')
first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
- first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
+ first_xml = self._download_xml(first_url, video_id, 'Downloading first data webpage')
- node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
- first_xml, 'node_id')
+ node_id = find_xpath_attr(
+ first_xml, './/{http://video.yahoo.com/YEP/1.0/}Item', 'id',
+ 'id').text
second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
- second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
+ second_xml = self._download_xml(second_url, video_id, 'Downloading second data webpage')
self.report_extraction(video_id)
- mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
- if mobj is None:
+ stream = second_xml.find('.//STREAM')
+ if stream is None:
raise ExtractorError('Unable to extract video url')
- video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
+ video_url = stream.attrib['APP'] + stream.attrib['FULLPATH']
return {
'id': video_id,
diff --git a/youtube_dl/extractor/folketinget.py b/youtube_dl/extractor/folketinget.py
index 0fb29de75..75399fa7d 100644
--- a/youtube_dl/extractor/folketinget.py
+++ b/youtube_dl/extractor/folketinget.py
@@ -30,6 +30,10 @@ class FolketingetIE(InfoExtractor):
'upload_date': '20141120',
'duration': 3960,
},
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
}
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/footyroom.py b/youtube_dl/extractor/footyroom.py
new file mode 100644
index 000000000..4c7dbca40
--- /dev/null
+++ b/youtube_dl/extractor/footyroom.py
@@ -0,0 +1,49 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class FootyRoomIE(InfoExtractor):
+ _VALID_URL = r'http://footyroom\.com/(?P<id>[^/]+)'
+ _TESTS = [{
+ 'url': 'http://footyroom.com/schalke-04-0-2-real-madrid-2015-02/',
+ 'info_dict': {
+ 'id': 'schalke-04-0-2-real-madrid-2015-02',
+ 'title': 'Schalke 04 0 – 2 Real Madrid',
+ },
+ 'playlist_count': 3,
+ }, {
+ 'url': 'http://footyroom.com/georgia-0-2-germany-2015-03/',
+ 'info_dict': {
+ 'id': 'georgia-0-2-germany-2015-03',
+ 'title': 'Georgia 0 – 2 Germany',
+ },
+ 'playlist_count': 1,
+ }]
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, playlist_id)
+
+ playlist = self._parse_json(
+ self._search_regex(
+ r'VideoSelector\.load\((\[.+?\])\);', webpage, 'video selector'),
+ playlist_id)
+
+ playlist_title = self._og_search_title(webpage)
+
+ entries = []
+ for video in playlist:
+ payload = video.get('payload')
+ if not payload:
+ continue
+ playwire_url = self._search_regex(
+ r'data-config="([^"]+)"', payload,
+ 'playwire url', default=None)
+ if playwire_url:
+ entries.append(self.url_result(self._proto_relative_url(
+ playwire_url, 'http:'), 'Playwire'))
+
+ return self.playlist_result(entries, playlist_id, playlist_title)
diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py
index b2284ab01..fc4a5a0fb 100644
--- a/youtube_dl/extractor/fourtube.py
+++ b/youtube_dl/extractor/fourtube.py
@@ -3,12 +3,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_request,
-)
from ..utils import (
parse_duration,
parse_iso8601,
+ sanitized_Request,
str_to_int,
)
@@ -32,6 +30,7 @@ class FourTubeIE(InfoExtractor):
'view_count': int,
'like_count': int,
'categories': list,
+ 'age_limit': 18,
}
}
@@ -45,10 +44,10 @@ class FourTubeIE(InfoExtractor):
thumbnail = self._html_search_meta('thumbnailUrl', webpage)
uploader_id = self._html_search_regex(
r'<a class="img-avatar" href="[^"]+/channels/([^/"]+)" title="Go to [^"]+ page">',
- webpage, 'uploader id')
+ webpage, 'uploader id', fatal=False)
uploader = self._html_search_regex(
r'<a class="img-avatar" href="[^"]+/channels/[^/"]+" title="Go to ([^"]+) page">',
- webpage, 'uploader')
+ webpage, 'uploader', fatal=False)
categories_html = self._search_regex(
r'(?s)><i class="icon icon-tag"></i>\s*Categories / Tags\s*.*?<ul class="list">(.*?)</ul>',
@@ -67,13 +66,24 @@ class FourTubeIE(InfoExtractor):
webpage, 'like count', fatal=False))
duration = parse_duration(self._html_search_meta('duration', webpage))
- params_js = self._search_regex(
- r'\$\.ajax\(url,\ opts\);\s*\}\s*\}\)\(([0-9,\[\] ]+)\)',
- webpage, 'initialization parameters'
- )
- params = self._parse_json('[%s]' % params_js, video_id)
- media_id = params[0]
- sources = ['%s' % p for p in params[2]]
+ media_id = self._search_regex(
+ r'<button[^>]+data-id=(["\'])(?P<id>\d+)\1[^>]+data-quality=', webpage,
+ 'media id', default=None, group='id')
+ sources = [
+ quality
+ for _, quality in re.findall(r'<button[^>]+data-quality=(["\'])(.+?)\1', webpage)]
+ if not (media_id and sources):
+ player_js = self._download_webpage(
+ self._search_regex(
+ r'<script[^>]id=(["\'])playerembed\1[^>]+src=(["\'])(?P<url>.+?)\2',
+ webpage, 'player JS', group='url'),
+ video_id, 'Downloading player JS')
+ params_js = self._search_regex(
+ r'\$\.ajax\(url,\ opts\);\s*\}\s*\}\)\(([0-9,\[\] ]+)\)',
+ player_js, 'initialization parameters')
+ params = self._parse_json('[%s]' % params_js, video_id)
+ media_id = params[0]
+ sources = ['%s' % p for p in params[2]]
token_url = 'http://tkn.4tube.com/{0}/desktop/{1}'.format(
media_id, '+'.join(sources))
@@ -81,7 +91,7 @@ class FourTubeIE(InfoExtractor):
b'Content-Type': b'application/x-www-form-urlencoded',
b'Origin': b'http://www.4tube.com',
}
- token_req = compat_urllib_request.Request(token_url, b'{}', headers)
+ token_req = sanitized_Request(token_url, b'{}', headers)
tokens = self._download_json(token_req, video_id)
formats = [{
'url': tokens[format]['token'],
diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py
index 917f76b1e..3a4a59135 100644
--- a/youtube_dl/extractor/foxnews.py
+++ b/youtube_dl/extractor/foxnews.py
@@ -1,5 +1,7 @@
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..utils import (
parse_iso8601,
@@ -8,7 +10,8 @@ from ..utils import (
class FoxNewsIE(InfoExtractor):
- _VALID_URL = r'https?://video\.foxnews\.com/v/(?:video-embed\.html\?video_id=)?(?P<id>\d+)'
+ IE_DESC = 'Fox News and Fox Business Video'
+ _VALID_URL = r'https?://(?P<host>video\.fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P<id>\d+)'
_TESTS = [
{
'url': 'http://video.foxnews.com/v/3937480/frozen-in-time/#sp=show-clips',
@@ -42,13 +45,19 @@ class FoxNewsIE(InfoExtractor):
'url': 'http://video.foxnews.com/v/video-embed.html?video_id=3937480&d=video.foxnews.com',
'only_matching': True,
},
+ {
+ 'url': 'http://video.foxbusiness.com/v/4442309889001',
+ 'only_matching': True,
+ },
]
def _real_extract(self, url):
- video_id = self._match_id(url)
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ host = mobj.group('host')
video = self._download_json(
- 'http://video.foxnews.com/v/feed/video/%s.js?template=fox' % video_id, video_id)
+ 'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id), video_id)
item = video['channel']['item']
title = item['title']
diff --git a/youtube_dl/extractor/foxsports.py b/youtube_dl/extractor/foxsports.py
new file mode 100644
index 000000000..df7665176
--- /dev/null
+++ b/youtube_dl/extractor/foxsports.py
@@ -0,0 +1,32 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import smuggle_url
+
+
+class FoxSportsIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?foxsports\.com/(?:[^/]+/)*(?P<id>[^/]+)'
+
+ _TEST = {
+ 'url': 'http://www.foxsports.com/video?vid=432609859715',
+ 'info_dict': {
+ 'id': 'gA0bHB3Ladz3',
+ 'ext': 'flv',
+ 'title': 'Courtney Lee on going up 2-0 in series vs. Blazers',
+ 'description': 'Courtney Lee talks about Memphis being focused.',
+ },
+ 'add_ie': ['ThePlatform'],
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ config = self._parse_json(
+ self._search_regex(
+ r"data-player-config='([^']+)'", webpage, 'data player config'),
+ video_id)
+
+ return self.url_result(smuggle_url(
+ config['releaseURL'] + '&manifest=f4m', {'force_smil_url': True}))
diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py
index 170d68075..8e60cf60f 100644
--- a/youtube_dl/extractor/francetv.py
+++ b/youtube_dl/extractor/francetv.py
@@ -6,16 +6,15 @@ import re
import json
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse_urlparse,
- compat_urlparse,
-)
+from ..compat import compat_urlparse
from ..utils import (
clean_html,
ExtractorError,
int_or_none,
parse_duration,
+ determine_ext,
)
+from .dailymotion import DailymotionCloudIE
class FranceTVBaseInfoExtractor(InfoExtractor):
@@ -50,22 +49,20 @@ class FranceTVBaseInfoExtractor(InfoExtractor):
if not video_url:
continue
format_id = video['format']
- if video_url.endswith('.f4m'):
+ ext = determine_ext(video_url)
+ if ext == 'f4m':
if georestricted:
# See https://github.com/rg3/youtube-dl/issues/3963
# m3u8 urls work fine
continue
- video_url_parsed = compat_urllib_parse_urlparse(video_url)
f4m_url = self._download_webpage(
- 'http://hdfauth.francetv.fr/esi/urltokengen2.html?url=%s' % video_url_parsed.path,
+ 'http://hdfauth.francetv.fr/esi/TA?url=%s' % video_url,
video_id, 'Downloading f4m manifest token', fatal=False)
if f4m_url:
- f4m_formats = self._extract_f4m_formats(f4m_url, video_id)
- for f4m_format in f4m_formats:
- f4m_format['preference'] = 1
- formats.extend(f4m_formats)
- elif video_url.endswith('.m3u8'):
- formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4'))
+ formats.extend(self._extract_f4m_formats(
+ f4m_url + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, 1, format_id))
+ elif ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4', m3u8_id=format_id))
elif video_url.startswith('rtmp'):
formats.append({
'url': video_url,
@@ -81,28 +78,48 @@ class FranceTVBaseInfoExtractor(InfoExtractor):
})
self._sort_formats(formats)
+ title = info['titre']
+ subtitle = info.get('sous_titre')
+ if subtitle:
+ title += ' - %s' % subtitle
+
+ subtitles = {}
+ subtitles_list = [{
+ 'url': subformat['url'],
+ 'ext': subformat.get('format'),
+ } for subformat in info.get('subtitles', []) if subformat.get('url')]
+ if subtitles_list:
+ subtitles['fr'] = subtitles_list
+
return {
'id': video_id,
- 'title': info['titre'],
+ 'title': title,
'description': clean_html(info['synopsis']),
'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', info['image']),
- 'duration': parse_duration(info['duree']),
+ 'duration': int_or_none(info.get('real_duration')) or parse_duration(info['duree']),
'timestamp': int_or_none(info['diffusion']['timestamp']),
'formats': formats,
+ 'subtitles': subtitles,
}
class PluzzIE(FranceTVBaseInfoExtractor):
IE_NAME = 'pluzz.francetv.fr'
- _VALID_URL = r'https?://pluzz\.francetv\.fr/videos/(.*?)\.html'
+ _VALID_URL = r'https?://(?:m\.)?pluzz\.francetv\.fr/videos/(?P<id>.+?)\.html'
# Can't use tests, videos expire in 7 days
def _real_extract(self, url):
- title = re.match(self._VALID_URL, url).group(1)
- webpage = self._download_webpage(url, title)
- video_id = self._search_regex(
- r'data-diffusion="(\d+)"', webpage, 'ID')
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+
+ video_id = self._html_search_meta(
+ 'id_video', webpage, 'video id', default=None)
+ if not video_id:
+ video_id = self._search_regex(
+ r'data-diffusion=["\'](\d+)', webpage, 'video id')
+
return self._extract_video(video_id, 'Pluzz')
@@ -118,6 +135,9 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
'title': 'Soir 3',
'upload_date': '20130826',
'timestamp': 1377548400,
+ 'subtitles': {
+ 'fr': 'mincount:2',
+ },
},
}, {
'url': 'http://www.francetvinfo.fr/elections/europeennes/direct-europeennes-regardez-le-debat-entre-les-candidats-a-la-presidence-de-la-commission_600639.html',
@@ -131,12 +151,26 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
'skip_download': 'HLS (reqires ffmpeg)'
},
'skip': 'Ce direct est terminé et sera disponible en rattrapage dans quelques minutes.',
+ }, {
+ 'url': 'http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html',
+ 'md5': 'f485bda6e185e7d15dbc69b72bae993e',
+ 'info_dict': {
+ 'id': '556e03339473995ee145930c',
+ 'ext': 'mp4',
+ 'title': 'Les entreprises familiales : le secret de la réussite',
+ 'thumbnail': 're:^https?://.*\.jpe?g$',
+ }
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
page_title = mobj.group('title')
webpage = self._download_webpage(url, page_title)
+
+ dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage)
+ if dmcloud_url:
+ return self.url_result(dmcloud_url, 'DailymotionCloud')
+
video_id, catalogue = self._search_regex(
r'id-video=([^@]+@[^"]+)', webpage, 'video id').split('@')
return self._extract_video(video_id, catalogue)
@@ -145,11 +179,21 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
class FranceTVIE(FranceTVBaseInfoExtractor):
IE_NAME = 'francetv'
IE_DESC = 'France 2, 3, 4, 5 and Ô'
- _VALID_URL = r'''(?x)https?://www\.france[2345o]\.fr/
- (?:
- emissions/.*?/(videos|emissions)/(?P<id>[^/?]+)
- | (emissions?|jt)/(?P<key>[^/?]+)
- )'''
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:
+ (?:www\.)?france[2345o]\.fr/
+ (?:
+ emissions/[^/]+/(?:videos|diffusions)|
+ emission/[^/]+|
+ videos|
+ jt
+ )
+ /|
+ embed\.francetv\.fr/\?ue=
+ )
+ (?P<id>[^/?]+)
+ '''
_TESTS = [
# france2
@@ -193,37 +237,59 @@ class FranceTVIE(FranceTVBaseInfoExtractor):
},
# france5
{
- 'url': 'http://www.france5.fr/emissions/c-a-dire/videos/92837968',
- 'md5': '78f0f4064f9074438e660785bbf2c5d9',
+ 'url': 'http://www.france5.fr/emissions/c-a-dire/videos/quels_sont_les_enjeux_de_cette_rentree_politique__31-08-2015_908948?onglet=tous&page=1',
+ 'md5': 'f6c577df3806e26471b3d21631241fd0',
'info_dict': {
- 'id': '108961659',
+ 'id': '123327454',
'ext': 'flv',
- 'title': 'C à dire ?!',
- 'description': 'md5:1a4aeab476eb657bf57c4ff122129f81',
- 'upload_date': '20140915',
- 'timestamp': 1410795000,
+ 'title': 'C à dire ?! - Quels sont les enjeux de cette rentrée politique ?',
+ 'description': 'md5:4a0d5cb5dce89d353522a84462bae5a4',
+ 'upload_date': '20150831',
+ 'timestamp': 1441035120,
},
},
# franceo
{
- 'url': 'http://www.franceo.fr/jt/info-afrique/04-12-2013',
- 'md5': '52f0bfe202848b15915a2f39aaa8981b',
+ 'url': 'http://www.franceo.fr/jt/info-soir/18-07-2015',
+ 'md5': '47d5816d3b24351cdce512ad7ab31da8',
'info_dict': {
- 'id': '108634970',
+ 'id': '125377621',
'ext': 'flv',
- 'title': 'Infô Afrique',
- 'description': 'md5:ebf346da789428841bee0fd2a935ea55',
- 'upload_date': '20140915',
- 'timestamp': 1410822000,
+ 'title': 'Infô soir',
+ 'description': 'md5:01b8c6915a3d93d8bbbd692651714309',
+ 'upload_date': '20150718',
+ 'timestamp': 1437241200,
+ 'duration': 414,
},
},
+ {
+ # francetv embed
+ 'url': 'http://embed.francetv.fr/?ue=8d7d3da1e3047c42ade5a5d7dfd3fc87',
+ 'info_dict': {
+ 'id': 'EV_30231',
+ 'ext': 'flv',
+ 'title': 'Alcaline, le concert avec Calogero',
+ 'description': 'md5:61f08036dcc8f47e9cfc33aed08ffaff',
+ 'upload_date': '20150226',
+ 'timestamp': 1424989860,
+ 'duration': 5400,
+ },
+ },
+ {
+ 'url': 'http://www.france4.fr/emission/highlander/diffusion-du-17-07-2015-04h05',
+ 'only_matching': True,
+ },
+ {
+ 'url': 'http://www.franceo.fr/videos/125377617',
+ 'only_matching': True,
+ }
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- webpage = self._download_webpage(url, mobj.group('key') or mobj.group('id'))
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
video_id, catalogue = self._html_search_regex(
- r'href="http://videos\.francetv\.fr/video/([^@]+@[^"]+)"',
+ r'href="http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"',
webpage, 'video ID').split('@')
return self._extract_video(video_id, catalogue)
@@ -260,22 +326,28 @@ class CultureboxIE(FranceTVBaseInfoExtractor):
_VALID_URL = r'https?://(?:m\.)?culturebox\.francetvinfo\.fr/(?P<name>.*?)(\?|$)'
_TEST = {
- 'url': 'http://culturebox.francetvinfo.fr/festivals/dans-les-jardins-de-william-christie/dans-les-jardins-de-william-christie-le-camus-162553',
- 'md5': '5ad6dec1ffb2a3fbcb20cc4b744be8d6',
+ 'url': 'http://culturebox.francetvinfo.fr/live/musique/musique-classique/le-livre-vermeil-de-montserrat-a-la-cathedrale-delne-214511',
+ 'md5': '9b88dc156781c4dbebd4c3e066e0b1d6',
'info_dict': {
- 'id': 'EV_22853',
+ 'id': 'EV_50111',
'ext': 'flv',
- 'title': 'Dans les jardins de William Christie - Le Camus',
- 'description': 'md5:4710c82315c40f0c865ca8b9a68b5299',
- 'upload_date': '20140829',
- 'timestamp': 1409317200,
+ 'title': "Le Livre Vermeil de Montserrat à la Cathédrale d'Elne",
+ 'description': 'md5:f8a4ad202e8fe533e2c493cc12e739d9',
+ 'upload_date': '20150320',
+ 'timestamp': 1426892400,
+ 'duration': 2760.9,
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
name = mobj.group('name')
+
webpage = self._download_webpage(url, name)
+
+ if ">Ce live n'est plus disponible en replay<" in webpage:
+ raise ExtractorError('Video %s is not available' % name, expected=True)
+
video_id, catalogue = self._search_regex(
r'"http://videos\.francetv\.fr/video/([^@]+@[^"]+)"', webpage, 'video id').split('@')
diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py
index a49fc1151..7f21d7410 100644
--- a/youtube_dl/extractor/funnyordie.py
+++ b/youtube_dl/extractor/funnyordie.py
@@ -45,20 +45,35 @@ class FunnyOrDieIE(InfoExtractor):
links.sort(key=lambda link: 1 if link[1] == 'mp4' else 0)
- bitrates = self._html_search_regex(r'<source src="[^"]+/v,((?:\d+,)+)\.mp4\.csmil', webpage, 'video bitrates')
- bitrates = [int(b) for b in bitrates.rstrip(',').split(',')]
- bitrates.sort()
+ m3u8_url = self._search_regex(
+ r'<source[^>]+src=(["\'])(?P<url>.+?/master\.m3u8)\1',
+ webpage, 'm3u8 url', default=None, group='url')
formats = []
+ m3u8_formats = self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
+ if m3u8_formats:
+ formats.extend(m3u8_formats)
+
+ bitrates = [int(bitrate) for bitrate in re.findall(r'[,/]v(\d+)[,/]', m3u8_url)]
+ bitrates.sort()
+
for bitrate in bitrates:
for link in links:
formats.append({
- 'url': '%s%d.%s' % (link[0], bitrate, link[1]),
+ 'url': self._proto_relative_url('%s%d.%s' % (link[0], bitrate, link[1])),
'format_id': '%s-%d' % (link[1], bitrate),
'vbr': bitrate,
})
+ subtitles = {}
+ for src, src_lang in re.findall(r'<track kind="captions" src="([^"]+)" srclang="([^"]+)"', webpage):
+ subtitles[src_lang] = [{
+ 'ext': src.split('/')[-1],
+ 'url': 'http://www.funnyordie.com%s' % src,
+ }]
+
post_json = self._search_regex(
r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details')
post = json.loads(post_json)
@@ -69,4 +84,5 @@ class FunnyOrDieIE(InfoExtractor):
'description': post.get('description'),
'thumbnail': post.get('picture'),
'formats': formats,
+ 'subtitles': subtitles,
}
diff --git a/youtube_dl/extractor/gamersyde.py b/youtube_dl/extractor/gamersyde.py
new file mode 100644
index 000000000..d545e01bb
--- /dev/null
+++ b/youtube_dl/extractor/gamersyde.py
@@ -0,0 +1,70 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ js_to_json,
+ parse_duration,
+ remove_start,
+)
+
+
+class GamersydeIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?gamersyde\.com/hqstream_(?P<display_id>[\da-z_]+)-(?P<id>\d+)_[a-z]{2}\.html'
+ _TEST = {
+ 'url': 'http://www.gamersyde.com/hqstream_bloodborne_birth_of_a_hero-34371_en.html',
+ 'md5': 'f38d400d32f19724570040d5ce3a505f',
+ 'info_dict': {
+ 'id': '34371',
+ 'ext': 'mp4',
+ 'duration': 372,
+ 'title': 'Bloodborne - Birth of a hero',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ display_id = mobj.group('display_id')
+
+ webpage = self._download_webpage(url, display_id)
+
+ playlist = self._parse_json(
+ self._search_regex(
+ r'(?s)playlist: \[({.+?})\]\s*}\);', webpage, 'files'),
+ display_id, transform_source=js_to_json)
+
+ formats = []
+ for source in playlist['sources']:
+ video_url = source.get('file')
+ if not video_url:
+ continue
+ format_id = source.get('label')
+ f = {
+ 'url': video_url,
+ 'format_id': format_id,
+ }
+ m = re.search(r'^(?P<height>\d+)[pP](?P<fps>\d+)fps', format_id)
+ if m:
+ f.update({
+ 'height': int(m.group('height')),
+ 'fps': int(m.group('fps')),
+ })
+ formats.append(f)
+ self._sort_formats(formats)
+
+ title = remove_start(playlist['title'], '%s - ' % video_id)
+ thumbnail = playlist.get('image')
+ duration = parse_duration(self._search_regex(
+ r'Length:</label>([^<]+)<', webpage, 'duration', fatal=False))
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py
index 47373e215..b3f1bafcc 100644
--- a/youtube_dl/extractor/gamespot.py
+++ b/youtube_dl/extractor/gamespot.py
@@ -5,7 +5,7 @@ import json
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse,
+ compat_urllib_parse_unquote,
compat_urlparse,
)
from ..utils import (
@@ -14,8 +14,8 @@ from ..utils import (
class GameSpotIE(InfoExtractor):
- _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?'
- _TEST = {
+ _VALID_URL = r'http://(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?'
+ _TESTS = [{
'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/',
'md5': 'b2a30deaa8654fcccd43713a6b6a4825',
'info_dict': {
@@ -23,8 +23,16 @@ class GameSpotIE(InfoExtractor):
'ext': 'mp4',
'title': 'Arma 3 - Community Guide: SITREP I',
'description': 'Check out this video where some of the basics of Arma 3 is explained.',
- }
- }
+ },
+ }, {
+ 'url': 'http://www.gamespot.com/videos/the-witcher-3-wild-hunt-xbox-one-now-playing/2300-6424837/',
+ 'info_dict': {
+ 'id': 'gs-2300-6424837',
+ 'ext': 'flv',
+ 'title': 'The Witcher 3: Wild Hunt [Xbox ONE] - Now Playing',
+ 'description': 'Join us as we take a look at the early hours of The Witcher 3: Wild Hunt and more.',
+ },
+ }]
def _real_extract(self, url):
page_id = self._match_id(url)
@@ -32,30 +40,42 @@ class GameSpotIE(InfoExtractor):
data_video_json = self._search_regex(
r'data-video=["\'](.*?)["\']', webpage, 'data video')
data_video = json.loads(unescapeHTML(data_video_json))
+ streams = data_video['videoStreams']
- # Transform the manifest url to a link to the mp4 files
- # they are used in mobile devices.
- f4m_url = data_video['videoStreams']['f4m_stream']
- f4m_path = compat_urlparse.urlparse(f4m_url).path
- QUALITIES_RE = r'((,\d+)+,?)'
- qualities = self._search_regex(QUALITIES_RE, f4m_path, 'qualities').strip(',').split(',')
- http_path = f4m_path[1:].split('/', 1)[1]
- http_template = re.sub(QUALITIES_RE, r'%s', http_path)
- http_template = http_template.replace('.csmil/manifest.f4m', '')
- http_template = compat_urlparse.urljoin(
- 'http://video.gamespotcdn.com/', http_template)
formats = []
- for q in qualities:
- formats.append({
- 'url': http_template % q,
- 'ext': 'mp4',
- 'format_id': q,
- })
+ f4m_url = streams.get('f4m_stream')
+ if f4m_url is not None:
+ # Transform the manifest url to a link to the mp4 files
+ # they are used in mobile devices.
+ f4m_path = compat_urlparse.urlparse(f4m_url).path
+ QUALITIES_RE = r'((,\d+)+,?)'
+ qualities = self._search_regex(QUALITIES_RE, f4m_path, 'qualities').strip(',').split(',')
+ http_path = f4m_path[1:].split('/', 1)[1]
+ http_template = re.sub(QUALITIES_RE, r'%s', http_path)
+ http_template = http_template.replace('.csmil/manifest.f4m', '')
+ http_template = compat_urlparse.urljoin(
+ 'http://video.gamespotcdn.com/', http_template)
+ for q in qualities:
+ formats.append({
+ 'url': http_template % q,
+ 'ext': 'mp4',
+ 'format_id': q,
+ })
+ else:
+ for quality in ['sd', 'hd']:
+ # It's actually a link to a flv file
+ flv_url = streams.get('f4m_{0}'.format(quality))
+ if flv_url is not None:
+ formats.append({
+ 'url': flv_url,
+ 'ext': 'flv',
+ 'format_id': quality,
+ })
return {
'id': data_video['guid'],
'display_id': page_id,
- 'title': compat_urllib_parse.unquote(data_video['title']),
+ 'title': compat_urllib_parse_unquote(data_video['title']),
'formats': formats,
'description': self._html_search_meta('description', webpage),
'thumbnail': self._og_search_thumbnail(webpage),
diff --git a/youtube_dl/extractor/gamestar.py b/youtube_dl/extractor/gamestar.py
index 7591a151e..590ccf526 100644
--- a/youtube_dl/extractor/gamestar.py
+++ b/youtube_dl/extractor/gamestar.py
@@ -1,6 +1,8 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..utils import (
int_or_none,
@@ -31,7 +33,7 @@ class GameStarIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
og_title = self._og_search_title(webpage)
- title = og_title.replace(' - Video bei GameStar.de', '').strip()
+ title = re.sub(r'\s*- Video (bei|-) GameStar\.de$', '', og_title)
url = 'http://gamestar.de/_misc/videos/portal/getVideoUrl.cfm?premium=0&videoId=' + video_id
diff --git a/youtube_dl/extractor/gazeta.py b/youtube_dl/extractor/gazeta.py
new file mode 100644
index 000000000..ea32b621c
--- /dev/null
+++ b/youtube_dl/extractor/gazeta.py
@@ -0,0 +1,38 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class GazetaIE(InfoExtractor):
+ _VALID_URL = r'(?P<url>https?://(?:www\.)?gazeta\.ru/(?:[^/]+/)?video/(?:(?:main|\d{4}/\d{2}/\d{2})/)?(?P<id>[A-Za-z0-9-_.]+)\.s?html)'
+ _TESTS = [{
+ 'url': 'http://www.gazeta.ru/video/main/zadaite_vopros_vladislavu_yurevichu.shtml',
+ 'md5': 'd49c9bdc6e5a7888f27475dc215ee789',
+ 'info_dict': {
+ 'id': '205566',
+ 'ext': 'mp4',
+ 'title': '«70–80 процентов гражданских в Донецке на грани голода»',
+ 'description': 'md5:38617526050bd17b234728e7f9620a71',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ },
+ }, {
+ 'url': 'http://www.gazeta.ru/lifestyle/video/2015/03/08/master-klass_krasivoi_byt._delaem_vesennii_makiyazh.shtml',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+
+ display_id = mobj.group('id')
+ embed_url = '%s?p=embed' % mobj.group('url')
+ embed_page = self._download_webpage(
+ embed_url, display_id, 'Downloading embed page')
+
+ video_id = self._search_regex(
+ r'<div[^>]*?class="eagleplayer"[^>]*?data-id="([^"]+)"', embed_page, 'video id')
+
+ return self.url_result(
+ 'eagleplatform:gazeta.media.eagleplatform.com:%s' % video_id, 'EaglePlatform')
diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py
index fed968f51..3befd3e7b 100644
--- a/youtube_dl/extractor/gdcvault.py
+++ b/youtube_dl/extractor/gdcvault.py
@@ -3,20 +3,24 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
- compat_urllib_request,
+from ..compat import compat_urllib_parse
+from ..utils import (
+ remove_end,
+ HEADRequest,
+ sanitized_Request,
)
class GDCVaultIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?gdcvault\.com/play/(?P<id>\d+)/(?P<name>(\w|-)+)'
+ _VALID_URL = r'https?://(?:www\.)?gdcvault\.com/play/(?P<id>\d+)/(?P<name>(\w|-)+)?'
+ _NETRC_MACHINE = 'gdcvault'
_TESTS = [
{
'url': 'http://www.gdcvault.com/play/1019721/Doki-Doki-Universe-Sweet-Simple',
'md5': '7ce8388f544c88b7ac11c7ab1b593704',
'info_dict': {
'id': '1019721',
+ 'display_id': 'Doki-Doki-Universe-Sweet-Simple',
'ext': 'mp4',
'title': 'Doki-Doki Universe: Sweet, Simple and Genuine (GDC Next 10)'
}
@@ -25,6 +29,7 @@ class GDCVaultIE(InfoExtractor):
'url': 'http://www.gdcvault.com/play/1015683/Embracing-the-Dark-Art-of',
'info_dict': {
'id': '1015683',
+ 'display_id': 'Embracing-the-Dark-Art-of',
'ext': 'flv',
'title': 'Embracing the Dark Art of Mathematical Modeling in AI'
},
@@ -37,10 +42,15 @@ class GDCVaultIE(InfoExtractor):
'md5': 'a5eb77996ef82118afbbe8e48731b98e',
'info_dict': {
'id': '1015301',
+ 'display_id': 'Thexder-Meets-Windows-95-or',
'ext': 'flv',
'title': 'Thexder Meets Windows 95, or Writing Great Games in the Windows 95 Environment',
},
'skip': 'Requires login',
+ },
+ {
+ 'url': 'http://gdcvault.com/play/1020791/',
+ 'only_matching': True,
}
]
@@ -64,27 +74,41 @@ class GDCVaultIE(InfoExtractor):
return video_formats
def _parse_flv(self, xml_description):
- video_formats = []
- akami_url = xml_description.find('./metadata/akamaiHost').text
+ formats = []
+ akamai_url = xml_description.find('./metadata/akamaiHost').text
+ audios = xml_description.find('./metadata/audios')
+ if audios is not None:
+ for audio in audios:
+ formats.append({
+ 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
+ 'play_path': remove_end(audio.get('url'), '.flv'),
+ 'ext': 'flv',
+ 'vcodec': 'none',
+ 'format_id': audio.get('code'),
+ })
slide_video_path = xml_description.find('./metadata/slideVideo').text
- video_formats.append({
- 'url': 'rtmp://' + akami_url + '/' + slide_video_path,
+ formats.append({
+ 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
+ 'play_path': remove_end(slide_video_path, '.flv'),
+ 'ext': 'flv',
'format_note': 'slide deck video',
'quality': -2,
'preference': -2,
'format_id': 'slides',
})
speaker_video_path = xml_description.find('./metadata/speakerVideo').text
- video_formats.append({
- 'url': 'rtmp://' + akami_url + '/' + speaker_video_path,
+ formats.append({
+ 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
+ 'play_path': remove_end(speaker_video_path, '.flv'),
+ 'ext': 'flv',
'format_note': 'speaker video',
'quality': -1,
'preference': -1,
'format_id': 'speaker',
})
- return video_formats
+ return formats
- def _login(self, webpage_url, video_id):
+ def _login(self, webpage_url, display_id):
(username, password) = self._get_login_info()
if username is None or password is None:
self.report_warning('It looks like ' + webpage_url + ' requires a login. Try specifying a username and password and try again.')
@@ -99,11 +123,11 @@ class GDCVaultIE(InfoExtractor):
'password': password,
}
- request = compat_urllib_request.Request(login_url, compat_urllib_parse.urlencode(login_form))
+ request = sanitized_Request(login_url, compat_urllib_parse.urlencode(login_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
- self._download_webpage(request, video_id, 'Logging in')
- start_page = self._download_webpage(webpage_url, video_id, 'Getting authenticated video page')
- self._download_webpage(logout_url, video_id, 'Logging out')
+ self._download_webpage(request, display_id, 'Logging in')
+ start_page = self._download_webpage(webpage_url, display_id, 'Getting authenticated video page')
+ self._download_webpage(logout_url, display_id, 'Logging out')
return start_page
@@ -111,22 +135,27 @@ class GDCVaultIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
+ display_id = mobj.group('name') or video_id
+
webpage_url = 'http://www.gdcvault.com/play/' + video_id
- start_page = self._download_webpage(webpage_url, video_id)
+ start_page = self._download_webpage(webpage_url, display_id)
direct_url = self._search_regex(
r's1\.addVariable\("file",\s*encodeURIComponent\("(/[^"]+)"\)\);',
start_page, 'url', default=None)
if direct_url:
- video_url = 'http://www.gdcvault.com/' + direct_url
title = self._html_search_regex(
r'<td><strong>Session Name</strong></td>\s*<td>(.*?)</td>',
start_page, 'title')
+ video_url = 'http://www.gdcvault.com' + direct_url
+ # resolve the url so that we can detect the correct extension
+ head = self._request_webpage(HEADRequest(video_url), video_id)
+ video_url = head.geturl()
return {
'id': video_id,
+ 'display_id': display_id,
'url': video_url,
- 'ext': 'flv',
'title': title,
}
@@ -135,7 +164,7 @@ class GDCVaultIE(InfoExtractor):
start_page, 'xml root', default=None)
if xml_root is None:
# Probably need to authenticate
- login_res = self._login(webpage_url, video_id)
+ login_res = self._login(webpage_url, display_id)
if login_res is None:
self.report_warning('Could not login.')
else:
@@ -152,8 +181,8 @@ class GDCVaultIE(InfoExtractor):
# Fallback to the older format
xml_name = self._html_search_regex(r'<iframe src=".*?\?xmlURL=xml/(?P<xml_file>.+?\.xml).*?".*?</iframe>', start_page, 'xml filename')
- xml_decription_url = xml_root + 'xml/' + xml_name
- xml_description = self._download_xml(xml_decription_url, video_id)
+ xml_description_url = xml_root + 'xml/' + xml_name
+ xml_description = self._download_xml(xml_description_url, display_id)
video_title = xml_description.find('./metadata/title').text
video_formats = self._parse_mp4(xml_description)
@@ -162,6 +191,7 @@ class GDCVaultIE(InfoExtractor):
return {
'id': video_id,
+ 'display_id': display_id,
'title': video_title,
'formats': video_formats,
}
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index f4500e931..5075d131e 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -4,11 +4,13 @@ from __future__ import unicode_literals
import os
import re
+import sys
from .common import InfoExtractor
from .youtube import YoutubeIE
from ..compat import (
- compat_urllib_parse,
+ compat_etree_fromstring,
+ compat_urllib_parse_unquote,
compat_urlparse,
compat_xml_parse_error,
)
@@ -19,19 +21,39 @@ from ..utils import (
HEADRequest,
is_html,
orderedSet,
- parse_xml,
+ sanitized_Request,
smuggle_url,
unescapeHTML,
unified_strdate,
unsmuggle_url,
UnsupportedError,
url_basename,
+ xpath_text,
)
-from .brightcove import BrightcoveIE
+from .brightcove import (
+ BrightcoveLegacyIE,
+ BrightcoveNewIE,
+)
+from .nbc import NBCSportsVPlayerIE
from .ooyala import OoyalaIE
from .rutv import RUTVIE
+from .tvc import TVCIE
+from .sportbox import SportBoxEmbedIE
from .smotri import SmotriIE
+from .myvi import MyviIE
from .condenast import CondeNastIE
+from .udn import UDNEmbedIE
+from .senateisvp import SenateISVPIE
+from .bliptv import BlipTVIE
+from .svt import SVTIE
+from .pornhub import PornHubIE
+from .xhamster import XHamsterEmbedIE
+from .vimeo import VimeoIE
+from .dailymotion import DailymotionCloudIE
+from .onionstudios import OnionStudiosIE
+from .snagfilms import SnagFilmsEmbedIE
+from .screenwavemedia import ScreenwaveMediaIE
+from .mtv import MTVServicesEmbeddedIE
class GenericIE(InfoExtractor):
@@ -39,6 +61,197 @@ class GenericIE(InfoExtractor):
_VALID_URL = r'.*'
IE_NAME = 'generic'
_TESTS = [
+ # Direct link to a video
+ {
+ 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
+ 'md5': '67d406c2bcb6af27fa886f31aa934bbe',
+ 'info_dict': {
+ 'id': 'trailer',
+ 'ext': 'mp4',
+ 'title': 'trailer',
+ 'upload_date': '20100513',
+ }
+ },
+ # Direct link to media delivered compressed (until Accept-Encoding is *)
+ {
+ 'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac',
+ 'md5': '128c42e68b13950268b648275386fc74',
+ 'info_dict': {
+ 'id': 'FictionJunction-Parallel_Hearts',
+ 'ext': 'flac',
+ 'title': 'FictionJunction-Parallel_Hearts',
+ 'upload_date': '20140522',
+ },
+ 'expected_warnings': [
+ 'URL could be a direct video link, returning it as such.'
+ ]
+ },
+ # Direct download with broken HEAD
+ {
+ 'url': 'http://ai-radio.org:8000/radio.opus',
+ 'info_dict': {
+ 'id': 'radio',
+ 'ext': 'opus',
+ 'title': 'radio',
+ },
+ 'params': {
+ 'skip_download': True, # infinite live stream
+ },
+ 'expected_warnings': [
+ r'501.*Not Implemented'
+ ],
+ },
+ # Direct link with incorrect MIME type
+ {
+ 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
+ 'md5': '4ccbebe5f36706d85221f204d7eb5913',
+ 'info_dict': {
+ 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
+ 'id': '5_Lennart_Poettering_-_Systemd',
+ 'ext': 'webm',
+ 'title': '5_Lennart_Poettering_-_Systemd',
+ 'upload_date': '20141120',
+ },
+ 'expected_warnings': [
+ 'URL could be a direct video link, returning it as such.'
+ ]
+ },
+ # RSS feed
+ {
+ 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
+ 'info_dict': {
+ 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
+ 'title': 'Zero Punctuation',
+ 'description': 're:.*groundbreaking video review series.*'
+ },
+ 'playlist_mincount': 11,
+ },
+ # RSS feed with enclosure
+ {
+ 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
+ 'info_dict': {
+ 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
+ 'ext': 'm4v',
+ 'upload_date': '20150228',
+ 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
+ }
+ },
+ # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng
+ {
+ 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml',
+ 'info_dict': {
+ 'id': 'smil',
+ 'ext': 'mp4',
+ 'title': 'Automatics, robotics and biocybernetics',
+ 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482',
+ 'upload_date': '20130627',
+ 'formats': 'mincount:16',
+ 'subtitles': 'mincount:1',
+ },
+ 'params': {
+ 'force_generic_extractor': True,
+ 'skip_download': True,
+ },
+ },
+ # SMIL from http://www1.wdr.de/mediathek/video/livestream/index.html
+ {
+ 'url': 'http://metafilegenerator.de/WDR/WDR_FS/hds/hds.smil',
+ 'info_dict': {
+ 'id': 'hds',
+ 'ext': 'flv',
+ 'title': 'hds',
+ 'formats': 'mincount:1',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ # SMIL from https://www.restudy.dk/video/play/id/1637
+ {
+ 'url': 'https://www.restudy.dk/awsmedia/SmilDirectory/video_1637.xml',
+ 'info_dict': {
+ 'id': 'video_1637',
+ 'ext': 'flv',
+ 'title': 'video_1637',
+ 'formats': 'mincount:3',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ # SMIL from http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm
+ {
+ 'url': 'http://services.media.howstuffworks.com/videos/450221/smil-service.smil',
+ 'info_dict': {
+ 'id': 'smil-service',
+ 'ext': 'flv',
+ 'title': 'smil-service',
+ 'formats': 'mincount:1',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ # SMIL from http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370
+ {
+ 'url': 'http://api.new.livestream.com/accounts/1570303/events/1585861/videos/4719370.smil',
+ 'info_dict': {
+ 'id': '4719370',
+ 'ext': 'mp4',
+ 'title': '571de1fd-47bc-48db-abf9-238872a58d1f',
+ 'formats': 'mincount:3',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ # XSPF playlist from http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html
+ {
+ 'url': 'http://www.telegraaf.nl/xml/playlist/2015/8/7/mZlp2ctYIUEB.xspf',
+ 'info_dict': {
+ 'id': 'mZlp2ctYIUEB',
+ 'ext': 'mp4',
+ 'title': 'Tikibad ontruimd wegens brand',
+ 'description': 'md5:05ca046ff47b931f9b04855015e163a4',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 33,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ # google redirect
+ {
+ 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
+ 'info_dict': {
+ 'id': 'cmQHVoWB5FY',
+ 'ext': 'mp4',
+ 'upload_date': '20130224',
+ 'uploader_id': 'TheVerge',
+ 'description': 're:^Chris Ziegler takes a look at the\.*',
+ 'uploader': 'The Verge',
+ 'title': 'First Firefox OS phones side-by-side',
+ },
+ 'params': {
+ 'skip_download': False,
+ }
+ },
+ {
+ # redirect in Refresh HTTP header
+ 'url': 'https://www.facebook.com/l.php?u=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DpO8h3EaFRdo&h=TAQHsoToz&enc=AZN16h-b6o4Zq9pZkCCdOLNKMN96BbGMNtcFwHSaazus4JHT_MFYkAA-WARTX2kvsCIdlAIyHZjl6d33ILIJU7Jzwk_K3mcenAXoAzBNoZDI_Q7EXGDJnIhrGkLXo_LJ_pAa2Jzbx17UHMd3jAs--6j2zaeto5w9RTn8T_1kKg3fdC5WPX9Dbb18vzH7YFX0eSJmoa6SP114rvlkw6pkS1-T&s=1',
+ 'info_dict': {
+ 'id': 'pO8h3EaFRdo',
+ 'ext': 'mp4',
+ 'title': 'Tripeo Boiler Room x Dekmantel Festival DJ Set',
+ 'description': 'md5:6294cc1af09c4049e0652b51a2df10d5',
+ 'upload_date': '20150917',
+ 'uploader_id': 'brtvofficial',
+ 'uploader': 'Boiler Room',
+ },
+ 'params': {
+ 'skip_download': False,
+ },
+ },
{
'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
@@ -65,7 +278,7 @@ class GenericIE(InfoExtractor):
# it also tests brightcove videos that need to set the 'Referer' in the
# http requests
{
- 'add_ie': ['Brightcove'],
+ 'add_ie': ['BrightcoveLegacy'],
'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
'info_dict': {
'id': '2765128793001',
@@ -89,7 +302,7 @@ class GenericIE(InfoExtractor):
'uploader': 'thestar.com',
'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
},
- 'add_ie': ['Brightcove'],
+ 'add_ie': ['BrightcoveLegacy'],
},
{
'url': 'http://www.championat.com/video/football/v/87/87499.html',
@@ -104,7 +317,7 @@ class GenericIE(InfoExtractor):
},
{
# https://github.com/rg3/youtube-dl/issues/3541
- 'add_ie': ['Brightcove'],
+ 'add_ie': ['BrightcoveLegacy'],
'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
'info_dict': {
'id': '3866516442001',
@@ -118,17 +331,6 @@ class GenericIE(InfoExtractor):
'skip_download': True, # m3u8 download
},
},
- # Direct link to a video
- {
- 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
- 'md5': '67d406c2bcb6af27fa886f31aa934bbe',
- 'info_dict': {
- 'id': 'trailer',
- 'ext': 'mp4',
- 'title': 'trailer',
- 'upload_date': '20100513',
- }
- },
# ooyala video
{
'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
@@ -140,6 +342,19 @@ class GenericIE(InfoExtractor):
},
'add_ie': ['Ooyala'],
},
+ {
+ # ooyala video embedded with http://player.ooyala.com/iframe.js
+ 'url': 'http://www.macrumors.com/2015/07/24/steve-jobs-the-man-in-the-machine-first-trailer/',
+ 'info_dict': {
+ 'id': 'p0MGJndjoG5SOKqO_hZJuZFPB-Tr5VgB',
+ 'ext': 'mp4',
+ 'title': '"Steve Jobs: Man in the Machine" trailer',
+ 'description': 'The first trailer for the Alex Gibney documentary "Steve Jobs: Man in the Machine."',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
# multiple ooyala embeds on SBN network websites
{
'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
@@ -153,22 +368,6 @@ class GenericIE(InfoExtractor):
},
'add_ie': ['Ooyala'],
},
- # google redirect
- {
- 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
- 'info_dict': {
- 'id': 'cmQHVoWB5FY',
- 'ext': 'mp4',
- 'upload_date': '20130224',
- 'uploader_id': 'TheVerge',
- 'description': 're:^Chris Ziegler takes a look at the\.*',
- 'uploader': 'The Verge',
- 'title': 'First Firefox OS phones side-by-side',
- },
- 'params': {
- 'skip_download': False,
- }
- },
# embed.ly video
{
'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
@@ -196,14 +395,6 @@ class GenericIE(InfoExtractor):
'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
},
},
- # BBC iPlayer embeds
- {
- 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
- 'info_dict': {
- 'title': 'BBC - Blogs - Adam Curtis - BUGGER',
- },
- 'playlist_mincount': 18,
- },
# RUTV embed
{
'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
@@ -218,6 +409,66 @@ class GenericIE(InfoExtractor):
'skip_download': True,
},
},
+ # TVC embed
+ {
+ 'url': 'http://sch1298sz.mskobr.ru/dou_edu/karamel_ki/filial_galleries/video/iframe_src_http_tvc_ru_video_iframe_id_55304_isplay_false_acc_video_id_channel_brand_id_11_show_episodes_episode_id_32307_frameb/',
+ 'info_dict': {
+ 'id': '55304',
+ 'ext': 'mp4',
+ 'title': 'Дошкольное воспитание',
+ },
+ },
+ # SportBox embed
+ {
+ 'url': 'http://www.vestifinance.ru/articles/25753',
+ 'info_dict': {
+ 'id': '25753',
+ 'title': 'Вести Экономика ― Прямые трансляции с Форума-выставки "Госзаказ-2013"',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': '370908',
+ 'title': 'Госзаказ. День 3',
+ 'ext': 'mp4',
+ }
+ }, {
+ 'info_dict': {
+ 'id': '370905',
+ 'title': 'Госзаказ. День 2',
+ 'ext': 'mp4',
+ }
+ }, {
+ 'info_dict': {
+ 'id': '370902',
+ 'title': 'Госзаказ. День 1',
+ 'ext': 'mp4',
+ }
+ }],
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ },
+ # Myvi.ru embed
+ {
+ 'url': 'http://www.kinomyvi.tv/news/detail/Pervij-dublirovannij-trejler--Uzhastikov-_nOw1',
+ 'info_dict': {
+ 'id': 'f4dafcad-ff21-423d-89b5-146cfd89fa1e',
+ 'ext': 'mp4',
+ 'title': 'Ужастики, русский трейлер (2015)',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 153,
+ }
+ },
+ # XHamster embed
+ {
+ 'url': 'http://www.numisc.com/forum/showthread.php?11696-FM15-which-pumiscer-was-this-%28-vid-%29-%28-alfa-as-fuck-srx-%29&s=711f5db534502e22260dec8c5e2d66d8',
+ 'info_dict': {
+ 'id': 'showthread',
+ 'title': '[NSFL] [FM15] which pumiscer was this ( vid ) ( alfa as fuck srx )',
+ },
+ 'playlist_mincount': 7,
+ },
# Embedded TED video
{
'url': 'http://en.support.wordpress.com/videos/ted-talks/',
@@ -267,6 +518,26 @@ class GenericIE(InfoExtractor):
'skip_download': 'Requires rtmpdump'
}
},
+ # francetv embed
+ {
+ 'url': 'http://www.tsprod.com/replay-du-concert-alcaline-de-calogero',
+ 'info_dict': {
+ 'id': 'EV_30231',
+ 'ext': 'mp4',
+ 'title': 'Alcaline, le concert avec Calogero',
+ 'description': 'md5:61f08036dcc8f47e9cfc33aed08ffaff',
+ 'upload_date': '20150226',
+ 'timestamp': 1424989860,
+ 'duration': 5400,
+ },
+ 'params': {
+ # m3u8 downloads
+ 'skip_download': True,
+ },
+ 'expected_warnings': [
+ 'Forbidden'
+ ]
+ },
# Condé Nast embed
{
'url': 'http://www.wired.com/2014/04/honda-asimo/',
@@ -369,16 +640,6 @@ class GenericIE(InfoExtractor):
'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
}
},
- # RSS feed
- {
- 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
- 'info_dict': {
- 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
- 'title': 'Zero Punctuation',
- 'description': 're:.*groundbreaking video review series.*'
- },
- 'playlist_mincount': 11,
- },
# Multiple brightcove videos
# https://github.com/rg3/youtube-dl/issues/2283
{
@@ -432,21 +693,6 @@ class GenericIE(InfoExtractor):
'uploader': 'thoughtworks.wistia.com',
},
},
- # Direct download with broken HEAD
- {
- 'url': 'http://ai-radio.org:8000/radio.opus',
- 'info_dict': {
- 'id': 'radio',
- 'ext': 'opus',
- 'title': 'radio',
- },
- 'params': {
- 'skip_download': True, # infinite live stream
- },
- 'expected_warnings': [
- r'501.*Not Implemented'
- ],
- },
# Soundcloud embed
{
'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
@@ -473,25 +719,11 @@ class GenericIE(InfoExtractor):
{
'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
'info_dict': {
+ 'id': '1986',
'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
},
'playlist_mincount': 2,
},
- # Direct link with incorrect MIME type
- {
- 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
- 'md5': '4ccbebe5f36706d85221f204d7eb5913',
- 'info_dict': {
- 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
- 'id': '5_Lennart_Poettering_-_Systemd',
- 'ext': 'webm',
- 'title': '5_Lennart_Poettering_-_Systemd',
- 'upload_date': '20141120',
- },
- 'expected_warnings': [
- 'URL could be a direct video link, returning it as such.'
- ]
- },
# Cinchcast embed
{
'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
@@ -525,18 +757,321 @@ class GenericIE(InfoExtractor):
},
'add_ie': ['Viddler'],
},
+ # Libsyn embed
+ {
+ 'url': 'http://thedailyshow.cc.com/podcast/episodetwelve',
+ 'info_dict': {
+ 'id': '3377616',
+ 'ext': 'mp3',
+ 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
+ 'description': 'md5:601cb790edd05908957dae8aaa866465',
+ 'upload_date': '20150220',
+ },
+ },
# jwplayer YouTube
{
'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
'info_dict': {
'id': 'Mrj4DVp2zeA',
'ext': 'mp4',
- 'upload_date': '20150204',
+ 'upload_date': '20150212',
'uploader': 'The National Archives UK',
'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
'uploader_id': 'NationalArchives08',
'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
},
+ },
+ # rtl.nl embed
+ {
+ 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
+ 'playlist_mincount': 5,
+ 'info_dict': {
+ 'id': 'aanslagen-kopenhagen',
+ 'title': 'Aanslagen Kopenhagen | RTL Nieuws',
+ }
+ },
+ # Zapiks embed
+ {
+ 'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html',
+ 'info_dict': {
+ 'id': '118046',
+ 'ext': 'mp4',
+ 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
+ }
+ },
+ # Kaltura embed
+ {
+ 'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15',
+ 'info_dict': {
+ 'id': '1_eergr3h1',
+ 'ext': 'mp4',
+ 'upload_date': '20150226',
+ 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com',
+ 'timestamp': int,
+ 'title': 'John Carlson Postgame 2/25/15',
+ },
+ },
+ # Kaltura embed (different embed code)
+ {
+ 'url': 'http://www.premierchristianradio.com/Shows/Saturday/Unbelievable/Conference-Videos/Os-Guinness-Is-It-Fools-Talk-Unbelievable-Conference-2014',
+ 'info_dict': {
+ 'id': '1_a52wc67y',
+ 'ext': 'flv',
+ 'upload_date': '20150127',
+ 'uploader_id': 'PremierMedia',
+ 'timestamp': int,
+ 'title': 'Os Guinness // Is It Fools Talk? // Unbelievable? Conference 2014',
+ },
+ },
+ # Kaltura embed protected with referrer
+ {
+ 'url': 'http://www.disney.nl/disney-channel/filmpjes/achter-de-schermen#/videoId/violetta-achter-de-schermen-ruggero',
+ 'info_dict': {
+ 'id': '1_g4fbemnq',
+ 'ext': 'mp4',
+ 'title': 'Violetta - Achter De Schermen - Ruggero',
+ 'description': 'Achter de schermen met Ruggero',
+ 'timestamp': 1435133761,
+ 'upload_date': '20150624',
+ 'uploader_id': 'echojecka',
+ },
+ },
+ # Eagle.Platform embed (generic URL)
+ {
+ 'url': 'http://lenta.ru/news/2015/03/06/navalny/',
+ 'info_dict': {
+ 'id': '227304',
+ 'ext': 'mp4',
+ 'title': 'Навальный вышел на свободу',
+ 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 87,
+ 'view_count': int,
+ 'age_limit': 0,
+ },
+ },
+ # ClipYou (Eagle.Platform) embed (custom URL)
+ {
+ 'url': 'http://muz-tv.ru/play/7129/',
+ 'info_dict': {
+ 'id': '12820',
+ 'ext': 'mp4',
+ 'title': "'O Sole Mio",
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 216,
+ 'view_count': int,
+ },
+ },
+ # Pladform embed
+ {
+ 'url': 'http://muz-tv.ru/kinozal/view/7400/',
+ 'info_dict': {
+ 'id': '100183293',
+ 'ext': 'mp4',
+ 'title': 'Тайны перевала Дятлова • 1 серия 2 часть',
+ 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 694,
+ 'age_limit': 0,
+ },
+ },
+ # Playwire embed
+ {
+ 'url': 'http://www.cinemablend.com/new/First-Joe-Dirt-2-Trailer-Teaser-Stupid-Greatness-70874.html',
+ 'info_dict': {
+ 'id': '3519514',
+ 'ext': 'mp4',
+ 'title': 'Joe Dirt 2 Beautiful Loser Teaser Trailer',
+ 'thumbnail': 're:^https?://.*\.png$',
+ 'duration': 45.115,
+ },
+ },
+ # 5min embed
+ {
+ 'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
+ 'md5': '4c6f127a30736b59b3e2c19234ee2bf7',
+ 'info_dict': {
+ 'id': '518726732',
+ 'ext': 'mp4',
+ 'title': 'Facebook Creates "On This Day" | Crunch Report',
+ },
+ },
+ # SVT embed
+ {
+ 'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun',
+ 'info_dict': {
+ 'id': '2900353',
+ 'ext': 'flv',
+ 'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)',
+ 'duration': 27,
+ 'age_limit': 0,
+ },
+ },
+ # Crooks and Liars embed
+ {
+ 'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists',
+ 'info_dict': {
+ 'id': '8RUoRhRi',
+ 'ext': 'mp4',
+ 'title': "Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!",
+ 'description': 'md5:e1a46ad1650e3a5ec7196d432799127f',
+ 'timestamp': 1428207000,
+ 'upload_date': '20150405',
+ 'uploader': 'Heather',
+ },
+ },
+ # Crooks and Liars external embed
+ {
+ 'url': 'http://theothermccain.com/2010/02/02/video-proves-that-bill-kristol-has-been-watching-glenn-beck/comment-page-1/',
+ 'info_dict': {
+ 'id': 'MTE3MjUtMzQ2MzA',
+ 'ext': 'mp4',
+ 'title': 'md5:5e3662a81a4014d24c250d76d41a08d5',
+ 'description': 'md5:9b8e9542d6c3c5de42d6451b7d780cec',
+ 'timestamp': 1265032391,
+ 'upload_date': '20100201',
+ 'uploader': 'Heather',
+ },
+ },
+ # NBC Sports vplayer embed
+ {
+ 'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a',
+ 'info_dict': {
+ 'id': 'ln7x1qSThw4k',
+ 'ext': 'flv',
+ 'title': "PFT Live: New leader in the 'new-look' defense",
+ 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
+ },
+ },
+ # UDN embed
+ {
+ 'url': 'http://www.udn.com/news/story/7314/822787',
+ 'md5': 'fd2060e988c326991037b9aff9df21a6',
+ 'info_dict': {
+ 'id': '300346',
+ 'ext': 'mp4',
+ 'title': '中一中男師變性 全校師生力挺',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ }
+ },
+ # Ooyala embed
+ {
+ 'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T',
+ 'info_dict': {
+ 'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs',
+ 'ext': 'mp4',
+ 'description': 'VIDEO: Index/Match versus VLOOKUP.',
+ 'title': 'This is what separates the Excel masters from the wannabes',
+ },
+ 'params': {
+ # m3u8 downloads
+ 'skip_download': True,
+ }
+ },
+ # Contains a SMIL manifest
+ {
+ 'url': 'http://www.telewebion.com/fa/1263668/%D9%82%D8%B1%D8%B9%D9%87%E2%80%8C%DA%A9%D8%B4%DB%8C-%D9%84%DB%8C%DA%AF-%D9%82%D9%87%D8%B1%D9%85%D8%A7%D9%86%D8%A7%D9%86-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7/%2B-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84.html',
+ 'info_dict': {
+ 'id': 'file',
+ 'ext': 'flv',
+ 'title': '+ Football: Lottery Champions League Europe',
+ 'uploader': 'www.telewebion.com',
+ },
+ 'params': {
+ # rtmpe downloads
+ 'skip_download': True,
+ }
+ },
+ # Brightcove URL in single quotes
+ {
+ 'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/',
+ 'md5': '4ae374f1f8b91c889c4b9203c8c752af',
+ 'info_dict': {
+ 'id': '4255764656001',
+ 'ext': 'mp4',
+ 'title': 'SN Presents: Russell Martin, World Citizen',
+ 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.',
+ 'uploader': 'Rogers Sportsnet',
+ },
+ },
+ # Dailymotion Cloud video
+ {
+ 'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910',
+ 'md5': '49444254273501a64675a7e68c502681',
+ 'info_dict': {
+ 'id': '5585de919473990de4bee11b',
+ 'ext': 'mp4',
+ 'title': 'Le débat',
+ 'thumbnail': 're:^https?://.*\.jpe?g$',
+ }
+ },
+ # OnionStudios embed
+ {
+ 'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537',
+ 'info_dict': {
+ 'id': '2855',
+ 'ext': 'mp4',
+ 'title': 'Don’t Understand Bitcoin? This Man Will Mumble An Explanation At You',
+ 'thumbnail': 're:^https?://.*\.jpe?g$',
+ 'uploader': 'ClickHole',
+ 'uploader_id': 'clickhole',
+ }
+ },
+ # SnagFilms embed
+ {
+ 'url': 'http://whilewewatch.blogspot.ru/2012/06/whilewewatch-whilewewatch-gripping.html',
+ 'info_dict': {
+ 'id': '74849a00-85a9-11e1-9660-123139220831',
+ 'ext': 'mp4',
+ 'title': '#whilewewatch',
+ }
+ },
+ # AdobeTVVideo embed
+ {
+ 'url': 'https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners',
+ 'md5': '43662b577c018ad707a63766462b1e87',
+ 'info_dict': {
+ 'id': '2456',
+ 'ext': 'mp4',
+ 'title': 'New experience with Acrobat DC',
+ 'description': 'New experience with Acrobat DC',
+ 'duration': 248.667,
+ },
+ },
+ # ScreenwaveMedia embed
+ {
+ 'url': 'http://www.thecinemasnob.com/the-cinema-snob/a-nightmare-on-elm-street-2-freddys-revenge1',
+ 'md5': '24ace5baba0d35d55c6810b51f34e9e0',
+ 'info_dict': {
+ 'id': 'cinemasnob-55d26273809dd',
+ 'ext': 'mp4',
+ 'title': 'cinemasnob',
+ },
+ },
+ # BrightcoveInPageEmbed embed
+ {
+ 'url': 'http://www.geekandsundry.com/tabletop-bonus-wils-final-thoughts-on-dread/',
+ 'info_dict': {
+ 'id': '4238694884001',
+ 'ext': 'flv',
+ 'title': 'Tabletop: Dread, Last Thoughts',
+ 'description': 'Tabletop: Dread, Last Thoughts',
+ 'duration': 51690,
+ },
+ },
+ # JWPlayer with M3U8
+ {
+ 'url': 'http://ren.tv/novosti/2015-09-25/sluchaynyy-prohozhiy-poymal-avtougonshchika-v-murmanske-video',
+ 'info_dict': {
+ 'id': 'playlist',
+ 'ext': 'mp4',
+ 'title': 'Случайный прохожий поймал автоугонщика в Мурманске. ВИДЕО | РЕН ТВ',
+ 'uploader': 'ren.tv',
+ },
+ 'params': {
+ # m3u8 downloads
+ 'skip_download': True,
+ }
}
]
@@ -549,11 +1084,24 @@ class GenericIE(InfoExtractor):
playlist_desc_el = doc.find('./channel/description')
playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
- entries = [{
- '_type': 'url',
- 'url': e.find('link').text,
- 'title': e.find('title').text,
- } for e in doc.findall('./channel/item')]
+ entries = []
+ for it in doc.findall('./channel/item'):
+ next_url = xpath_text(it, 'link', fatal=False)
+ if not next_url:
+ enclosure_nodes = it.findall('./enclosure')
+ for e in enclosure_nodes:
+ next_url = e.attrib.get('url')
+ if next_url:
+ break
+
+ if not next_url:
+ continue
+
+ entries.append({
+ '_type': 'url',
+ 'url': next_url,
+ 'title': it.find('title').text,
+ })
return {
'_type': 'playlist',
@@ -645,7 +1193,7 @@ class GenericIE(InfoExtractor):
force_videoid = smuggled_data['force_videoid']
video_id = force_videoid
else:
- video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
+ video_id = compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
self.to_screen('%s: Requesting header' % video_id)
@@ -667,7 +1215,9 @@ class GenericIE(InfoExtractor):
full_response = None
if head_response is False:
- full_response = self._request_webpage(url, video_id)
+ request = sanitized_Request(url)
+ request.add_header('Accept-Encoding', '*')
+ full_response = self._request_webpage(request, video_id)
head_response = full_response
# Check for direct link to a video
@@ -678,7 +1228,7 @@ class GenericIE(InfoExtractor):
head_response.headers.get('Last-Modified'))
return {
'id': video_id,
- 'title': os.path.splitext(url_basename(url))[0],
+ 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
'direct': True,
'formats': [{
'format_id': m.group('format_id'),
@@ -689,10 +1239,22 @@ class GenericIE(InfoExtractor):
}
if not self._downloader.params.get('test', False) and not is_intentional:
- self._downloader.report_warning('Falling back on generic information extractor.')
+ force = self._downloader.params.get('force_generic_extractor', False)
+ self._downloader.report_warning(
+ '%s on generic information extractor.' % ('Forcing' if force else 'Falling back'))
if not full_response:
- full_response = self._request_webpage(url, video_id)
+ request = sanitized_Request(url)
+ # Some webservers may serve compressed content of rather big size (e.g. gzipped flac),
+ # making it impossible to download only a chunk of the file (yet we need only 512kB to
+ # test whether it's HTML or not). With youtube-dl's default Accept-Encoding this
+ # always results in downloading the whole file, which is not desirable.
+ # Therefore, for the extraction pass we have to override Accept-Encoding to "any" in order
+ # to accept raw bytes and be able to download only a chunk.
+ # It would probably be better to solve this by checking Content-Type for application/octet-stream
+ # after the HEAD request finishes, but it is not clear whether we can rely on this.
+ request.add_header('Accept-Encoding', '*')
+ full_response = self._request_webpage(request, video_id)
# Maybe it's a direct link to a video?
# Be careful not to download the whole thing!
@@ -704,7 +1266,7 @@ class GenericIE(InfoExtractor):
head_response.headers.get('Last-Modified'))
return {
'id': video_id,
- 'title': os.path.splitext(url_basename(url))[0],
+ 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
'direct': True,
'url': url,
'upload_date': upload_date,
@@ -715,11 +1277,15 @@ class GenericIE(InfoExtractor):
self.report_extraction(video_id)
- # Is it an RSS feed?
+ # Is it an RSS feed, a SMIL file or an XSPF playlist?
try:
- doc = parse_xml(webpage)
+ doc = compat_etree_fromstring(webpage.encode('utf-8'))
if doc.tag == 'rss':
return self._extract_rss(url, video_id, doc)
+ elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
+ return self._parse_smil(doc, url, video_id)
+ elif doc.tag == '{http://xspf.org/ns/0/}playlist':
+ return self.playlist_result(self._parse_xspf(doc, video_id), video_id)
except compat_xml_parse_error:
pass
@@ -731,7 +1297,7 @@ class GenericIE(InfoExtractor):
# Sometimes embedded video player is hidden behind percent encoding
# (e.g. https://github.com/rg3/youtube-dl/issues/2448)
# Unescaping the whole page allows to handle those cases in a generic way
- webpage = compat_urllib_parse.unquote(webpage)
+ webpage = compat_urllib_parse_unquote(webpage)
# it's tempting to parse this further, but you would
# have to take into account all the variations like
@@ -765,14 +1331,14 @@ class GenericIE(InfoExtractor):
return self.playlist_result(
urlrs, playlist_id=video_id, playlist_title=video_title)
- # Look for BrightCove:
- bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
+ # Look for Brightcove Legacy Studio embeds
+ bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
if bc_urls:
self.to_screen('Brightcove video detected.')
entries = [{
'_type': 'url',
'url': smuggle_url(bc_url, {'Referer': url}),
- 'ie_key': 'Brightcove'
+ 'ie_key': 'BrightcoveLegacy'
} for bc_url in bc_urls]
return {
@@ -782,19 +1348,27 @@ class GenericIE(InfoExtractor):
'entries': entries,
}
- # Look for embedded (iframe) Vimeo player
- mobj = re.search(
- r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
- if mobj:
- player_url = unescapeHTML(mobj.group('url'))
- surl = smuggle_url(player_url, {'Referer': url})
- return self.url_result(surl)
+ # Look for Brightcove New Studio embeds
+ bc_urls = BrightcoveNewIE._extract_urls(webpage)
+ if bc_urls:
+ return _playlist_from_matches(bc_urls, ie='BrightcoveNew')
- # Look for embedded (swf embed) Vimeo player
- mobj = re.search(
- r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
- if mobj:
- return self.url_result(mobj.group(1))
+ # Look for embedded rtl.nl player
+ matches = re.findall(
+ r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"',
+ webpage)
+ if matches:
+ return _playlist_from_matches(matches, ie='RtlNl')
+
+ vimeo_url = VimeoIE._extract_vimeo_url(url, webpage)
+ if vimeo_url is not None:
+ return self.url_result(vimeo_url)
+
+ vid_me_embed_url = self._search_regex(
+ r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]',
+ webpage, 'vid.me embed', default=None)
+ if vid_me_embed_url is not None:
+ return self.url_result(vid_me_embed_url, 'Vidme')
# Look for embedded YouTube player
matches = re.findall(r'''(?x)
@@ -863,12 +1437,14 @@ class GenericIE(InfoExtractor):
}
# Look for embedded blip.tv player
- mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)
- if mobj:
- return self.url_result('http://blip.tv/a/a-' + mobj.group(1), 'BlipTV')
- mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage)
- if mobj:
- return self.url_result(mobj.group(1), 'BlipTV')
+ bliptv_url = BlipTVIE._extract_url(webpage)
+ if bliptv_url:
+ return self.url_result(bliptv_url, 'BlipTV')
+
+ # Look for SVT player
+ svt_url = SVTIE._extract_url(webpage)
+ if svt_url:
+ return self.url_result(svt_url, 'SVT')
# Look for embedded condenast player
matches = re.findall(
@@ -906,10 +1482,24 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(mobj.group('url'))
+ # Look for NYTimes player
+ mobj = re.search(
+ r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>',
+ webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'))
+
+ # Look for Libsyn player
+ mobj = re.search(
+ r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'))
+
# Look for Ooyala videos
- mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
+ mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
- re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage))
+ re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or
+ re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
if mobj is not None:
return OoyalaIE._build_url_result(mobj.group('ec'))
@@ -972,7 +1562,7 @@ class GenericIE(InfoExtractor):
return self.url_result(mobj.group('url'))
mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
if mobj is not None:
- return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
+ return self.url_result(compat_urllib_parse_unquote(mobj.group('url')))
# Look for funnyordie embed
matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
@@ -990,6 +1580,32 @@ class GenericIE(InfoExtractor):
if rutv_url:
return self.url_result(rutv_url, 'RUTV')
+ # Look for embedded TVC player
+ tvc_url = TVCIE._extract_url(webpage)
+ if tvc_url:
+ return self.url_result(tvc_url, 'TVC')
+
+ # Look for embedded SportBox player
+ sportbox_urls = SportBoxEmbedIE._extract_urls(webpage)
+ if sportbox_urls:
+ return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed')
+
+ # Look for embedded PornHub player
+ pornhub_url = PornHubIE._extract_url(webpage)
+ if pornhub_url:
+ return self.url_result(pornhub_url, 'PornHub')
+
+ # Look for embedded XHamster player
+ xhamster_urls = XHamsterEmbedIE._extract_urls(webpage)
+ if xhamster_urls:
+ return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed')
+
+ # Look for embedded Tvigle player
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'Tvigle')
+
# Look for embedded TED player
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
@@ -1009,11 +1625,23 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(mobj.group('url'), 'ArteTVEmbed')
+ # Look for embedded francetv player
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?://)?embed\.francetv\.fr/\?ue=.+?)\1',
+ webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'))
+
# Look for embedded smotri.com player
smotri_url = SmotriIE._extract_url(webpage)
if smotri_url:
return self.url_result(smotri_url, 'Smotri')
+ # Look for embedded Myvi.ru player
+ myvi_url = MyviIE._extract_url(webpage)
+ if myvi_url:
+ return self.url_result(myvi_url)
+
# Look for embeded soundcloud player
mobj = re.search(
r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
@@ -1031,12 +1659,9 @@ class GenericIE(InfoExtractor):
return self.url_result(url, ie='Vulture')
# Look for embedded mtvservices player
- mobj = re.search(
- r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
- webpage)
- if mobj is not None:
- url = unescapeHTML(mobj.group('url'))
- return self.url_result(url, ie='MTVServicesEmbedded')
+ mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage)
+ if mtvservices_url:
+ return self.url_result(mtvservices_url, ie='MTVServicesEmbedded')
# Look for embedded yahoo player
mobj = re.search(
@@ -1067,11 +1692,15 @@ class GenericIE(InfoExtractor):
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
webpage)
+ if not mobj:
+ mobj = re.search(
+ r'data-video-link=["\'](?P<url>http://m.mlb.com/video/[^"\']+)',
+ webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'MLB')
mobj = re.search(
- r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
+ r'<(?:iframe|script)[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
webpage)
if mobj is not None:
return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
@@ -1082,6 +1711,102 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(mobj.group('url'), 'Livestream')
+ # Look for Zapiks embed
+ mobj = re.search(
+ r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'Zapiks')
+
+ # Look for Kaltura embeds
+ mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_?[Ii]d'\s*:\s*'(?P<id>[^']+)',", webpage) or
+ re.search(r'(?s)(?P<q1>["\'])(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?(?P=q1).*?entry_?[Ii]d\s*:\s*(?P<q2>["\'])(?P<id>.+?)(?P=q2)', webpage))
+ if mobj is not None:
+ return self.url_result(smuggle_url(
+ 'kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(),
+ {'source_url': url}), 'Kaltura')
+
+ # Look for Eagle.Platform embeds
+ mobj = re.search(
+ r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'EaglePlatform')
+
+ # Look for ClipYou (uses Eagle.Platform) embeds
+ mobj = re.search(
+ r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
+ if mobj is not None:
+ return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')
+
+ # Look for Pladform embeds
+ mobj = re.search(
+ r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'Pladform')
+
+ # Look for Playwire embeds
+ mobj = re.search(
+ r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'))
+
+ # Look for 5min embeds
+ mobj = re.search(
+ r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
+ if mobj is not None:
+ return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')
+
+ # Look for Crooks and Liars embeds
+ mobj = re.search(
+ r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'))
+
+ # Look for NBC Sports VPlayer embeds
+ nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
+ if nbc_sports_url:
+ return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
+
+ # Look for UDN embeds
+ mobj = re.search(
+ r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage)
+ if mobj is not None:
+ return self.url_result(
+ compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed')
+
+ # Look for Senate ISVP iframe
+ senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
+ if senate_isvp_url:
+ return self.url_result(senate_isvp_url, 'SenateISVP')
+
+ # Look for Dailymotion Cloud videos
+ dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage)
+ if dmcloud_url:
+ return self.url_result(dmcloud_url, 'DailymotionCloud')
+
+ # Look for OnionStudios embeds
+ onionstudios_url = OnionStudiosIE._extract_url(webpage)
+ if onionstudios_url:
+ return self.url_result(onionstudios_url)
+
+ # Look for SnagFilms embeds
+ snagfilms_url = SnagFilmsEmbedIE._extract_url(webpage)
+ if snagfilms_url:
+ return self.url_result(snagfilms_url)
+
+ # Look for ScreenwaveMedia embeds
+ mobj = re.search(ScreenwaveMediaIE.EMBED_PATTERN, webpage)
+ if mobj is not None:
+ return self.url_result(unescapeHTML(mobj.group('url')), 'ScreenwaveMedia')
+
+ # Look for AdobeTVVideo embeds
+ mobj = re.search(
+ r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]',
+ webpage)
+ if mobj is not None:
+ return self.url_result(
+ self._proto_relative_url(unescapeHTML(mobj.group(1))),
+ 'AdobeTVVideo')
+
def check_video(vurl):
if YoutubeIE.suitable(vurl):
return True
@@ -1110,7 +1835,7 @@ class GenericIE(InfoExtractor):
if not found:
# Broaden the findall a little bit: JWPlayer JS loader
found = filter_video(re.findall(
- r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
+ r'[^A-Za-z0-9]?(?:file|video_url)["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
if not found:
# Flow player
found = filter_video(re.findall(r'''(?xs)
@@ -1136,14 +1861,23 @@ class GenericIE(InfoExtractor):
found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
if not found:
# HTML5 video
- found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
+ found = re.findall(r'(?s)<(?:video|audio)[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
if not found:
+ REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
found = re.search(
r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
- r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'?([^\'"]+)',
+ r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
webpage)
+ if not found:
+ # Look also in Refresh HTTP header
+ refresh_header = head_response.headers.get('Refresh')
+ if refresh_header:
+ # In Python 2, HTTP response headers are bytestrings
+ if sys.version_info < (3, 0) and isinstance(refresh_header, str):
+ refresh_header = refresh_header.decode('iso-8859-1')
+ found = re.search(REDIRECT_REGEX, refresh_header)
if found:
- new_url = found.group(1)
+ new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1)))
self.report_following_redirect(new_url)
return {
'_type': 'url',
@@ -1154,8 +1888,9 @@ class GenericIE(InfoExtractor):
entries = []
for video_url in found:
+ video_url = video_url.replace('\\/', '/')
video_url = compat_urlparse.urljoin(url, video_url)
- video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
+ video_id = compat_urllib_parse_unquote(os.path.basename(video_url))
# Sometimes, jwplayer extraction will result in a YouTube URL
if YoutubeIE.suitable(video_url):
@@ -1165,19 +1900,32 @@ class GenericIE(InfoExtractor):
# here's a fun little line of code for you:
video_id = os.path.splitext(video_id)[0]
- entries.append({
+ entry_info_dict = {
'id': video_id,
- 'url': video_url,
'uploader': video_uploader,
'title': video_title,
'age_limit': age_limit,
- })
+ }
+
+ ext = determine_ext(video_url)
+ if ext == 'smil':
+ entry_info_dict['formats'] = self._extract_smil_formats(video_url, video_id)
+ elif ext == 'xspf':
+ return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id)
+ elif ext == 'm3u8':
+ entry_info_dict['formats'] = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
+ else:
+ entry_info_dict['url'] = video_url
+
+ entries.append(entry_info_dict)
if len(entries) == 1:
return entries[0]
else:
for num, e in enumerate(entries, start=1):
- e['title'] = '%s (%d)' % (e['title'], num)
+ # 'url' results don't have a title
+ if e.get('title') is not None:
+ e['title'] = '%s (%d)' % (e['title'], num)
return {
'_type': 'playlist',
'entries': entries,
diff --git a/youtube_dl/extractor/gfycat.py b/youtube_dl/extractor/gfycat.py
new file mode 100644
index 000000000..884700c52
--- /dev/null
+++ b/youtube_dl/extractor/gfycat.py
@@ -0,0 +1,120 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    float_or_none,
+    qualities,
+    ExtractorError,
+)
+
+
+class GfycatIE(InfoExtractor):
+    """Extractor for gfycat.com items, including ``/ifr/`` URLs."""
+
+    _VALID_URL = r'https?://(?:www\.)?gfycat\.com/(?:ifr/)?(?P<id>[^/?#]+)'
+    _TESTS = [{
+        'url': 'http://gfycat.com/DeadlyDecisiveGermanpinscher',
+        'info_dict': {
+            'id': 'DeadlyDecisiveGermanpinscher',
+            'ext': 'mp4',
+            'title': 'Ghost in the Shell',
+            'timestamp': 1410656006,
+            'upload_date': '20140914',
+            'uploader': 'anonymous',
+            'duration': 10.4,
+            'view_count': int,
+            'like_count': int,
+            'dislike_count': int,
+            'categories': list,
+            'age_limit': 0,
+        }
+    }, {
+        'url': 'http://gfycat.com/ifr/JauntyTimelyAmazontreeboa',
+        'info_dict': {
+            'id': 'JauntyTimelyAmazontreeboa',
+            'ext': 'mp4',
+            'title': 'JauntyTimelyAmazontreeboa',
+            'timestamp': 1411720126,
+            'upload_date': '20140926',
+            'uploader': 'anonymous',
+            'duration': 3.52,
+            'view_count': int,
+            'like_count': int,
+            'dislike_count': int,
+            'categories': list,
+            'age_limit': 0,
+        }
+    }]
+
+    def _real_extract(self, url):
+        """Fetch the item's JSON description and build its info dict.
+
+        Raises ExtractorError (expected=True) when the response contains
+        an 'error' key.
+        """
+        video_id = self._match_id(url)
+
+        gfy = self._download_json(
+            'http://gfycat.com/cajax/get/%s' % video_id,
+            video_id, 'Downloading video info')
+        if 'error' in gfy:
+            raise ExtractorError('Gfycat said: ' + gfy['error'], expected=True)
+        gfy = gfy['gfyItem']
+
+        # Untitled items fall back to their gfyName
+        title = gfy.get('title') or gfy['gfyName']
+        description = gfy.get('description')
+        timestamp = int_or_none(gfy.get('createDate'))
+        uploader = gfy.get('userName')
+        view_count = int_or_none(gfy.get('views'))
+        like_count = int_or_none(gfy.get('likes'))
+        dislike_count = int_or_none(gfy.get('dislikes'))
+        age_limit = 18 if gfy.get('nsfw') == '1' else 0
+
+        width = int_or_none(gfy.get('width'))
+        height = int_or_none(gfy.get('height'))
+        fps = int_or_none(gfy.get('frameRate'))
+        num_frames = int_or_none(gfy.get('numFrames'))
+
+        # Only computed when both frame count and frame rate are non-zero
+        duration = float_or_none(num_frames, fps) if num_frames and fps else None
+
+        categories = gfy.get('tags') or gfy.get('extraLemmas') or []
+
+        FORMATS = ('gif', 'webm', 'mp4')
+        quality = qualities(FORMATS)
+
+        formats = []
+        for format_id in FORMATS:
+            video_url = gfy.get('%sUrl' % format_id)
+            if not video_url:
+                continue
+            # Normalised like the other numeric fields (width, height, fps)
+            filesize = int_or_none(gfy.get('%sSize' % format_id))
+            formats.append({
+                'url': video_url,
+                'format_id': format_id,
+                'width': width,
+                'height': height,
+                'fps': fps,
+                'filesize': filesize,
+                'quality': quality(format_id),
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'timestamp': timestamp,
+            'uploader': uploader,
+            'duration': duration,
+            'view_count': view_count,
+            'like_count': like_count,
+            'dislike_count': dislike_count,
+            'categories': categories,
+            'age_limit': age_limit,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/giga.py b/youtube_dl/extractor/giga.py
index 775890112..28eb733e2 100644
--- a/youtube_dl/extractor/giga.py
+++ b/youtube_dl/extractor/giga.py
@@ -85,7 +85,8 @@ class GigaIE(InfoExtractor):
r'class="author">([^<]+)</a>', webpage, 'uploader', fatal=False)
view_count = str_to_int(self._search_regex(
- r'<span class="views"><strong>([\d.]+)</strong>', webpage, 'view count', fatal=False))
+ r'<span class="views"><strong>([\d.,]+)</strong>',
+ webpage, 'view count', fatal=False))
return {
'id': video_id,
diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py
index 29638a194..c65ef6bcf 100644
--- a/youtube_dl/extractor/globo.py
+++ b/youtube_dl/extractor/globo.py
@@ -13,79 +13,59 @@ from ..compat import (
from ..utils import (
ExtractorError,
float_or_none,
+ int_or_none,
+ str_or_none,
)
class GloboIE(InfoExtractor):
- _VALID_URL = 'https?://.+?\.globo\.com/(?P<id>.+)'
+ _VALID_URL = '(?:globo:|https?://.+?\.globo\.com/(?:[^/]+/)*(?:v/(?:[^/]+/)?|videos/))(?P<id>\d{7,})'
_API_URL_TEMPLATE = 'http://api.globovideos.com/videos/%s/playlist'
- _SECURITY_URL_TEMPLATE = 'http://security.video.globo.com/videos/%s/hash?player=flash&version=2.9.9.50&resource_id=%s'
-
- _VIDEOID_REGEXES = [
- r'\bdata-video-id="(\d+)"',
- r'\bdata-player-videosids="(\d+)"',
- r'<div[^>]+\bid="(\d+)"',
- ]
+ _SECURITY_URL_TEMPLATE = 'http://security.video.globo.com/videos/%s/hash?player=flash&version=17.0.0.132&resource_id=%s'
_RESIGN_EXPIRATION = 86400
- _TESTS = [
- {
- 'url': 'http://globotv.globo.com/sportv/futebol-nacional/v/os-gols-de-atletico-mg-3-x-2-santos-pela-24a-rodada-do-brasileirao/3654973/',
- 'md5': '03ebf41cb7ade43581608b7d9b71fab0',
- 'info_dict': {
- 'id': '3654973',
- 'ext': 'mp4',
- 'title': 'Os gols de Atlético-MG 3 x 2 Santos pela 24ª rodada do Brasileirão',
- 'duration': 251.585,
- 'uploader': 'SporTV',
- 'uploader_id': 698,
- 'like_count': int,
- }
+ _TESTS = [{
+ 'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/',
+ 'md5': 'b3ccc801f75cd04a914d51dadb83a78d',
+ 'info_dict': {
+ 'id': '3607726',
+ 'ext': 'mp4',
+ 'title': 'Mercedes-Benz GLA passa por teste de colisão na Europa',
+ 'duration': 103.204,
+ 'uploader': 'Globo.com',
+ 'uploader_id': '265',
},
- {
- 'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/',
- 'md5': 'b3ccc801f75cd04a914d51dadb83a78d',
- 'info_dict': {
- 'id': '3607726',
- 'ext': 'mp4',
- 'title': 'Mercedes-Benz GLA passa por teste de colisão na Europa',
- 'duration': 103.204,
- 'uploader': 'Globo.com',
- 'uploader_id': 265,
- 'like_count': int,
- }
+ }, {
+ 'url': 'http://globoplay.globo.com/v/4581987/',
+ 'md5': 'f36a1ecd6a50da1577eee6dd17f67eff',
+ 'info_dict': {
+ 'id': '4581987',
+ 'ext': 'mp4',
+ 'title': 'Acidentes de trânsito estão entre as maiores causas de queda de energia em SP',
+ 'duration': 137.973,
+ 'uploader': 'Rede Globo',
+ 'uploader_id': '196',
},
- {
- 'url': 'http://g1.globo.com/jornal-nacional/noticia/2014/09/novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes.html',
- 'md5': '307fdeae4390ccfe6ba1aa198cf6e72b',
- 'info_dict': {
- 'id': '3652183',
- 'ext': 'mp4',
- 'title': 'Receita Federal explica como vai fiscalizar bagagens de quem retorna ao Brasil de avião',
- 'duration': 110.711,
- 'uploader': 'Rede Globo',
- 'uploader_id': 196,
- 'like_count': int,
- }
- },
- {
- 'url': 'http://globotv.globo.com/canal-brasil/sangue-latino/t/todos-os-videos/v/ator-e-diretor-argentino-ricado-darin-fala-sobre-utopias-e-suas-perdas/3928201/',
- 'md5': 'c1defca721ce25b2354e927d3e4b3dec',
- 'info_dict': {
- 'id': '3928201',
- 'ext': 'mp4',
- 'title': 'Ator e diretor argentino, Ricado Darín fala sobre utopias e suas perdas',
- 'duration': 1472.906,
- 'uploader': 'Canal Brasil',
- 'uploader_id': 705,
- 'like_count': int,
- }
- },
- ]
-
- class MD5():
+ }, {
+ 'url': 'http://canalbrasil.globo.com/programas/sangue-latino/videos/3928201.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://globosatplay.globo.com/globonews/v/4472924/',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://globotv.globo.com/t/programa/v/clipe-sexo-e-as-negas-adeus/3836166/',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://globotv.globo.com/canal-brasil/sangue-latino/t/todos-os-videos/v/ator-e-diretor-argentino-ricado-darin-fala-sobre-utopias-e-suas-perdas/3928201/',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://canaloff.globo.com/programas/desejar-profundo/videos/4518560.html',
+ 'only_matching': True,
+ }]
+
+ class MD5:
HEX_FORMAT_LOWERCASE = 0
HEX_FORMAT_UPPERCASE = 1
BASE64_PAD_CHARACTER_DEFAULT_COMPLIANCE = ''
@@ -352,23 +332,15 @@ class GloboIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
- video_id = self._search_regex(self._VIDEOID_REGEXES, webpage, 'video id')
-
video = self._download_json(
self._API_URL_TEMPLATE % video_id, video_id)['videos'][0]
title = video['title']
- duration = float_or_none(video['duration'], 1000)
- like_count = video['likes']
- uploader = video['channel']
- uploader_id = video['channel_id']
formats = []
-
for resource in video['resources']:
resource_id = resource.get('_id')
- if not resource_id:
+ if not resource_id or resource_id.endswith('manifest'):
continue
security = self._download_json(
@@ -397,22 +369,70 @@ class GloboIE(InfoExtractor):
resource_url = resource['url']
signed_url = '%s?h=%s&k=%s' % (resource_url, signed_hash, 'flash')
if resource_id.endswith('m3u8') or resource_url.endswith('.m3u8'):
- formats.extend(self._extract_m3u8_formats(signed_url, resource_id, 'mp4'))
+ m3u8_formats = self._extract_m3u8_formats(
+ signed_url, resource_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False)
+ if m3u8_formats:
+ formats.extend(m3u8_formats)
else:
formats.append({
'url': signed_url,
- 'format_id': resource_id,
- 'height': resource.get('height'),
+ 'format_id': 'http-%s' % resource_id,
+ 'height': int_or_none(resource.get('height')),
})
self._sort_formats(formats)
+ duration = float_or_none(video.get('duration'), 1000)
+ uploader = video.get('channel')
+ uploader_id = str_or_none(video.get('channel_id'))
+
return {
'id': video_id,
'title': title,
'duration': duration,
'uploader': uploader,
'uploader_id': uploader_id,
- 'like_count': like_count,
'formats': formats
}
+
+
+class GloboArticleIE(InfoExtractor):
+    """Resolve globo.com ``.html`` article pages to the video they embed.
+
+    The page is searched for a numeric video id, and extraction is then
+    delegated through a ``globo:<id>`` URL with the 'Globo' ie key.
+    """
+
+    # Raw string so that ``\.`` reaches the regex engine verbatim instead of
+    # being parsed as an (invalid) string escape sequence
+    _VALID_URL = r'https?://.+?\.globo\.com/(?:[^/]+/)*(?P<id>[^/]+)\.html'
+
+    # Patterns passed to _search_regex to locate the embedded video id
+    # (7 or more digits)
+    _VIDEOID_REGEXES = [
+        r'\bdata-video-id=["\'](\d{7,})',
+        r'\bdata-player-videosids=["\'](\d{7,})',
+        r'\bvideosIDs\s*:\s*["\'](\d{7,})',
+        r'\bdata-id=["\'](\d{7,})',
+        r'<div[^>]+\bid=["\'](\d{7,})',
+    ]
+
+    _TESTS = [{
+        'url': 'http://g1.globo.com/jornal-nacional/noticia/2014/09/novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes.html',
+        'md5': '307fdeae4390ccfe6ba1aa198cf6e72b',
+        'info_dict': {
+            'id': '3652183',
+            'ext': 'mp4',
+            'title': 'Receita Federal explica como vai fiscalizar bagagens de quem retorna ao Brasil de avião',
+            'duration': 110.711,
+            'uploader': 'Rede Globo',
+            'uploader_id': '196',
+        }
+    }, {
+        'url': 'http://gq.globo.com/Prazeres/Poder/noticia/2015/10/all-o-desafio-assista-ao-segundo-capitulo-da-serie.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://gshow.globo.com/programas/tv-xuxa/O-Programa/noticia/2014/01/xuxa-e-junno-namoram-muuuito-em-luau-de-zeze-di-camargo-e-luciano.html',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        """Accept *url* only when GloboIE does not already match it."""
+        return False if GloboIE.suitable(url) else super(GloboArticleIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        """Download the article, find the video id and return a url result."""
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        video_id = self._search_regex(self._VIDEOID_REGEXES, webpage, 'video id')
+        return self.url_result('globo:%s' % video_id, 'Globo')
diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py
index fcefe54cd..731bacd67 100644
--- a/youtube_dl/extractor/googleplus.py
+++ b/youtube_dl/extractor/googleplus.py
@@ -61,7 +61,7 @@ class GooglePlusIE(InfoExtractor):
'width': int(width),
'height': int(height),
} for width, height, video_url in re.findall(
- r'\d+,(\d+),(\d+),"(https?://redirector\.googlevideo\.com.*?)"', webpage)]
+ r'\d+,(\d+),(\d+),"(https?://[^.]+\.googleusercontent.com.*?)"', webpage)]
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/grooveshark.py b/youtube_dl/extractor/grooveshark.py
deleted file mode 100644
index 848d17beb..000000000
--- a/youtube_dl/extractor/grooveshark.py
+++ /dev/null
@@ -1,191 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import time
-import math
-import os.path
-import re
-
-
-from .common import InfoExtractor
-from ..compat import (
- compat_html_parser,
- compat_urllib_parse,
- compat_urllib_request,
- compat_urlparse,
-)
-from ..utils import ExtractorError
-
-
-class GroovesharkHtmlParser(compat_html_parser.HTMLParser):
- def __init__(self):
- self._current_object = None
- self.objects = []
- compat_html_parser.HTMLParser.__init__(self)
-
- def handle_starttag(self, tag, attrs):
- attrs = dict((k, v) for k, v in attrs)
- if tag == 'object':
- self._current_object = {'attrs': attrs, 'params': []}
- elif tag == 'param':
- self._current_object['params'].append(attrs)
-
- def handle_endtag(self, tag):
- if tag == 'object':
- self.objects.append(self._current_object)
- self._current_object = None
-
- @classmethod
- def extract_object_tags(cls, html):
- p = cls()
- p.feed(html)
- p.close()
- return p.objects
-
-
-class GroovesharkIE(InfoExtractor):
- _VALID_URL = r'https?://(www\.)?grooveshark\.com/#!/s/([^/]+)/([^/]+)'
- _TEST = {
- 'url': 'http://grooveshark.com/#!/s/Jolene+Tenth+Key+Remix+Ft+Will+Sessions/6SS1DW?src=5',
- 'md5': '7ecf8aefa59d6b2098517e1baa530023',
- 'info_dict': {
- 'id': '6SS1DW',
- 'title': 'Jolene (Tenth Key Remix ft. Will Sessions)',
- 'ext': 'mp3',
- 'duration': 227,
- }
- }
-
- do_playerpage_request = True
- do_bootstrap_request = True
-
- def _parse_target(self, target):
- uri = compat_urlparse.urlparse(target)
- hash = uri.fragment[1:].split('?')[0]
- token = os.path.basename(hash.rstrip('/'))
- return (uri, hash, token)
-
- def _build_bootstrap_url(self, target):
- (uri, hash, token) = self._parse_target(target)
- query = 'getCommunicationToken=1&hash=%s&%d' % (compat_urllib_parse.quote(hash, safe=''), self.ts)
- return (compat_urlparse.urlunparse((uri.scheme, uri.netloc, '/preload.php', None, query, None)), token)
-
- def _build_meta_url(self, target):
- (uri, hash, token) = self._parse_target(target)
- query = 'hash=%s&%d' % (compat_urllib_parse.quote(hash, safe=''), self.ts)
- return (compat_urlparse.urlunparse((uri.scheme, uri.netloc, '/preload.php', None, query, None)), token)
-
- def _build_stream_url(self, meta):
- return compat_urlparse.urlunparse(('http', meta['streamKey']['ip'], '/stream.php', None, None, None))
-
- def _build_swf_referer(self, target, obj):
- (uri, _, _) = self._parse_target(target)
- return compat_urlparse.urlunparse((uri.scheme, uri.netloc, obj['attrs']['data'], None, None, None))
-
- def _transform_bootstrap(self, js):
- return re.split('(?m)^\s*try\s*\{', js)[0] \
- .split(' = ', 1)[1].strip().rstrip(';')
-
- def _transform_meta(self, js):
- return js.split('\n')[0].split('=')[1].rstrip(';')
-
- def _get_meta(self, target):
- (meta_url, token) = self._build_meta_url(target)
- self.to_screen('Metadata URL: %s' % meta_url)
-
- headers = {'Referer': compat_urlparse.urldefrag(target)[0]}
- req = compat_urllib_request.Request(meta_url, headers=headers)
- res = self._download_json(req, token,
- transform_source=self._transform_meta)
-
- if 'getStreamKeyWithSong' not in res:
- raise ExtractorError(
- 'Metadata not found. URL may be malformed, or Grooveshark API may have changed.')
-
- if res['getStreamKeyWithSong'] is None:
- raise ExtractorError(
- 'Metadata download failed, probably due to Grooveshark anti-abuse throttling. Wait at least an hour before retrying from this IP.',
- expected=True)
-
- return res['getStreamKeyWithSong']
-
- def _get_bootstrap(self, target):
- (bootstrap_url, token) = self._build_bootstrap_url(target)
-
- headers = {'Referer': compat_urlparse.urldefrag(target)[0]}
- req = compat_urllib_request.Request(bootstrap_url, headers=headers)
- res = self._download_json(req, token, fatal=False,
- note='Downloading player bootstrap data',
- errnote='Unable to download player bootstrap data',
- transform_source=self._transform_bootstrap)
- return res
-
- def _get_playerpage(self, target):
- (_, _, token) = self._parse_target(target)
-
- webpage = self._download_webpage(
- target, token,
- note='Downloading player page',
- errnote='Unable to download player page',
- fatal=False)
-
- if webpage is not None:
- # Search (for example German) error message
- error_msg = self._html_search_regex(
- r'<div id="content">\s*<h2>(.*?)</h2>', webpage,
- 'error message', default=None)
- if error_msg is not None:
- error_msg = error_msg.replace('\n', ' ')
- raise ExtractorError('Grooveshark said: %s' % error_msg)
-
- if webpage is not None:
- o = GroovesharkHtmlParser.extract_object_tags(webpage)
- return (webpage, [x for x in o if x['attrs']['id'] == 'jsPlayerEmbed'])
-
- return (webpage, None)
-
- def _real_initialize(self):
- self.ts = int(time.time() * 1000) # timestamp in millis
-
- def _real_extract(self, url):
- (target_uri, _, token) = self._parse_target(url)
-
- # 1. Fill cookiejar by making a request to the player page
- swf_referer = None
- if self.do_playerpage_request:
- (_, player_objs) = self._get_playerpage(url)
- if player_objs is not None:
- swf_referer = self._build_swf_referer(url, player_objs[0])
- self.to_screen('SWF Referer: %s' % swf_referer)
-
- # 2. Ask preload.php for swf bootstrap data to better mimic webapp
- if self.do_bootstrap_request:
- bootstrap = self._get_bootstrap(url)
- self.to_screen('CommunicationToken: %s' % bootstrap['getCommunicationToken'])
-
- # 3. Ask preload.php for track metadata.
- meta = self._get_meta(url)
-
- # 4. Construct stream request for track.
- stream_url = self._build_stream_url(meta)
- duration = int(math.ceil(float(meta['streamKey']['uSecs']) / 1000000))
- post_dict = {'streamKey': meta['streamKey']['streamKey']}
- post_data = compat_urllib_parse.urlencode(post_dict).encode('utf-8')
- headers = {
- 'Content-Length': len(post_data),
- 'Content-Type': 'application/x-www-form-urlencoded'
- }
- if swf_referer is not None:
- headers['Referer'] = swf_referer
-
- return {
- 'id': token,
- 'title': meta['song']['Name'],
- 'http_method': 'POST',
- 'url': stream_url,
- 'ext': 'mp3',
- 'format': 'mp3 audio',
- 'duration': duration,
- 'http_post_data': post_data,
- 'http_headers': headers,
- }
diff --git a/youtube_dl/extractor/hearthisat.py b/youtube_dl/extractor/hearthisat.py
index a19b31ac0..7d8698655 100644
--- a/youtube_dl/extractor/hearthisat.py
+++ b/youtube_dl/extractor/hearthisat.py
@@ -4,12 +4,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_request,
- compat_urlparse,
-)
+from ..compat import compat_urlparse
from ..utils import (
HEADRequest,
+ sanitized_Request,
str_to_int,
urlencode_postdata,
urlhandle_detect_ext,
@@ -47,7 +45,7 @@ class HearThisAtIE(InfoExtractor):
r'intTrackId\s*=\s*(\d+)', webpage, 'track ID')
payload = urlencode_postdata({'tracks[]': track_id})
- req = compat_urllib_request.Request(self._PLAYLIST_URL, payload)
+ req = sanitized_Request(self._PLAYLIST_URL, payload)
req.add_header('Content-type', 'application/x-www-form-urlencoded')
track = self._download_json(req, track_id, 'Downloading playlist')[0]
diff --git a/youtube_dl/extractor/hentaistigma.py b/youtube_dl/extractor/hentaistigma.py
index 63d87b74c..f5aa73d18 100644
--- a/youtube_dl/extractor/hentaistigma.py
+++ b/youtube_dl/extractor/hentaistigma.py
@@ -1,7 +1,5 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
@@ -19,20 +17,19 @@ class HentaiStigmaIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(
- r'<h2 class="posttitle"><a[^>]*>([^<]+)</a>',
+ r'<h2[^>]+class="posttitle"[^>]*><a[^>]*>([^<]+)</a>',
webpage, 'title')
wrap_url = self._html_search_regex(
- r'<iframe src="([^"]+mp4)"', webpage, 'wrapper url')
+ r'<iframe[^>]+src="([^"]+mp4)"', webpage, 'wrapper url')
wrap_webpage = self._download_webpage(wrap_url, video_id)
video_url = self._html_search_regex(
- r'clip:\s*{\s*url: "([^"]*)"', wrap_webpage, 'video url')
+ r'file\s*:\s*"([^"]+)"', wrap_webpage, 'video url')
return {
'id': video_id,
diff --git a/youtube_dl/extractor/historicfilms.py b/youtube_dl/extractor/historicfilms.py
index 40afbe537..6a36933ac 100644
--- a/youtube_dl/extractor/historicfilms.py
+++ b/youtube_dl/extractor/historicfilms.py
@@ -25,7 +25,8 @@ class HistoricFilmsIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
tape_id = self._search_regex(
- r'class="tapeId">([^<]+)<', webpage, 'tape id')
+ [r'class="tapeId"[^>]*>([^<]+)<', r'tapeId\s*:\s*"([^"]+)"'],
+ webpage, 'tape id')
title = self._og_search_title(webpage)
description = self._og_search_description(webpage)
diff --git a/youtube_dl/extractor/history.py b/youtube_dl/extractor/history.py
new file mode 100644
index 000000000..f86164afe
--- /dev/null
+++ b/youtube_dl/extractor/history.py
@@ -0,0 +1,31 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import smuggle_url
+
+
+ class HistoryIE(InfoExtractor):
+     """Resolve a history.com video page to the release URL embedded in the page."""
+     _VALID_URL = r'https?://(?:www\.)?history\.com/(?:[^/]+/)+(?P<id>[^/]+?)(?:$|[?#])'
+     _TESTS = [{
+         'url': 'http://www.history.com/topics/valentines-day/history-of-valentines-day/videos/bet-you-didnt-know-valentines-day?m=528e394da93ae&s=undefined&f=1&free=false',
+         'md5': '6fe632d033c92aa10b8d4a9be047a7c5',
+         'info_dict': {
+             'id': 'bLx5Dv5Aka1G',
+             'ext': 'mp4',
+             'title': "Bet You Didn't Know: Valentine's Day",
+             'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7',
+         },
+         'add_ie': ['ThePlatform'],
+     }]
+
+     def _real_extract(self, url):
+         import re  # function-local: `re` is not imported at module level in this file
+         video_id = self._match_id(url)
+         webpage = self._download_webpage(url, video_id)
+         # The id is the last URL path segment; escape it so it is matched literally.
+         video_url = self._search_regex(
+             r'data-href="[^"]*/%s"[^>]+data-release-url="([^"]+)"' % re.escape(video_id),
+             webpage, 'video url')
+         # Delegate to another extractor, smuggling the 'sig' key/secret with the URL.
+         return self.url_result(smuggle_url(video_url, {'sig': {'key': 'crazyjava', 'secret': 's3cr3t'}}))
diff --git a/youtube_dl/extractor/hitbox.py b/youtube_dl/extractor/hitbox.py
index 84bd7c080..421f55bbe 100644
--- a/youtube_dl/extractor/hitbox.py
+++ b/youtube_dl/extractor/hitbox.py
@@ -10,6 +10,7 @@ from ..utils import (
float_or_none,
int_or_none,
compat_str,
+ determine_ext,
)
@@ -42,7 +43,8 @@ class HitboxIE(InfoExtractor):
def _extract_metadata(self, url, video_id):
thumb_base = 'https://edge.sf.hitbox.tv'
metadata = self._download_json(
- '%s/%s' % (url, video_id), video_id)
+ '%s/%s' % (url, video_id), video_id,
+ 'Downloading metadata JSON')
date = 'media_live_since'
media_type = 'livestream'
@@ -87,21 +89,41 @@ class HitboxIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- metadata = self._extract_metadata(
- 'https://www.hitbox.tv/api/media/video',
- video_id)
-
player_config = self._download_json(
'https://www.hitbox.tv/api/player/config/video/%s' % video_id,
- video_id)
+ video_id, 'Downloading video JSON')
- clip = player_config.get('clip')
- video_url = clip.get('url')
- res = clip.get('bitrates', [])[0].get('label')
+ formats = []
+ for video in player_config['clip']['bitrates']:
+ label = video.get('label')
+ if label == 'Auto':
+ continue
+ video_url = video.get('url')
+ if not video_url:
+ continue
+ bitrate = int_or_none(video.get('bitrate'))
+ if determine_ext(video_url) == 'm3u8':
+ if not video_url.startswith('http'):
+ continue
+ formats.append({
+ 'url': video_url,
+ 'ext': 'mp4',
+ 'tbr': bitrate,
+ 'format_note': label,
+ 'protocol': 'm3u8_native',
+ })
+ else:
+ formats.append({
+ 'url': video_url,
+ 'tbr': bitrate,
+ 'format_note': label,
+ })
+ self._sort_formats(formats)
- metadata['resolution'] = res
- metadata['url'] = video_url
- metadata['protocol'] = 'm3u8'
+ metadata = self._extract_metadata(
+ 'https://www.hitbox.tv/api/media/video',
+ video_id)
+ metadata['formats'] = formats
return metadata
@@ -129,10 +151,6 @@ class HitboxLiveIE(HitboxIE):
def _real_extract(self, url):
video_id = self._match_id(url)
- metadata = self._extract_metadata(
- 'https://www.hitbox.tv/api/media/live',
- video_id)
-
player_config = self._download_json(
'https://www.hitbox.tv/api/player/config/live/%s' % video_id,
video_id)
@@ -147,20 +165,39 @@ class HitboxLiveIE(HitboxIE):
servers.append(base_url)
for stream in cdn.get('bitrates'):
label = stream.get('label')
- if label != 'Auto':
+ if label == 'Auto':
+ continue
+ stream_url = stream.get('url')
+ if not stream_url:
+ continue
+ bitrate = int_or_none(stream.get('bitrate'))
+ if stream.get('provider') == 'hls' or determine_ext(stream_url) == 'm3u8':
+ if not stream_url.startswith('http'):
+ continue
+ formats.append({
+ 'url': stream_url,
+ 'ext': 'mp4',
+ 'tbr': bitrate,
+ 'format_note': label,
+ 'rtmp_live': True,
+ })
+ else:
formats.append({
- 'url': '%s/%s' % (base_url, stream.get('url')),
+ 'url': '%s/%s' % (base_url, stream_url),
'ext': 'mp4',
- 'vbr': stream.get('bitrate'),
- 'resolution': label,
+ 'tbr': bitrate,
'rtmp_live': True,
'format_note': host,
'page_url': url,
'player_url': 'http://www.hitbox.tv/static/player/flowplayer/flowplayer.commercial-3.2.16.swf',
})
-
self._sort_formats(formats)
+
+ metadata = self._extract_metadata(
+ 'https://www.hitbox.tv/api/media/live',
+ video_id)
metadata['formats'] = formats
metadata['is_live'] = True
metadata['title'] = self._live_title(metadata.get('title'))
+
return metadata
diff --git a/youtube_dl/extractor/hostingbulk.py b/youtube_dl/extractor/hostingbulk.py
deleted file mode 100644
index 704d0285d..000000000
--- a/youtube_dl/extractor/hostingbulk.py
+++ /dev/null
@@ -1,84 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..compat import (
- compat_urllib_request,
-)
-from ..utils import (
- ExtractorError,
- int_or_none,
- urlencode_postdata,
-)
-
-
-class HostingBulkIE(InfoExtractor):
- _VALID_URL = r'''(?x)
- https?://(?:www\.)?hostingbulk\.com/
- (?:embed-)?(?P<id>[A-Za-z0-9]{12})(?:-\d+x\d+)?\.html'''
- _FILE_DELETED_REGEX = r'<b>File Not Found</b>'
- _TEST = {
- 'url': 'http://hostingbulk.com/n0ulw1hv20fm.html',
- 'md5': '6c8653c8ecf7ebfa83b76e24b7b2fe3f',
- 'info_dict': {
- 'id': 'n0ulw1hv20fm',
- 'ext': 'mp4',
- 'title': 'md5:5afeba33f48ec87219c269e054afd622',
- 'filesize': 6816081,
- 'thumbnail': 're:^http://.*\.jpg$',
- }
- }
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- url = 'http://hostingbulk.com/{0:}.html'.format(video_id)
-
- # Custom request with cookie to set language to English, so our file
- # deleted regex would work.
- request = compat_urllib_request.Request(
- url, headers={'Cookie': 'lang=english'})
- webpage = self._download_webpage(request, video_id)
-
- if re.search(self._FILE_DELETED_REGEX, webpage) is not None:
- raise ExtractorError('Video %s does not exist' % video_id,
- expected=True)
-
- title = self._html_search_regex(r'<h3>(.*?)</h3>', webpage, 'title')
- filesize = int_or_none(
- self._search_regex(
- r'<small>\((\d+)\sbytes?\)</small>',
- webpage,
- 'filesize',
- fatal=False
- )
- )
- thumbnail = self._search_regex(
- r'<img src="([^"]+)".+?class="pic"',
- webpage, 'thumbnail', fatal=False)
-
- fields = dict(re.findall(r'''(?x)<input\s+
- type="hidden"\s+
- name="([^"]+)"\s+
- value="([^"]*)"
- ''', webpage))
-
- request = compat_urllib_request.Request(url, urlencode_postdata(fields))
- request.add_header('Content-type', 'application/x-www-form-urlencoded')
- response = self._request_webpage(request, video_id,
- 'Submiting download request')
- video_url = response.geturl()
-
- formats = [{
- 'format_id': 'sd',
- 'filesize': filesize,
- 'url': video_url,
- }]
-
- return {
- 'id': video_id,
- 'title': title,
- 'thumbnail': thumbnail,
- 'formats': formats,
- }
diff --git a/youtube_dl/extractor/hotnewhiphop.py b/youtube_dl/extractor/hotnewhiphop.py
index 651784b73..31e219945 100644
--- a/youtube_dl/extractor/hotnewhiphop.py
+++ b/youtube_dl/extractor/hotnewhiphop.py
@@ -3,13 +3,11 @@ from __future__ import unicode_literals
import base64
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
- compat_urllib_request,
-)
+from ..compat import compat_urllib_parse
from ..utils import (
ExtractorError,
HEADRequest,
+ sanitized_Request,
)
@@ -41,7 +39,7 @@ class HotNewHipHopIE(InfoExtractor):
('mediaType', 's'),
('mediaId', video_id),
])
- r = compat_urllib_request.Request(
+ r = sanitized_Request(
'http://www.hotnewhiphop.com/ajax/media/getActions/', data=reqdata)
r.add_header('Content-Type', 'application/x-www-form-urlencoded')
mkd = self._download_json(
diff --git a/youtube_dl/extractor/howcast.py b/youtube_dl/extractor/howcast.py
index 3f7d6666c..16677f179 100644
--- a/youtube_dl/extractor/howcast.py
+++ b/youtube_dl/extractor/howcast.py
@@ -1,8 +1,7 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
+from ..utils import parse_iso8601
class HowcastIE(InfoExtractor):
@@ -13,29 +12,31 @@ class HowcastIE(InfoExtractor):
'info_dict': {
'id': '390161',
'ext': 'mp4',
- 'description': 'The square knot, also known as the reef knot, is one of the oldest, most basic knots to tie, and can be used in many different ways. Here\'s the proper way to tie a square knot.',
'title': 'How to Tie a Square Knot Properly',
- }
+ 'description': 'md5:dbe792e5f6f1489027027bf2eba188a3',
+ 'timestamp': 1276081287,
+ 'upload_date': '20100609',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
+ video_id = self._match_id(url)
- video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
- self.report_extraction(video_id)
-
- video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
- webpage, 'video URL')
-
- video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
- webpage, 'description', fatal=False)
+ embed_code = self._search_regex(
+ r'<iframe[^>]+src="[^"]+\bembed_code=([^\b]+)\b',
+ webpage, 'ooyala embed code')
return {
+ '_type': 'url_transparent',
+ 'ie_key': 'Ooyala',
+ 'url': 'ooyala:%s' % embed_code,
'id': video_id,
- 'url': video_url,
- 'title': self._og_search_title(webpage),
- 'description': video_description,
- 'thumbnail': self._og_search_thumbnail(webpage),
+ 'timestamp': parse_iso8601(self._html_search_meta(
+ 'article:published_time', webpage, 'timestamp')),
}
diff --git a/youtube_dl/extractor/howstuffworks.py b/youtube_dl/extractor/howstuffworks.py
index e97339121..663e6632a 100644
--- a/youtube_dl/extractor/howstuffworks.py
+++ b/youtube_dl/extractor/howstuffworks.py
@@ -10,7 +10,7 @@ from ..utils import (
class HowStuffWorksIE(InfoExtractor):
- _VALID_URL = r'https?://[\da-z-]+\.howstuffworks\.com/(?:[^/]+/)*\d+-(?P<id>.+?)-video\.htm'
+ _VALID_URL = r'https?://[\da-z-]+\.howstuffworks\.com/(?:[^/]+/)*(?:\d+-)?(?P<id>.+?)-video\.htm'
_TESTS = [
{
'url': 'http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm',
@@ -46,6 +46,10 @@ class HowStuffWorksIE(InfoExtractor):
'thumbnail': 're:^https?://.*\.jpg$',
},
},
+ {
+ 'url': 'http://shows.howstuffworks.com/stuff-to-blow-your-mind/optical-illusions-video.htm',
+ 'only_matching': True,
+ }
]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/hypem.py b/youtube_dl/extractor/hypem.py
index aa0724a02..cca3dd498 100644
--- a/youtube_dl/extractor/hypem.py
+++ b/youtube_dl/extractor/hypem.py
@@ -4,12 +4,10 @@ import json
import time
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
- compat_urllib_request,
-)
+from ..compat import compat_urllib_parse
from ..utils import (
ExtractorError,
+ sanitized_Request,
)
@@ -32,7 +30,7 @@ class HypemIE(InfoExtractor):
data = {'ax': 1, 'ts': time.time()}
data_encoded = compat_urllib_parse.urlencode(data)
complete_url = url + "?" + data_encoded
- request = compat_urllib_request.Request(complete_url)
+ request = sanitized_Request(complete_url)
response, urlh = self._download_webpage_handle(
request, track_id, 'Downloading webpage with the url')
cookie = urlh.headers.get('Set-Cookie', '')
@@ -52,7 +50,7 @@ class HypemIE(InfoExtractor):
title = track['song']
serve_url = "http://hypem.com/serve/source/%s/%s" % (track_id, key)
- request = compat_urllib_request.Request(
+ request = sanitized_Request(
serve_url, '', {'Content-Type': 'application/json'})
request.add_header('cookie', cookie)
song_data = self._download_json(request, track_id, 'Downloading metadata')
diff --git a/youtube_dl/extractor/iconosquare.py b/youtube_dl/extractor/iconosquare.py
index 370e86e5a..a39f422e9 100644
--- a/youtube_dl/extractor/iconosquare.py
+++ b/youtube_dl/extractor/iconosquare.py
@@ -1,36 +1,85 @@
from __future__ import unicode_literals
from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ get_element_by_id,
+ remove_end,
+)
class IconosquareIE(InfoExtractor):
- _VALID_URL = r'https?://(www\.)?(?:iconosquare\.com|statigr\.am)/p/(?P<id>[^/]+)'
+ _VALID_URL = r'https?://(?:www\.)?(?:iconosquare\.com|statigr\.am)/p/(?P<id>[^/]+)'
_TEST = {
'url': 'http://statigr.am/p/522207370455279102_24101272',
'md5': '6eb93b882a3ded7c378ee1d6884b1814',
'info_dict': {
'id': '522207370455279102_24101272',
'ext': 'mp4',
- 'uploader_id': 'aguynamedpatrick',
'title': 'Instagram photo by @aguynamedpatrick (Patrick Janelle)',
'description': 'md5:644406a9ec27457ed7aa7a9ebcd4ce3d',
+ 'timestamp': 1376471991,
+ 'upload_date': '20130814',
+ 'uploader': 'aguynamedpatrick',
+ 'uploader_id': '24101272',
+ 'comment_count': int,
+ 'like_count': int,
},
}
def _real_extract(self, url):
video_id = self._match_id(url)
+
webpage = self._download_webpage(url, video_id)
- title = self._html_search_regex(
- r'<title>(.+?)(?: *\(Videos?\))? \| (?:Iconosquare|Statigram)</title>',
- webpage, 'title')
- uploader_id = self._html_search_regex(
- r'@([^ ]+)', title, 'uploader name', fatal=False)
+
+ media = self._parse_json(
+ get_element_by_id('mediaJson', webpage),
+ video_id)
+
+ formats = [{
+ 'url': f['url'],
+ 'format_id': format_id,
+ 'width': int_or_none(f.get('width')),
+ 'height': int_or_none(f.get('height'))
+ } for format_id, f in media['videos'].items()]
+ self._sort_formats(formats)
+
+ title = remove_end(self._og_search_title(webpage), ' - via Iconosquare')
+
+ timestamp = int_or_none(media.get('created_time') or media.get('caption', {}).get('created_time'))
+ description = media.get('caption', {}).get('text')
+
+ uploader = media.get('user', {}).get('username')
+ uploader_id = media.get('user', {}).get('id')
+
+ comment_count = int_or_none(media.get('comments', {}).get('count'))
+ like_count = int_or_none(media.get('likes', {}).get('count'))
+
+ thumbnails = [{
+ 'url': t['url'],
+ 'id': thumbnail_id,
+ 'width': int_or_none(t.get('width')),
+ 'height': int_or_none(t.get('height'))
+ } for thumbnail_id, t in media.get('images', {}).items()]
+
+ comments = [{
+ 'id': comment.get('id'),
+ 'text': comment['text'],
+ 'timestamp': int_or_none(comment.get('created_time')),
+ 'author': comment.get('from', {}).get('full_name'),
+ 'author_id': comment.get('from', {}).get('username'),
+ } for comment in media.get('comments', {}).get('data', []) if 'text' in comment]
return {
'id': video_id,
- 'url': self._og_search_video_url(webpage),
'title': title,
- 'description': self._og_search_description(webpage),
- 'thumbnail': self._og_search_thumbnail(webpage),
- 'uploader_id': uploader_id
+ 'description': description,
+ 'thumbnails': thumbnails,
+ 'timestamp': timestamp,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'comment_count': comment_count,
+ 'like_count': like_count,
+ 'formats': formats,
+ 'comments': comments,
}
diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py
index 3db668cd0..bf2d2041b 100644
--- a/youtube_dl/extractor/ign.py
+++ b/youtube_dl/extractor/ign.py
@@ -34,6 +34,9 @@ class IGNIE(InfoExtractor):
},
{
'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind',
+ 'info_dict': {
+ 'id': '100-little-things-in-gta-5-that-will-blow-your-mind',
+ },
'playlist': [
{
'info_dict': {
@@ -58,7 +61,7 @@ class IGNIE(InfoExtractor):
},
{
'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch',
- 'md5': '4e9a0bda1e5eebd31ddcf86ec0b9b3c7',
+ 'md5': '618fedb9c901fd086f6f093564ef8558',
'info_dict': {
'id': '078fdd005f6d3c02f63d795faa1b984f',
'ext': 'mp4',
@@ -74,10 +77,10 @@ class IGNIE(InfoExtractor):
def _find_video_id(self, webpage):
res_id = [
r'"video_id"\s*:\s*"(.*?)"',
+ r'class="hero-poster[^"]*?"[^>]*id="(.+?)"',
r'data-video-id="(.+?)"',
r'<object id="vid_(.+?)"',
r'<meta name="og:image" content=".*/(.+?)-(.+?)/.+.jpg"',
- r'class="hero-poster[^"]*?"[^>]*id="(.+?)"',
]
return self._search_regex(res_id, webpage, 'video id')
diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py
index f29df36b5..02e1e428e 100644
--- a/youtube_dl/extractor/imdb.py
+++ b/youtube_dl/extractor/imdb.py
@@ -4,8 +4,8 @@ import re
import json
from .common import InfoExtractor
-from ..compat import (
- compat_urlparse,
+from ..utils import (
+ qualities,
)
@@ -30,24 +30,33 @@ class ImdbIE(InfoExtractor):
descr = self._html_search_regex(
r'(?s)<span itemprop="description">(.*?)</span>',
webpage, 'description', fatal=False)
- available_formats = re.findall(
- r'case \'(?P<f_id>.*?)\' :$\s+url = \'(?P<path>.*?)\'', webpage,
- flags=re.MULTILINE)
+ player_url = 'http://www.imdb.com/video/imdb/vi%s/imdb/single' % video_id
+ player_page = self._download_webpage(
+ player_url, video_id, 'Downloading player page')
+ # the player page contains the info for the default format, we have to
+ # fetch other pages for the rest of the formats
+ extra_formats = re.findall(r'href="(?P<url>%s.*?)".*?>(?P<name>.*?)<' % re.escape(player_url), player_page)
+ format_pages = [
+ self._download_webpage(
+ f_url, video_id, 'Downloading info for %s format' % f_name)
+ for f_url, f_name in extra_formats]
+ format_pages.append(player_page)
+
+ quality = qualities(['SD', '480p', '720p'])
formats = []
- for f_id, f_path in available_formats:
- f_path = f_path.strip()
- format_page = self._download_webpage(
- compat_urlparse.urljoin(url, f_path),
- 'Downloading info for %s format' % f_id)
+ for format_page in format_pages:
json_data = self._search_regex(
r'<script[^>]+class="imdb-player-data"[^>]*?>(.*?)</script>',
format_page, 'json data', flags=re.DOTALL)
info = json.loads(json_data)
format_info = info['videoPlayerObject']['video']
+ f_id = format_info['ffname']
formats.append({
'format_id': f_id,
- 'url': format_info['url'],
+ 'url': format_info['videoInfoList'][0]['videoUrl'],
+ 'quality': quality(f_id),
})
+ self._sort_formats(formats)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py
new file mode 100644
index 000000000..70c8ca64e
--- /dev/null
+++ b/youtube_dl/extractor/imgur.py
@@ -0,0 +1,124 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+ int_or_none,
+ js_to_json,
+ mimetype2ext,
+ ExtractorError,
+)
+
+
+ class ImgurIE(InfoExtractor):
+     """Extract the <source> renditions (plus an optional GIF) of one imgur item."""
+     _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!gallery)(?P<id>[a-zA-Z0-9]+)'
+
+     _TESTS = [{
+         'url': 'https://i.imgur.com/A61SaA1.gifv',
+         'info_dict': {
+             'id': 'A61SaA1',
+             'ext': 'mp4',
+             'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$',
+             'description': 're:The origin of the Internet\'s most viral images$|The Internet\'s visual storytelling community\. Explore, share, and discuss the best visual stories the Internet has to offer\.$',
+         },
+     }, {
+         'url': 'https://imgur.com/A61SaA1',
+         'info_dict': {
+             'id': 'A61SaA1',
+             'ext': 'mp4',
+             'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$',
+             'description': 're:The origin of the Internet\'s most viral images$|The Internet\'s visual storytelling community\. Explore, share, and discuss the best visual stories the Internet has to offer\.$',
+         },
+     }]
+
+     def _real_extract(self, url):
+         video_id = self._match_id(url)
+         # Fetch the page addressed by the bare id (this drops a suffix such as '.gifv').
+         webpage = self._download_webpage(
+             compat_urlparse.urljoin(url, video_id), video_id)
+
+         # Dimensions are optional (fatal=False), so either value may end up None.
+         width = int_or_none(self._search_regex(
+             r'<param name="width" value="([0-9]+)"',
+             webpage, 'width', fatal=False))
+         height = int_or_none(self._search_regex(
+             r'<param name="height" value="([0-9]+)"',
+             webpage, 'height', fatal=False))
+
+         video_elements = self._search_regex(
+             r'(?s)<div class="video-elements">(.*?)</div>',
+             webpage, 'video elements', default=None)
+         if not video_elements:
+             raise ExtractorError(
+                 'No sources found for video %s. Maybe an image?' % video_id,
+                 expected=True)
+
+         formats = []
+         for m in re.finditer(r'<source\s+src="(?P<src>[^"]+)"\s+type="(?P<type>[^"]+)"', video_elements):
+             formats.append({
+                 'format_id': m.group('type').partition('/')[2],
+                 'url': self._proto_relative_url(m.group('src')),
+                 'ext': mimetype2ext(m.group('type')),
+                 'acodec': 'none',
+                 'width': width,
+                 'height': height,
+                 'http_headers': {'User-Agent': 'youtube-dl (like wget)'},
+             })
+
+         # The GIF is a best-effort extra: a missing gifUrl must not abort extraction.
+         gif_json = self._search_regex(
+             r'(?s)var\s+videoItem\s*=\s*(\{.*?\})',
+             webpage, 'GIF code', fatal=False)
+         gifd = self._parse_json(
+             gif_json, video_id, transform_source=js_to_json) if gif_json else {}
+         if gifd.get('gifUrl'):
+             formats.append({
+                 'format_id': 'gif',
+                 'preference': -10,
+                 'width': width,
+                 'height': height,
+                 'ext': 'gif',
+                 'acodec': 'none',
+                 'vcodec': 'gif',
+                 'container': 'gif',
+                 'url': self._proto_relative_url(gifd['gifUrl']),
+                 'filesize': int_or_none(gifd.get('size')),
+                 'http_headers': {'User-Agent': 'youtube-dl (like wget)'},
+             })
+
+         self._sort_formats(formats)
+
+         return {
+             'id': video_id,
+             'formats': formats,
+             'description': self._og_search_description(webpage),
+             'title': self._og_search_title(webpage),
+         }
+
+
+ class ImgurAlbumIE(InfoExtractor):
+     """Turn an imgur gallery URL into a playlist of per-image imgur URLs."""
+     _VALID_URL = r'https?://(?:i\.)?imgur\.com/gallery/(?P<id>[a-zA-Z0-9]+)'
+     _TEST = {
+         'url': 'http://imgur.com/gallery/Q95ko',
+         'info_dict': {
+             'id': 'Q95ko',
+         },
+         'playlist_count': 25,
+     }
+
+     def _real_extract(self, url):
+         album_id = self._match_id(url)
+         # The image list is read from the gallery's album_images JSON document.
+         album_data = self._download_json(
+             'http://imgur.com/gallery/%s/album_images/hit.json?all=true' % album_id,
+             album_id)
+         entries = []
+         for image in album_data['data']['images']:
+             # An entry without a hash cannot be turned into an image page URL.
+             if image.get('hash'):
+                 entries.append(self.url_result('http://imgur.com/%s' % image['hash']))
+         return self.playlist_result(entries, album_id)
diff --git a/youtube_dl/extractor/ina.py b/youtube_dl/extractor/ina.py
index 0847074ee..65712abc2 100644
--- a/youtube_dl/extractor/ina.py
+++ b/youtube_dl/extractor/ina.py
@@ -7,7 +7,7 @@ from .common import InfoExtractor
class InaIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?ina\.fr/video/(?P<id>I?[A-Z0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?ina\.fr/video/(?P<id>I?[A-Z0-9]+)'
_TEST = {
'url': 'http://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html',
'md5': 'a667021bf2b41f8dc6049479d9bb38a3',
diff --git a/youtube_dl/extractor/indavideo.py b/youtube_dl/extractor/indavideo.py
new file mode 100644
index 000000000..12fb5e8e1
--- /dev/null
+++ b/youtube_dl/extractor/indavideo.py
@@ -0,0 +1,142 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ parse_age_limit,
+ parse_iso8601,
+)
+
+
+class IndavideoEmbedIE(InfoExtractor):
+    """Extract a video from an indavideo.hu player (embed) URL.
+
+    Matches the HTML player URLs, with or without the ``embed.`` host
+    prefix, and ``player.swf`` URLs that carry the video hash in a
+    ``vID``/``vid`` query parameter.
+    """
+
+    _VALID_URL = r'https?://(?:(?:embed\.)?indavideo\.hu/player/video/|assets\.indavideo\.hu/swf/player\.swf\?.*\b(?:v(?:ID|id))=)(?P<id>[\da-f]+)'
+    _TESTS = [{
+        'url': 'http://indavideo.hu/player/video/1bdc3c6d80/',
+        'md5': 'f79b009c66194acacd40712a6778acfa',
+        'info_dict': {
+            'id': '1837039',
+            'ext': 'mp4',
+            'title': 'Cicatánc',
+            'description': '',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'uploader': 'cukiajanlo',
+            'uploader_id': '83729',
+            'timestamp': 1439193826,
+            'upload_date': '20150810',
+            'duration': 72,
+            'age_limit': 0,
+            'tags': ['tánc', 'cica', 'cuki', 'cukiajanlo', 'newsroom'],
+        },
+    }, {
+        'url': 'http://embed.indavideo.hu/player/video/1bdc3c6d80?autostart=1&hide=1',
+        'only_matching': True,
+    }, {
+        'url': 'http://assets.indavideo.hu/swf/player.swf?v=fe25e500&vID=1bdc3c6d80&autostart=1&hide=1&i=1',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        """Query the player JSON endpoint for *url* and build the info dict.
+
+        Raises KeyError when the response lacks ``data`` or ``title``.
+        """
+        video_id = self._match_id(url)
+
+        video = self._download_json(
+            'http://amfphp.indavideo.hu/SYm0json.php/player.playerHandler.getVideoData/%s' % video_id,
+            video_id)['data']
+
+        title = video['title']
+
+        # Gather every advertised file URL.  'video_file' is only added when
+        # it is actually present: the previous check tested `video` itself
+        # (non-empty here, since video['title'] succeeded) and could
+        # therefore append None to the list.
+        candidates = list(video.get('video_files') or [])
+        video_file = video.get('video_file')
+        if video_file:
+            candidates.append(video_file)
+
+        # De-duplicate while preserving order, so the URL used to derive the
+        # prefix below is deterministic (a set would shuffle the entries).
+        video_urls = []
+        for candidate in candidates:
+            if candidate not in video_urls:
+                video_urls.append(candidate)
+
+        if video_urls:
+            # 'flv_files' entries are joined onto the directory part of the
+            # first known URL; without any known URL there is no prefix to
+            # resolve them against, so they are skipped instead of raising
+            # IndexError.
+            video_prefix = video_urls[0].rsplit('/', 1)[0]
+            for flv_file in video.get('flv_files') or []:
+                flv_url = '%s/%s' % (video_prefix, flv_file)
+                if flv_url not in video_urls:
+                    video_urls.append(flv_url)
+
+        formats = [{
+            'url': video_url,
+            # The height is taken from names ending in ".<digits>.mp4" and
+            # converted to an integer (None when the name does not match).
+            'height': int_or_none(self._search_regex(
+                r'\.(\d{3,4})\.mp4$', video_url, 'height', default=None)),
+        } for video_url in video_urls]
+        self._sort_formats(formats)
+
+        timestamp = video.get('date')
+        if timestamp:
+            # upload date is in CEST
+            # NOTE(review): a fixed +0200 offset is applied all year round;
+            # confirm whether winter-time (CET) dates need +0100.
+            timestamp = parse_iso8601(timestamp + ' +0200', ' ')
+
+        thumbnails = [{
+            'url': self._proto_relative_url(thumbnail)
+        } for thumbnail in video.get('thumbnails', [])]
+
+        tags = [tag['title'] for tag in video.get('tags', [])]
+
+        return {
+            'id': video.get('id') or video_id,
+            'title': title,
+            'description': video.get('description'),
+            'thumbnails': thumbnails,
+            'uploader': video.get('user_name'),
+            'uploader_id': video.get('user_id'),
+            'timestamp': timestamp,
+            'duration': int_or_none(video.get('length')),
+            'age_limit': parse_age_limit(video.get('age_limit')),
+            'tags': tags,
+            'formats': formats,
+        }
+
+
+class IndavideoIE(InfoExtractor):
+    """Resolve an indavideo.hu video page to its embedded player.
+
+    The page is only used to locate the ``video_src`` link; the actual
+    extraction is delegated to ``IndavideoEmbedIE`` through a
+    ``url_transparent`` result.
+    """
+
+    # The optional sub-domain part must not cross a '/', '?' or '#';
+    # otherwise URLs of unrelated hosts whose path merely contains
+    # ".indavideo.hu/video/" would be matched as well.
+    _VALID_URL = r'https?://(?:[^/?#]+\.)?indavideo\.hu/video/(?P<id>[^/#?]+)'
+    _TESTS = [{
+        'url': 'http://indavideo.hu/video/Vicces_cica_1',
+        'md5': '8c82244ba85d2a2310275b318eb51eac',
+        'info_dict': {
+            'id': '1335611',
+            'display_id': 'Vicces_cica_1',
+            'ext': 'mp4',
+            'title': 'Vicces cica',
+            'description': 'Játszik a tablettel. :D',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'uploader': 'Jet_Pack',
+            'uploader_id': '491217',
+            'timestamp': 1390821212,
+            'upload_date': '20140127',
+            'duration': 7,
+            'age_limit': 0,
+            'tags': ['vicces', 'macska', 'cica', 'ügyes', 'nevetés', 'játszik', 'Cukiság', 'Jet_Pack'],
+        },
+    }, {
+        'url': 'http://index.indavideo.hu/video/2015_0728_beregszasz',
+        'only_matching': True,
+    }, {
+        'url': 'http://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko',
+        'only_matching': True,
+    }, {
+        'url': 'http://erotika.indavideo.hu/video/Amator_tini_punci',
+        'only_matching': True,
+    }, {
+        'url': 'http://film.indavideo.hu/video/f_hrom_nagymamm_volt',
+        'only_matching': True,
+    }, {
+        'url': 'http://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        """Return a url_transparent result pointing at the page's embed URL."""
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+        # The player URL is published in a <link rel="video_src"> element.
+        embed_url = self._search_regex(
+            r'<link[^>]+rel="video_src"[^>]+href="(.+?)"', webpage, 'embed url')
+
+        return {
+            '_type': 'url_transparent',
+            'ie_key': 'IndavideoEmbed',
+            'url': embed_url,
+            'display_id': display_id,
+        }
diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py
index f25f43664..71cfd12c5 100644
--- a/youtube_dl/extractor/infoq.py
+++ b/youtube_dl/extractor/infoq.py
@@ -4,14 +4,15 @@ import base64
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse,
+ compat_urllib_parse_unquote,
+ compat_urlparse,
)
class InfoQIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?infoq\.com/[^/]+/(?P<id>[^/]+)$'
+ _VALID_URL = r'https?://(?:www\.)?infoq\.com/(?:[^/]+/)+(?P<id>[^/]+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things',
'md5': 'b5ca0e0a8c1fed93b0e65e48e462f9a2',
'info_dict': {
@@ -20,7 +21,10 @@ class InfoQIE(InfoExtractor):
'description': 'Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.',
'title': 'A Few of My Favorite [Python] Things',
},
- }
+ }, {
+ 'url': 'http://www.infoq.com/fr/presentations/changez-avis-sur-javascript',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -35,14 +39,14 @@ class InfoQIE(InfoExtractor):
# Extract video URL
encoded_id = self._search_regex(
r"jsclassref\s*=\s*'([^']*)'", webpage, 'encoded id')
- real_id = compat_urllib_parse.unquote(base64.b64decode(encoded_id.encode('ascii')).decode('utf-8'))
+ real_id = compat_urllib_parse_unquote(base64.b64decode(encoded_id.encode('ascii')).decode('utf-8'))
playpath = 'mp4:' + real_id
video_filename = playpath.split('/')[-1]
video_id, extension = video_filename.split('.')
http_base = self._search_regex(
- r'EXPRESSINSTALL_SWF\s*=\s*"(https?://[^/"]+/)', webpage,
+ r'EXPRESSINSTALL_SWF\s*=\s*[^"]*"((?:https?:)?//[^/"]+/)', webpage,
'HTTP base URL')
formats = [{
@@ -52,7 +56,7 @@ class InfoQIE(InfoExtractor):
'play_path': playpath,
}, {
'format_id': 'http',
- 'url': http_base + real_id,
+ 'url': compat_urlparse.urljoin(url, http_base) + real_id,
}]
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py
index b020e2621..c158f2064 100644
--- a/youtube_dl/extractor/instagram.py
+++ b/youtube_dl/extractor/instagram.py
@@ -5,13 +5,14 @@ import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
+ limit_length,
)
class InstagramIE(InfoExtractor):
- _VALID_URL = r'http://instagram\.com/p/(?P<id>.*?)/'
- _TEST = {
- 'url': 'http://instagram.com/p/aye83DjauH/?foo=bar#abc',
+ _VALID_URL = r'https?://(?:www\.)?instagram\.com/p/(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc',
'md5': '0d2da106a9d2631273e192b372806516',
'info_dict': {
'id': 'aye83DjauH',
@@ -20,11 +21,14 @@ class InstagramIE(InfoExtractor):
'title': 'Video by naomipq',
'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
}
- }
+ }, {
+ 'url': 'https://instagram.com/p/-Cmh1cukG2/',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
+
webpage = self._download_webpage(url, video_id)
uploader_id = self._search_regex(r'"owner":{"username":"(.+?)"',
webpage, 'uploader id', fatal=False)
@@ -43,11 +47,11 @@ class InstagramIE(InfoExtractor):
class InstagramUserIE(InfoExtractor):
- _VALID_URL = r'http://instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])'
+ _VALID_URL = r'https://instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])'
IE_DESC = 'Instagram user profile'
IE_NAME = 'instagram:user'
_TEST = {
- 'url': 'http://instagram.com/porsche',
+ 'url': 'https://instagram.com/porsche',
'info_dict': {
'id': 'porsche',
'title': 'porsche',
@@ -102,11 +106,13 @@ class InstagramUserIE(InfoExtractor):
thumbnails_el = it.get('images', {})
thumbnail = thumbnails_el.get('thumbnail', {}).get('url')
- title = it.get('caption', {}).get('text', it['id'])
+ # In some cases caption is null, which corresponds to None
+ # in python. As a result, it.get('caption', {}) gives None
+ title = (it.get('caption') or {}).get('text', it['id'])
entries.append({
'id': it['id'],
- 'title': title,
+ 'title': limit_length(title, 80),
'formats': formats,
'thumbnail': thumbnail,
'webpage_url': it.get('link'),
diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py
index 8529bedfc..36baf3245 100644
--- a/youtube_dl/extractor/iprima.py
+++ b/youtube_dl/extractor/iprima.py
@@ -6,16 +6,15 @@ from random import random
from math import floor
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_request,
-)
from ..utils import (
ExtractorError,
+ remove_end,
+ sanitized_Request,
)
class IPrimaIE(InfoExtractor):
- _VALID_URL = r'https?://play\.iprima\.cz/[^?#]+/(?P<id>[^?#]+)'
+ _VALID_URL = r'https?://play\.iprima\.cz/(?:[^/]+/)*(?P<id>[^?#]+)'
_TESTS = [{
'url': 'http://play.iprima.cz/particka/particka-92',
@@ -23,7 +22,7 @@ class IPrimaIE(InfoExtractor):
'id': '39152',
'ext': 'flv',
'title': 'Partička (92)',
- 'description': 'md5:3740fda51464da35a2d4d0670b8e4fd6',
+ 'description': 'md5:74e9617e51bca67c3ecfb2c6f9766f45',
'thumbnail': 'http://play.iprima.cz/sites/default/files/image_crops/image_620x349/3/491483_particka-92_image_620x349.jpg',
},
'params': {
@@ -35,13 +34,14 @@ class IPrimaIE(InfoExtractor):
'id': '9718337',
'ext': 'flv',
'title': 'Tchibo Partička - Jarní móda',
- 'description': 'md5:589f8f59f414220621ff8882eb3ce7be',
'thumbnail': 're:^http:.*\.jpg$',
},
'params': {
'skip_download': True, # requires rtmpdump
},
- 'skip': 'Do not have permission to access this page',
+ }, {
+ 'url': 'http://play.iprima.cz/zpravy-ftv-prima-2752015',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -59,7 +59,7 @@ class IPrimaIE(InfoExtractor):
(floor(random() * 1073741824), floor(random() * 1073741824))
)
- req = compat_urllib_request.Request(player_url)
+ req = sanitized_Request(player_url)
req.add_header('Referer', url)
playerpage = self._download_webpage(req, video_id)
@@ -102,8 +102,10 @@ class IPrimaIE(InfoExtractor):
return {
'id': real_id,
- 'title': self._og_search_title(webpage),
+ 'title': remove_end(self._og_search_title(webpage), ' | Prima PLAY'),
'thumbnail': self._og_search_thumbnail(webpage),
'formats': formats,
- 'description': self._og_search_description(webpage),
+ 'description': self._search_regex(
+ r'<p[^>]+itemprop="description"[^>]*>([^<]+)',
+ webpage, 'description', default=None),
}
diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py
new file mode 100644
index 000000000..2df1da3f0
--- /dev/null
+++ b/youtube_dl/extractor/iqiyi.py
@@ -0,0 +1,279 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import hashlib
+import math
+import random
+import time
+import uuid
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse
+from ..utils import ExtractorError
+
+
+class IqiyiIE(InfoExtractor):
+    """Extractor for iqiyi.com video pages.
+
+    A video may be split into several segments; in that case a
+    ``multi_video`` result with one entry per segment is returned.
+    """
+
+    IE_NAME = 'iqiyi'
+    IE_DESC = '爱奇艺'
+
+    # The dot in the host name is escaped so that it only matches a
+    # literal '.', not an arbitrary character.
+    _VALID_URL = r'http://(?:www\.)iqiyi\.com/v_.+?\.html'
+
+    _TESTS = [{
+        'url': 'http://www.iqiyi.com/v_19rrojlavg.html',
+        'md5': '2cb594dc2781e6c941a110d8f358118b',
+        'info_dict': {
+            'id': '9c1fb1b99d192b21c559e5a1a2cb3c73',
+            'title': '美国德州空中惊现奇异云团 酷似UFO',
+            'ext': 'f4v',
+        }
+    }, {
+        'url': 'http://www.iqiyi.com/v_19rrhnnclk.html',
+        'info_dict': {
+            'id': 'e3f585b550a280af23c98b6cb2be19fb',
+            'title': '名侦探柯南第752集',
+        },
+        'playlist': [{
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part1',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part2',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part3',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part4',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part5',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part6',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part7',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }, {
+            'info_dict': {
+                'id': 'e3f585b550a280af23c98b6cb2be19fb_part8',
+                'ext': 'f4v',
+                'title': '名侦探柯南第752集',
+            },
+        }],
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    # (bid, format_id) pairs; the bid is also used as format preference.
+    _FORMATS_MAP = [
+        ('1', 'h6'),
+        ('2', 'h5'),
+        ('3', 'h4'),
+        ('4', 'h3'),
+        ('5', 'h2'),
+        ('10', 'h1'),
+    ]
+
+    @staticmethod
+    def md5_text(text):
+        """Return the hex MD5 digest of the UTF-8 encoding of *text*."""
+        return hashlib.md5(text.encode('utf-8')).hexdigest()
+
+    def construct_video_urls(self, data, video_id, _uuid):
+        """Resolve the download URL of every segment of every known format.
+
+        Returns a dict mapping format_id to a list of
+        ``(video_url, filesize)`` tuples, one per segment.
+        """
+        def do_xor(x, y):
+            # The XOR constant depends on the position modulo 3.
+            a = y % 3
+            if a == 1:
+                return x ^ 121
+            if a == 2:
+                return x ^ 72
+            return x ^ 103
+
+        def get_encode_code(encoded):
+            # *encoded* is a '-'-separated list of hex numbers.  Each number
+            # is XOR-ed with a constant chosen by its distance from the end
+            # of the list, and the resulting characters are reversed.
+            parts = encoded.split('-')
+            count = len(parts)
+            decoded = ''
+            for pos, part in enumerate(parts):
+                decoded += chr(do_xor(int(part, 16), count - 1 - pos))
+            return decoded[::-1]
+
+        def get_path_key(x, format_id, segment_index):
+            mg = ')(*&^flash@#$%a'
+            tm = self._download_json(
+                'http://data.video.qiyi.com/t?tn=' + str(random.random()), video_id,
+                note='Download path key of segment %d for format %s' % (segment_index + 1, format_id)
+            )['t']
+            # The server time is bucketed into 600 unit wide slots before
+            # being hashed together with the secret and the file name.
+            t = str(int(math.floor(int(tm) / (600.0))))
+            return self.md5_text(t + mg + x)
+
+        video_urls_dict = {}
+        for format_item in data['vp']['tkl'][0]['vs']:
+            if not 0 < int(format_item['bid']) <= 10:
+                continue
+            format_id = self.get_format(format_item['bid'])
+            if format_id is None:
+                # Bids inside the accepted range but missing from
+                # _FORMATS_MAP (6-9) have no format id.  Skip them here;
+                # otherwise the later int(self.get_bid(None)) call in
+                # _real_extract raises TypeError.
+                continue
+
+            video_urls = []
+
+            video_urls_info = format_item['fs']
+            if not format_item['fs'][0]['l'].startswith('/'):
+                t = get_encode_code(format_item['fs'][0]['l'])
+                if t.endswith('mp4'):
+                    video_urls_info = format_item['flvs']
+
+            for segment_index, segment in enumerate(video_urls_info):
+                vl = segment['l']
+                if not vl.startswith('/'):
+                    vl = get_encode_code(vl)
+                key = get_path_key(
+                    vl.split('/')[-1].split('.')[0], format_id, segment_index)
+                filesize = segment['b']
+                # The key is inserted as the second to last path component
+                # of the base URL.
+                base_url = data['vp']['du'].split('/')
+                base_url.insert(-1, key)
+                base_url = '/'.join(base_url)
+                param = {
+                    'su': _uuid,
+                    'qyid': uuid.uuid4().hex,
+                    'client': '',
+                    'z': '',
+                    'bt': '',
+                    'ct': '',
+                    'tn': str(int(time.time()))
+                }
+                api_video_url = base_url + vl + '?' + \
+                    compat_urllib_parse.urlencode(param)
+                js = self._download_json(
+                    api_video_url, video_id,
+                    note='Download video info of segment %d for format %s' % (segment_index + 1, format_id))
+                video_url = js['l']
+                video_urls.append(
+                    (video_url, filesize))
+
+            video_urls_dict[format_id] = video_urls
+        return video_urls_dict
+
+    def get_format(self, bid):
+        """Return the format id mapped to *bid*, or None when unknown."""
+        matched_format_ids = [_format_id for _bid, _format_id in self._FORMATS_MAP if _bid == str(bid)]
+        return matched_format_ids[0] if matched_format_ids else None
+
+    def get_bid(self, format_id):
+        """Return the bid (as a string) mapped to *format_id*, or None."""
+        matched_bids = [_bid for _bid, _format_id in self._FORMATS_MAP if _format_id == format_id]
+        return matched_bids[0] if matched_bids else None
+
+    def get_raw_data(self, tvid, video_id, enc_key, _uuid):
+        """Download and return the JSON answer of the 'vms' API for the video."""
+        tm = str(int(time.time()))
+        tail = tm + tvid
+        param = {
+            'key': 'fvip',
+            'src': self.md5_text('youtube-dl'),
+            'tvId': tvid,
+            'vid': video_id,
+            'vinfo': 1,
+            'tm': tm,
+            'enc': self.md5_text(enc_key + tail),
+            'qyid': _uuid,
+            'tn': random.random(),
+            'um': 0,
+            'authkey': self.md5_text(self.md5_text('') + tail),
+        }
+
+        api_url = 'http://cache.video.qiyi.com/vms' + '?' + \
+            compat_urllib_parse.urlencode(param)
+        raw_data = self._download_json(api_url, video_id)
+        return raw_data
+
+    def get_enc_key(self, swf_url, video_id):
+        """Return the key used to sign API requests (currently hard-coded)."""
+        # TODO: automatic key extraction
+        # last update at 2015-10-22 for Zombie::bite
+        # '7223c67061dbea1259d0ceb44f44b6d62288f4f80c972170de5201d2321060270e05'[2:66][0::2]
+        enc_key = '2c76de15dcb44bd28ff0927d50d31620'
+        return enc_key
+
+    def _real_extract(self, url):
+        """Extract the video (or multi-part video) found at *url*.
+
+        Raises ExtractorError when the API reports an error code or does
+        not return any stream list.
+        """
+        webpage = self._download_webpage(
+            url, 'temp_id', note='download video page')
+        tvid = self._search_regex(
+            r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid')
+        video_id = self._search_regex(
+            r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id')
+        swf_url = self._search_regex(
+            r'(http://[^\'"]+MainPlayer[^.]+\.swf)', webpage, 'swf player URL')
+        _uuid = uuid.uuid4().hex
+
+        enc_key = self.get_enc_key(swf_url, video_id)
+
+        raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid)
+
+        if raw_data['code'] != 'A000000':
+            raise ExtractorError('Unable to load data. Error code: ' + raw_data['code'])
+
+        if not raw_data['data']['vp']['tkl']:
+            raise ExtractorError('iQiyi VIP videos are not supported')
+
+        data = raw_data['data']
+
+        title = data['vi']['vn']
+
+        # generate video_urls_dict
+        video_urls_dict = self.construct_video_urls(
+            data, video_id, _uuid)
+
+        # construct info: entry i collects segment i of every format
+        entries = []
+        for format_id, video_urls in video_urls_dict.items():
+            for i, video_url_info in enumerate(video_urls):
+                if len(entries) < i + 1:
+                    entries.append({'formats': []})
+                entries[i]['formats'].append(
+                    {
+                        'url': video_url_info[0],
+                        'filesize': video_url_info[-1],
+                        'format_id': format_id,
+                        'preference': int(self.get_bid(format_id))
+                    }
+                )
+
+        for i, entry in enumerate(entries):
+            self._sort_formats(entry['formats'])
+            entry.update(
+                {
+                    'id': '%s_part%d' % (video_id, i + 1),
+                    'title': title,
+                }
+            )
+
+        if len(entries) > 1:
+            info = {
+                '_type': 'multi_video',
+                'id': video_id,
+                'title': title,
+                'entries': entries,
+            }
+        else:
+            # A single segment is returned as a plain video carrying the
+            # bare video id instead of the "_part1" suffix.
+            info = entries[0]
+            info['id'] = video_id
+            info['title'] = title
+
+        return info
diff --git a/youtube_dl/extractor/ir90tv.py b/youtube_dl/extractor/ir90tv.py
new file mode 100644
index 000000000..214bcd5b5
--- /dev/null
+++ b/youtube_dl/extractor/ir90tv.py
@@ -0,0 +1,42 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import remove_start
+
+
+class Ir90TvIE(InfoExtractor):
+    """Extractor for video pages on 90tv.ir."""
+
+    _VALID_URL = r'https?://(?:www\.)?90tv\.ir/video/(?P<id>[0-9]+)/.*'
+    _TESTS = [{
+        'url': 'http://90tv.ir/video/95719/%D8%B4%D8%A7%DB%8C%D8%B9%D8%A7%D8%AA-%D9%86%D9%82%D9%84-%D9%88-%D8%A7%D9%86%D8%AA%D9%82%D8%A7%D9%84%D8%A7%D8%AA-%D9%85%D9%87%D9%85-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7-940218',
+        'md5': '411dbd94891381960cb9e13daa47a869',
+        'info_dict': {
+            'id': '95719',
+            'ext': 'mp4',
+            'title': 'شایعات نقل و انتقالات مهم فوتبال اروپا 94/02/18',
+            'thumbnail': 're:^https?://.*\.jpg$',
+        }
+    }, {
+        'url': 'http://www.90tv.ir/video/95719/%D8%B4%D8%A7%DB%8C%D8%B9%D8%A7%D8%AA-%D9%86%D9%82%D9%84-%D9%88-%D8%A7%D9%86%D8%AA%D9%82%D8%A7%D9%84%D8%A7%D8%AA-%D9%85%D9%87%D9%85-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7-940218',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        """Scrape title, media URL and poster image from the page at *url*."""
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        # The <title> element carries a site prefix that is stripped off.
+        page_title = self._html_search_regex(
+            r'<title>([^<]+)</title>', webpage, 'title')
+        title = remove_start(page_title, '90tv.ir :: ')
+
+        media_url = self._search_regex(
+            r'<source[^>]+src="([^"]+)"', webpage, 'video url')
+
+        # The poster is optional: a missing one does not abort extraction.
+        poster = self._search_regex(
+            r'poster="([^"]+)"', webpage, 'thumbnail url', fatal=False)
+
+        info = {
+            'url': media_url,
+            'id': video_id,
+            'title': title,
+            'video_url': media_url,
+            'thumbnail': poster,
+        }
+        return info
diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py
index e82594444..029878d24 100644
--- a/youtube_dl/extractor/ivi.py
+++ b/youtube_dl/extractor/ivi.py
@@ -5,11 +5,9 @@ import re
import json
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_request,
-)
from ..utils import (
ExtractorError,
+ sanitized_Request,
)
@@ -78,7 +76,7 @@ class IviIE(InfoExtractor):
]
}
- request = compat_urllib_request.Request(api_url, json.dumps(data))
+ request = sanitized_Request(api_url, json.dumps(data))
video_json_page = self._download_webpage(
request, video_id, 'Downloading video JSON')
diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py
index 99a1361f8..bc226fa67 100644
--- a/youtube_dl/extractor/izlesene.py
+++ b/youtube_dl/extractor/izlesene.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
from ..utils import (
determine_ext,
float_or_none,
@@ -30,7 +31,7 @@ class IzleseneIE(InfoExtractor):
'description': 'md5:253753e2655dde93f59f74b572454f6d',
'thumbnail': 're:^http://.*\.jpg',
'uploader_id': 'pelikzzle',
- 'timestamp': 1404302298,
+ 'timestamp': int,
'upload_date': '20140702',
'duration': 95.395,
'age_limit': 0,
@@ -46,7 +47,7 @@ class IzleseneIE(InfoExtractor):
'description': 'Tarkan Dortmund 2006 Konseri',
'thumbnail': 're:^http://.*\.jpg',
'uploader_id': 'parlayankiz',
- 'timestamp': 1163322193,
+ 'timestamp': int,
'upload_date': '20061112',
'duration': 253.666,
'age_limit': 0,
@@ -67,9 +68,9 @@ class IzleseneIE(InfoExtractor):
uploader = self._html_search_regex(
r"adduserUsername\s*=\s*'([^']+)';",
- webpage, 'uploader', fatal=False, default='')
+ webpage, 'uploader', fatal=False)
timestamp = parse_iso8601(self._html_search_meta(
- 'uploadDate', webpage, 'upload date', fatal=False))
+ 'uploadDate', webpage, 'upload date'))
duration = float_or_none(self._html_search_regex(
r'"videoduration"\s*:\s*"([^"]+)"',
@@ -86,8 +87,7 @@ class IzleseneIE(InfoExtractor):
# Might be empty for some videos.
streams = self._html_search_regex(
- r'"qualitylevel"\s*:\s*"([^"]+)"',
- webpage, 'streams', fatal=False, default='')
+ r'"qualitylevel"\s*:\s*"([^"]+)"', webpage, 'streams', default='')
formats = []
if streams:
@@ -95,15 +95,15 @@ class IzleseneIE(InfoExtractor):
quality, url = re.search(r'\[(\w+)\](.+)', stream).groups()
formats.append({
'format_id': '%sp' % quality if quality else 'sd',
- 'url': url,
+ 'url': compat_urllib_parse_unquote(url),
'ext': ext,
})
else:
stream_url = self._search_regex(
- r'"streamurl"\s?:\s?"([^"]+)"', webpage, 'stream URL')
+ r'"streamurl"\s*:\s*"([^"]+)"', webpage, 'stream URL')
formats.append({
'format_id': 'sd',
- 'url': stream_url,
+ 'url': compat_urllib_parse_unquote(stream_url),
'ext': ext,
})
diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py
index 8094cc2e4..eef7daa29 100644
--- a/youtube_dl/extractor/jeuxvideo.py
+++ b/youtube_dl/extractor/jeuxvideo.py
@@ -2,50 +2,55 @@
from __future__ import unicode_literals
-import json
import re
from .common import InfoExtractor
class JeuxVideoIE(InfoExtractor):
- _VALID_URL = r'http://.*?\.jeuxvideo\.com/.*/(.*?)-\d+\.htm'
+ _VALID_URL = r'http://.*?\.jeuxvideo\.com/.*/(.*?)\.htm'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.jeuxvideo.com/reportages-videos-jeux/0004/00046170/tearaway-playstation-vita-gc-2013-tearaway-nous-presente-ses-papiers-d-identite-00115182.htm',
'md5': '046e491afb32a8aaac1f44dd4ddd54ee',
'info_dict': {
- 'id': '5182',
+ 'id': '114765',
'ext': 'mp4',
- 'title': 'GC 2013 : Tearaway nous présente ses papiers d\'identité',
- 'description': 'Lorsque les développeurs de LittleBigPlanet proposent un nouveau titre, on ne peut que s\'attendre à un résultat original et fort attrayant.\n',
+ 'title': 'Tearaway : GC 2013 : Tearaway nous présente ses papiers d\'identité',
+ 'description': 'Lorsque les développeurs de LittleBigPlanet proposent un nouveau titre, on ne peut que s\'attendre à un résultat original et fort attrayant.',
},
- }
+ }, {
+ 'url': 'http://www.jeuxvideo.com/videos/chroniques/434220/l-histoire-du-jeu-video-la-saturn.htm',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
title = mobj.group(1)
webpage = self._download_webpage(url, title)
- xml_link = self._html_search_regex(
- r'<param name="flashvars" value="config=(.*?)" />',
+ title = self._html_search_meta('name', webpage) or self._og_search_title(webpage)
+ config_url = self._html_search_regex(
+ r'data-src="(/contenu/medias/video.php.*?)"',
webpage, 'config URL')
+ config_url = 'http://www.jeuxvideo.com' + config_url
video_id = self._search_regex(
- r'http://www\.jeuxvideo\.com/config/\w+/\d+/(.*?)/\d+_player\.xml',
- xml_link, 'video ID')
+ r'id=(\d+)',
+ config_url, 'video ID')
- config = self._download_xml(
- xml_link, title, 'Downloading XML config')
- info_json = config.find('format.json').text
- info = json.loads(info_json)['versions'][0]
+ config = self._download_json(
+ config_url, title, 'Downloading JSON config')
- video_url = 'http://video720.jeuxvideo.com/' + info['file']
+ formats = [{
+ 'url': source['file'],
+ 'format_id': source['label'],
+ 'resolution': source['label'],
+ } for source in reversed(config['sources'])]
return {
'id': video_id,
- 'title': config.find('titre_video').text,
- 'ext': 'mp4',
- 'url': video_url,
+ 'title': title,
+ 'formats': formats,
'description': self._og_search_description(webpage),
- 'thumbnail': config.find('image').text,
+ 'thumbnail': config.get('image'),
}
diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py
new file mode 100644
index 000000000..583b1a5ad
--- /dev/null
+++ b/youtube_dl/extractor/kaltura.py
@@ -0,0 +1,176 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import base64
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse,
+ compat_urlparse,
+)
+from ..utils import (
+ clean_html,
+ ExtractorError,
+ int_or_none,
+ unsmuggle_url,
+)
+
+
+class KalturaIE(InfoExtractor):
+    """Extractor for Kaltura hosted media.
+
+    Accepts the internal ``kaltura:<partner_id>:<entry_id>`` scheme as well
+    as kaltura.com flash (kwidget) and html5 player URLs.
+    """
+
+    # The optional host prefix is a plain non-capturing group; it used to be
+    # written "(:?", which created a stray capturing group that also
+    # accepted a leading colon.
+    _VALID_URL = r'''(?x)
+                (?:
+                    kaltura:(?P<partner_id_s>\d+):(?P<id_s>[0-9a-z_]+)|
+                    https?://
+                        (?:(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/
+                        (?:
+                            (?:
+                                # flash player
+                                index\.php/kwidget/
+                                (?:[^/]+/)*?wid/_(?P<partner_id>\d+)/
+                                (?:[^/]+/)*?entry_id/(?P<id>[0-9a-z_]+)|
+                                # html5 player
+                                html5/html5lib/
+                                (?:[^/]+/)*?entry_id/(?P<id_html5>[0-9a-z_]+)
+                                .*\?.*\bwid=_(?P<partner_id_html5>\d+)
+                            )
+                        )
+                )
+                '''
+    _API_BASE = 'http://cdnapi.kaltura.com/api_v3/index.php?'
+    _TESTS = [
+        {
+            'url': 'kaltura:269692:1_1jc2y3e4',
+            'md5': '3adcbdb3dcc02d647539e53f284ba171',
+            'info_dict': {
+                'id': '1_1jc2y3e4',
+                'ext': 'mp4',
+                'title': 'Track 4',
+                'upload_date': '20131219',
+                'uploader_id': 'mlundberg@wolfgangsvault.com',
+                'description': 'The Allman Brothers Band, 12/16/1981',
+                'thumbnail': 're:^https?://.*/thumbnail/.*',
+                'timestamp': int,
+            },
+        },
+        {
+            'url': 'http://www.kaltura.com/index.php/kwidget/cache_st/1300318621/wid/_269692/uiconf_id/3873291/entry_id/1_1jc2y3e4',
+            'only_matching': True,
+        },
+        {
+            'url': 'https://cdnapisec.kaltura.com/index.php/kwidget/wid/_557781/uiconf_id/22845202/entry_id/1_plr1syf3',
+            'only_matching': True,
+        },
+        {
+            'url': 'https://cdnapisec.kaltura.com/html5/html5lib/v2.30.2/mwEmbedFrame.php/p/1337/uiconf_id/20540612/entry_id/1_sf5ovm7u?wid=_243342',
+            'only_matching': True,
+        }
+    ]
+
+    def _kaltura_api_call(self, video_id, actions, *args, **kwargs):
+        """Send *actions* to the Kaltura API and return the decoded JSON.
+
+        The first action supplies the base parameters; the parameters of
+        every following action are added with an ``<index>:`` key prefix.
+        Extra positional and keyword arguments are passed on to
+        ``_download_json``.
+
+        Raises ExtractorError when the (first) response object is a
+        KalturaAPIException.
+        """
+        # Work on a copy so that the caller's first action is not polluted
+        # with the prefixed parameters of the other actions.
+        params = dict(actions[0])
+        for i, a in enumerate(actions[1:], start=1):
+            for k, v in a.items():
+                params['%d:%s' % (i, k)] = v
+
+        query = compat_urllib_parse.urlencode(params)
+        url = self._API_BASE + query
+        data = self._download_json(url, video_id, *args, **kwargs)
+
+        # A single action yields one object, several actions yield a list.
+        status = data if len(actions) == 1 else data[0]
+        if status.get('objectType') == 'KalturaAPIException':
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, status['message']))
+
+        return data
+
+    def _get_kaltura_signature(self, video_id, partner_id):
+        """Start a widget session for *partner_id* and return its 'ks' value."""
+        actions = [{
+            'apiVersion': '3.1',
+            'expiry': 86400,
+            'format': 1,
+            'service': 'session',
+            'action': 'startWidgetSession',
+            'widgetId': '_%s' % partner_id,
+        }]
+        return self._kaltura_api_call(
+            video_id, actions, note='Downloading Kaltura signature')['ks']
+
+    def _get_video_info(self, video_id, partner_id):
+        """Fetch the base entry and its context data in one multirequest."""
+        signature = self._get_kaltura_signature(video_id, partner_id)
+        actions = [
+            {
+                'action': 'null',
+                'apiVersion': '3.1.5',
+                'clientTag': 'kdp:v3.8.5',
+                'format': 1,  # JSON, 2 = XML, 3 = PHP
+                'service': 'multirequest',
+                'ks': signature,
+            },
+            {
+                'action': 'get',
+                'entryId': video_id,
+                'service': 'baseentry',
+                'version': '-1',
+            },
+            {
+                'action': 'getContextData',
+                'contextDataParams:objectType': 'KalturaEntryContextDataParams',
+                'contextDataParams:referrer': 'http://www.kaltura.com/',
+                'contextDataParams:streamerType': 'http',
+                'entryId': video_id,
+                'service': 'baseentry',
+            },
+        ]
+        return self._kaltura_api_call(
+            video_id, actions, note='Downloading video info JSON')
+
+    def _real_extract(self, url):
+        """Build the info dict for the entry referenced by *url*."""
+        url, smuggled_data = unsmuggle_url(url, {})
+
+        # Each URL flavour uses its own pair of named groups; at most one
+        # pair is set for a given match.
+        mobj = re.match(self._VALID_URL, url)
+        partner_id = mobj.group('partner_id_s') or mobj.group('partner_id') or mobj.group('partner_id_html5')
+        entry_id = mobj.group('id_s') or mobj.group('id') or mobj.group('id_html5')
+
+        info, source_data = self._get_video_info(entry_id, partner_id)
+
+        source_url = smuggled_data.get('source_url')
+        if source_url:
+            # The referrer is "<scheme>://<netloc>" of the embedding page,
+            # base64 encoded.
+            referrer = base64.b64encode(
+                '://'.join(compat_urlparse.urlparse(source_url)[:2])
+                .encode('utf-8')).decode('utf-8')
+        else:
+            referrer = None
+
+        formats = []
+        for f in source_data['flavorAssets']:
+            video_url = '%s/flavorId/%s' % (info['dataUrl'], f['id'])
+            if referrer:
+                video_url += '?referrer=%s' % referrer
+            formats.append({
+                'format_id': '%(fileExt)s-%(bitrate)s' % f,
+                'ext': f.get('fileExt'),
+                'tbr': int_or_none(f['bitrate']),
+                'fps': int_or_none(f.get('frameRate')),
+                'filesize_approx': int_or_none(f.get('size'), invscale=1024),
+                'container': f.get('containerFormat'),
+                'vcodec': f.get('videoCodecId'),
+                'height': int_or_none(f.get('height')),
+                'width': int_or_none(f.get('width')),
+                'url': video_url,
+            })
+        self._check_formats(formats, entry_id)
+        self._sort_formats(formats)
+
+        return {
+            'id': entry_id,
+            'title': info['name'],
+            'formats': formats,
+            'description': clean_html(info.get('description')),
+            'thumbnail': info.get('thumbnailUrl'),
+            'duration': info.get('duration'),
+            'timestamp': info.get('createdAt'),
+            'uploader_id': info.get('userId'),
+            'view_count': info.get('plays'),
+        }
diff --git a/youtube_dl/extractor/kanalplay.py b/youtube_dl/extractor/kanalplay.py
new file mode 100644
index 000000000..4597d1b96
--- /dev/null
+++ b/youtube_dl/extractor/kanalplay.py
@@ -0,0 +1,97 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ float_or_none,
+ srt_subtitles_timecode,
+)
+
+
+class KanalPlayIE(InfoExtractor):
+    """Extractor for the kanal5play.se, kanal9play.se and kanal11play.se sites."""
+
+    IE_DESC = 'Kanal 5/9/11 Play'
+    _VALID_URL = r'https?://(?:www\.)?kanal(?P<channel_id>5|9|11)play\.se/(?:#!/)?(?:play/)?program/\d+/video/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://www.kanal5play.se/#!/play/program/3060212363/video/3270012277',
+        'info_dict': {
+            'id': '3270012277',
+            'ext': 'flv',
+            'title': 'Saknar både dusch och avlopp',
+            'description': 'md5:6023a95832a06059832ae93bc3c7efb7',
+            'duration': 2636.36,
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'http://www.kanal9play.se/#!/play/program/335032/video/246042',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.kanal11play.se/#!/play/program/232835958/video/367135199',
+        'only_matching': True,
+    }]
+
+    def _fix_subtitles(self, subs):
+        """Render the subtitle items of *subs* as SRT text.
+
+        Cues are numbered from 1 and separated by an empty line; the
+        millisecond offsets of each item are converted to seconds.
+        """
+        cues = []
+        for index, cue in enumerate(subs, 1):
+            begin = srt_subtitles_timecode(cue['startMillis'] / 1000.0)
+            finish = srt_subtitles_timecode(cue['endMillis'] / 1000.0)
+            cues.append(
+                '%s\r\n%s --> %s\r\n%s' % (index, begin, finish, cue['text']))
+        return '\r\n\r\n'.join(cues)
+
+    def _get_subtitles(self, channel_id, video_id):
+        """Download the subtitles of *video_id*; return {} when unavailable."""
+        raw_subs = self._download_json(
+            'http://www.kanal%splay.se/api/subtitles/%s' % (channel_id, video_id),
+            video_id, 'Downloading subtitles JSON', fatal=False)
+        if not raw_subs:
+            return {}
+        # NOTE(review): 'se' is the ISO 639-1 code of Northern Sami, Swedish
+        # is 'sv' - confirm which key is intended before changing it.
+        return {'se': [{'ext': 'srt', 'data': self._fix_subtitles(raw_subs)}]}
+
+    def _real_extract(self, url):
+        """Build the info dict from the channel's getVideo API.
+
+        Raises ExtractorError (expected) when the API lists reasons why no
+        streams are available.
+        """
+        match = re.match(self._VALID_URL, url)
+        video_id = match.group('id')
+        channel_id = match.group('channel_id')
+
+        video = self._download_json(
+            'http://www.kanal%splay.se/api/getVideo?format=FLASH&videoId=%s' % (channel_id, video_id),
+            video_id)
+
+        no_stream_reasons = video.get('reasonsForNoStreams')
+        if no_stream_reasons:
+            raise ExtractorError(
+                '%s returned error: %s' % (self.IE_NAME, '\n'.join(no_stream_reasons)),
+                expected=True)
+
+        title = video['title']
+        description = video.get('description')
+        # 'length' and 'bitrate' are divided by 1000 by float_or_none.
+        duration = float_or_none(video.get('length'), 1000)
+        thumbnail = video.get('posterUrl')
+
+        rtmp_base = video['streamBaseUrl']
+
+        formats = []
+        for stream in video['streams']:
+            formats.append({
+                'url': rtmp_base,
+                'play_path': stream['source'],
+                'ext': 'flv',
+                'tbr': float_or_none(stream.get('bitrate'), 1000),
+                'rtmp_real_time': True,
+            })
+        self._sort_formats(formats)
+
+        if video.get('hasSubtitle'):
+            subtitles = self.extract_subtitles(channel_id, video_id)
+        else:
+            subtitles = {}
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
diff --git a/youtube_dl/extractor/karaoketv.py b/youtube_dl/extractor/karaoketv.py
index e3b43ff8d..06daf5a89 100644
--- a/youtube_dl/extractor/karaoketv.py
+++ b/youtube_dl/extractor/karaoketv.py
@@ -2,7 +2,7 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
+from ..compat import compat_urllib_parse_unquote_plus
from ..utils import (
js_to_json,
)
@@ -24,7 +24,7 @@ class KaraoketvIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
page_video_url = self._og_search_video_url(webpage, video_id)
- config_json = compat_urllib_parse.unquote_plus(self._search_regex(
+ config_json = compat_urllib_parse_unquote_plus(self._search_regex(
r'config=(.*)', page_video_url, 'configuration'))
urls_info_json = self._download_json(
diff --git a/youtube_dl/extractor/karrierevideos.py b/youtube_dl/extractor/karrierevideos.py
new file mode 100644
index 000000000..bed94bc93
--- /dev/null
+++ b/youtube_dl/extractor/karrierevideos.py
@@ -0,0 +1,96 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+ fix_xml_ampersands,
+ float_or_none,
+ xpath_with_ns,
+ xpath_text,
+)
+
+
class KarriereVideosIE(InfoExtractor):
    """Extractor for videos on karrierevideos.at, served over RTMP."""

    _VALID_URL = r'http://(?:www\.)?karrierevideos\.at(?:/[^/]+)+/(?P<id>[^/]+)'
    _TESTS = [{
        'url': 'http://www.karrierevideos.at/berufsvideos/mittlere-hoehere-schulen/altenpflegerin',
        'info_dict': {
            'id': '32c91',
            'ext': 'flv',
            'title': 'AltenpflegerIn',
            'description': 'md5:dbadd1259fde2159a9b28667cb664ae2',
            'thumbnail': 're:^http://.*\.png',
        },
        'params': {
            # rtmp download
            'skip_download': True,
        }
    }, {
        # broken ampersands
        'url': 'http://www.karrierevideos.at/orientierung/vaeterkarenz-und-neue-chancen-fuer-muetter-baby-was-nun',
        'info_dict': {
            'id': '5sniu',
            'ext': 'flv',
            'title': 'Väterkarenz und neue Chancen für Mütter - "Baby - was nun?"',
            'description': 'md5:97092c6ad1fd7d38e9d6a5fdeb2bcc33',
            'thumbnail': 're:^http://.*\.png',
        },
        'params': {
            # rtmp download
            'skip_download': True,
        }
    }]

    def _real_extract(self, url):
        """Extract stream and metadata for a single karrierevideos.at page.

        The page is downloaded first; the real video id is then read from
        the embedded player config path and used to fetch the XML playlist
        that carries the stream location.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        # Prefer the <meta name="title"> value and fall back to the page
        # heading. The fallback must be given the page and a field name,
        # otherwise _search_regex raises TypeError instead of searching.
        title = (self._html_search_meta('title', webpage, default=None) or
                 self._search_regex(
                     r'<h1 class="title">([^<]+)</h1>', webpage, 'video title'))

        # From here on video_id is the id found in the player config path,
        # not the URL slug matched above.
        video_id = self._search_regex(
            r'/config/video/(.+?)\.xml', webpage, 'video id')
        # fix_xml_ampersands repairs unescaped '&' in the playlist (see the
        # "broken ampersands" test case).
        playlist = self._download_xml(
            'http://www.karrierevideos.at/player-playlist.xml.php?p=%s' % video_id,
            video_id, transform_source=fix_xml_ampersands)

        NS_MAP = {
            'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats'
        }

        def ns(path):
            return xpath_with_ns(path, NS_MAP)

        item = playlist.find('./tracklist/item')
        video_file = xpath_text(
            item, ns('./jwplayer:file'), 'video url', fatal=True)
        streamer = xpath_text(
            item, ns('./jwplayer:streamer'), 'streamer', fatal=True)

        uploader = xpath_text(
            item, ns('./jwplayer:author'), 'uploader')
        duration = float_or_none(
            xpath_text(item, ns('./jwplayer:duration'), 'duration'))

        description = self._html_search_regex(
            r'(?s)<div class="leadtext">(.+?)</div>',
            webpage, 'description')

        thumbnail = self._html_search_meta(
            'thumbnail', webpage, 'thumbnail')
        if thumbnail:
            # The meta value may be relative; resolve it against the page URL.
            thumbnail = compat_urlparse.urljoin(url, thumbnail)

        return {
            'id': video_id,
            'url': streamer.replace('rtmpt', 'rtmp'),
            'play_path': 'mp4:%s' % video_file,
            'ext': 'flv',
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'uploader': uploader,
            'duration': duration,
        }
diff --git a/youtube_dl/extractor/keek.py b/youtube_dl/extractor/keek.py
index c0956ba09..94a03d277 100644
--- a/youtube_dl/extractor/keek.py
+++ b/youtube_dl/extractor/keek.py
@@ -1,46 +1,39 @@
+# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
class KeekIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<id>\w+)'
+ _VALID_URL = r'https?://(?:www\.)?keek\.com/keek/(?P<id>\w+)'
IE_NAME = 'keek'
_TEST = {
- 'url': 'https://www.keek.com/ytdl/keeks/NODfbab',
- 'md5': '09c5c109067536c1cec8bac8c21fea05',
+ 'url': 'https://www.keek.com/keek/NODfbab',
+ 'md5': '9b0636f8c0f7614afa4ea5e4c6e57e83',
'info_dict': {
'id': 'NODfbab',
'ext': 'mp4',
- 'uploader': 'youtube-dl project',
- 'uploader_id': 'ytdl',
- 'title': 'test chars: "\'/\\\u00e4<>This is a test video for youtube-dl.For more information, contact phihag@phihag.de .',
+ 'title': 'md5:35d42050a3ece241d5ddd7fdcc6fd896',
+ 'uploader': 'ytdl',
+ 'uploader_id': 'eGT5bab',
},
}
def _real_extract(self, url):
video_id = self._match_id(url)
- video_url = 'http://cdn.keek.com/keek/video/%s' % video_id
- thumbnail = 'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
webpage = self._download_webpage(url, video_id)
- raw_desc = self._html_search_meta('description', webpage)
- if raw_desc:
- uploader = self._html_search_regex(
- r'Watch (.*?)\s+\(', raw_desc, 'uploader', fatal=False)
- uploader_id = self._html_search_regex(
- r'Watch .*?\(@(.+?)\)', raw_desc, 'uploader_id', fatal=False)
- else:
- uploader = None
- uploader_id = None
-
return {
'id': video_id,
- 'url': video_url,
+ 'url': self._og_search_video_url(webpage),
'ext': 'mp4',
- 'title': self._og_search_title(webpage),
- 'thumbnail': thumbnail,
- 'uploader': uploader,
- 'uploader_id': uploader_id,
+ 'title': self._og_search_description(webpage).strip(),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'uploader': self._search_regex(
+ r'data-username=(["\'])(?P<uploader>.+?)\1', webpage,
+ 'uploader', fatal=False, group='uploader'),
+ 'uploader_id': self._search_regex(
+ r'data-user-id=(["\'])(?P<uploader_id>.+?)\1', webpage,
+ 'uploader id', fatal=False, group='uploader_id'),
}
diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py
index 82eddec51..d79261bb5 100644
--- a/youtube_dl/extractor/keezmovies.py
+++ b/youtube_dl/extractor/keezmovies.py
@@ -4,10 +4,8 @@ import os
import re
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse_urlparse,
- compat_urllib_request,
-)
+from ..compat import compat_urllib_parse_urlparse
+from ..utils import sanitized_Request
class KeezMoviesIE(InfoExtractor):
@@ -26,7 +24,7 @@ class KeezMoviesIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- req = compat_urllib_request.Request(url)
+ req = sanitized_Request(url)
req.add_header('Cookie', 'age_verified=1')
webpage = self._download_webpage(req, video_id)
diff --git a/youtube_dl/extractor/kickstarter.py b/youtube_dl/extractor/kickstarter.py
index 7d4b57056..1d391e69f 100644
--- a/youtube_dl/extractor/kickstarter.py
+++ b/youtube_dl/extractor/kickstarter.py
@@ -28,6 +28,14 @@ class KickStarterIE(InfoExtractor):
'uploader': 'Pebble Technology',
'title': 'Pebble iOS Notifications',
}
+ }, {
+ 'url': 'https://www.kickstarter.com/projects/1420158244/power-drive-2000/widget/video.html',
+ 'info_dict': {
+ 'id': '1420158244',
+ 'ext': 'mp4',
+ 'title': 'Power Drive 2000',
+ },
+ 'expected_warnings': ['OpenGraph description'],
}]
def _real_extract(self, url):
@@ -48,10 +56,15 @@ class KickStarterIE(InfoExtractor):
'title': title,
}
+ thumbnail = self._og_search_thumbnail(webpage, default=None)
+ if thumbnail is None:
+ thumbnail = self._html_search_regex(
+ r'<img[^>]+class="[^"]+\s*poster\s*[^"]+"[^>]+src="([^"]+)"',
+ webpage, 'thumbnail image', fatal=False)
return {
'id': video_id,
'url': video_url,
'title': title,
'description': self._og_search_description(webpage),
- 'thumbnail': self._og_search_thumbnail(webpage),
+ 'thumbnail': thumbnail,
}
diff --git a/youtube_dl/extractor/kontrtube.py b/youtube_dl/extractor/kontrtube.py
index 720bc939b..a59c529f4 100644
--- a/youtube_dl/extractor/kontrtube.py
+++ b/youtube_dl/extractor/kontrtube.py
@@ -4,7 +4,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import int_or_none
+from ..utils import (
+ int_or_none,
+ parse_duration,
+)
class KontrTubeIE(InfoExtractor):
@@ -34,33 +37,28 @@ class KontrTubeIE(InfoExtractor):
webpage = self._download_webpage(
url, display_id, 'Downloading page')
- video_url = self._html_search_regex(
+ video_url = self._search_regex(
r"video_url\s*:\s*'(.+?)/?',", webpage, 'video URL')
- thumbnail = self._html_search_regex(
- r"preview_url\s*:\s*'(.+?)/?',", webpage, 'video thumbnail', fatal=False)
+ thumbnail = self._search_regex(
+ r"preview_url\s*:\s*'(.+?)/?',", webpage, 'thumbnail', fatal=False)
title = self._html_search_regex(
- r'<title>(.+?)</title>', webpage, 'video title')
+ r'(?s)<h2>(.+?)</h2>', webpage, 'title')
description = self._html_search_meta(
- 'description', webpage, 'video description')
+ 'description', webpage, 'description')
- mobj = re.search(
- r'<div class="col_2">Длительность: <span>(?P<minutes>\d+)м:(?P<seconds>\d+)с</span></div>',
- webpage)
- duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None
+ duration = self._search_regex(
+ r'Длительность: <em>([^<]+)</em>', webpage, 'duration', fatal=False)
+ if duration:
+ duration = parse_duration(duration.replace('мин', 'min').replace('сек', 'sec'))
- view_count = self._html_search_regex(
- r'<div class="col_2">Просмотров: <span>(\d+)</span></div>',
+ view_count = self._search_regex(
+ r'Просмотров: <em>([^<]+)</em>',
webpage, 'view count', fatal=False)
+ if view_count:
+ view_count = int_or_none(view_count.replace(' ', ''))
- comment_count = None
- comment_str = self._html_search_regex(
- r'Комментарии: <span>([^<]+)</span>', webpage, 'comment count', fatal=False)
- if comment_str.startswith('комментариев нет'):
- comment_count = 0
- else:
- mobj = re.search(r'\d+ из (?P<total>\d+) комментариев', comment_str)
- if mobj:
- comment_count = mobj.group('total')
+ comment_count = int_or_none(self._search_regex(
+ r'Комментарии \((\d+)\)<', webpage, ' comment count', fatal=False))
return {
'id': video_id,
diff --git a/youtube_dl/extractor/krasview.py b/youtube_dl/extractor/krasview.py
index e46954b47..0ae8ebd68 100644
--- a/youtube_dl/extractor/krasview.py
+++ b/youtube_dl/extractor/krasview.py
@@ -25,6 +25,9 @@ class KrasViewIE(InfoExtractor):
'duration': 27,
'thumbnail': 're:^https?://.*\.jpg',
},
+ 'params': {
+ 'skip_download': 'Not accessible from Travis CI server',
+ },
}
def _real_extract(self, url):
@@ -40,8 +43,10 @@ class KrasViewIE(InfoExtractor):
description = self._og_search_description(webpage, default=None)
thumbnail = flashvars.get('image') or self._og_search_thumbnail(webpage)
duration = int_or_none(flashvars.get('duration'))
- width = int_or_none(self._og_search_property('video:width', webpage, 'video width'))
- height = int_or_none(self._og_search_property('video:height', webpage, 'video height'))
+ width = int_or_none(self._og_search_property(
+ 'video:width', webpage, 'video width', default=None))
+ height = int_or_none(self._og_search_property(
+ 'video:height', webpage, 'video height', default=None))
return {
'id': video_id,
diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py
new file mode 100644
index 000000000..0c8ed5d07
--- /dev/null
+++ b/youtube_dl/extractor/kuwo.py
@@ -0,0 +1,318 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import itertools
+
+from .common import InfoExtractor
+from ..utils import (
+ get_element_by_id,
+ clean_html,
+ ExtractorError,
+ remove_start,
+)
+
+
class KuwoBaseIE(InfoExtractor):
    """Base class holding the format probing shared by Kuwo extractors."""

    # Formats probed for every song. 'br' is the optional bitrate token
    # sent to the server; 'preference' is passed through to the format dict.
    _FORMATS = [
        {'format': 'ape', 'ext': 'ape', 'preference': 100},
        {'format': 'mp3-320', 'ext': 'mp3', 'br': '320kmp3', 'abr': 320, 'preference': 80},
        {'format': 'mp3-192', 'ext': 'mp3', 'br': '192kmp3', 'abr': 192, 'preference': 70},
        {'format': 'mp3-128', 'ext': 'mp3', 'br': '128kmp3', 'abr': 128, 'preference': 60},
        {'format': 'wma', 'ext': 'wma', 'preference': 20},
        {'format': 'aac', 'ext': 'aac', 'abr': 48, 'preference': 10}
    ]

    def _get_formats(self, song_id):
        """Ask the server for a URL per entry of _FORMATS.

        Only responses that look like an http(s) URL are kept; the
        resulting list is sorted via _sort_formats and returned.
        """
        available = []
        for candidate in self._FORMATS:
            format_name = candidate['format']
            query_url = (
                'http://antiserver.kuwo.cn/anti.s?format=%s&br=%s&rid=MUSIC_%s&type=convert_url&response=url'
                % (candidate['ext'], candidate.get('br', ''), song_id))
            song_url = self._download_webpage(
                query_url, song_id, note='Download %s url info' % format_name)
            # Anything that is not a URL means the format is unavailable.
            if not song_url.startswith(('http://', 'https://')):
                continue
            available.append({
                'url': song_url,
                'format_id': format_name,
                'format': format_name,
                'preference': candidate['preference'],
                'abr': candidate.get('abr'),
            })
        self._sort_formats(available)
        return available
+
+
class KuwoIE(KuwoBaseIE):
    """Extractor for a single song page on www.kuwo.cn."""

    IE_NAME = 'kuwo:song'
    IE_DESC = '酷我音乐'
    _VALID_URL = r'http://www\.kuwo\.cn/yinyue/(?P<id>\d+?)/'
    _TESTS = [{
        'url': 'http://www.kuwo.cn/yinyue/635632/',
        'info_dict': {
            'id': '635632',
            'ext': 'ape',
            'title': '爱我别走',
            'creator': '张震岳',
            'upload_date': '20080122',
            'description': 'md5:ed13f58e3c3bf3f7fd9fbc4e5a7aa75c'
        },
        'skip': 'this song has been offline because of copyright issues',
    }, {
        'url': 'http://www.kuwo.cn/yinyue/6446136/',
        'info_dict': {
            'id': '6446136',
            'ext': 'mp3',
            'title': '心',
            'creator': 'IU',
            'upload_date': '20150518',
        },
        'params': {
            'format': 'mp3-320'
        },
    }]

    def _real_extract(self, url):
        """Extract title, singer, lyrics, formats and release date of a song.

        Raises ExtractorError (expected) when the page carries the
        copyright take-down notice.
        """
        song_id = self._match_id(url)
        webpage = self._download_webpage(
            url, song_id, note='Download song detail info',
            errnote='Unable to get song detail info')
        if '对不起,该歌曲由于版权问题已被下线,将返回网站首页' in webpage:
            raise ExtractorError('this song has been offline because of copyright issues', expected=True)

        title = self._html_search_regex(
            r'(?s)class="(?:[^"\s]+\s+)*title(?:\s+[^"\s]+)*".*?<h1[^>]+title="([^"]+)"', webpage, 'song name')
        creator = self._html_search_regex(
            r'<div[^>]+class="s_img">\s*<a[^>]+title="([^>]+)"',
            webpage, 'singer name', fatal=False)

        lyrics = clean_html(get_element_by_id('lrcContent', webpage))
        # This placeholder text indicates that the song has no lyrics.
        description = None if lyrics == '暂无' else lyrics

        formats = self._get_formats(song_id)

        album_id = self._html_search_regex(
            r'<p[^>]+class="album"[^<]+<a[^>]+href="http://www\.kuwo\.cn/album/(\d+)/"',
            webpage, 'album id', fatal=False)

        # The release date is read from the album page, so it is only
        # looked up when the song links to an album.
        upload_date = None
        if album_id is not None:
            album_page = self._download_webpage(
                'http://www.kuwo.cn/album/%s/' % album_id, song_id,
                note='Download album detail info',
                errnote='Unable to get album detail info')

            release = self._html_search_regex(
                r'发行时间:(\d{4}-\d{2}-\d{2})', album_page,
                'publish time', fatal=False)
            # YYYY-MM-DD -> YYYYMMDD
            upload_date = release.replace('-', '') if release else release

        return {
            'id': song_id,
            'title': title,
            'creator': creator,
            'upload_date': upload_date,
            'description': description,
            'formats': formats,
        }
+
+
class KuwoAlbumIE(InfoExtractor):
    """Playlist extractor for album pages on www.kuwo.cn."""

    IE_NAME = 'kuwo:album'
    IE_DESC = '酷我音乐 - 专辑'
    _VALID_URL = r'http://www\.kuwo\.cn/album/(?P<id>\d+?)/'
    _TEST = {
        'url': 'http://www.kuwo.cn/album/502294/',
        'info_dict': {
            'id': '502294',
            'title': 'M',
            'description': 'md5:6a7235a84cc6400ec3b38a7bdaf1d60c',
        },
        'playlist_count': 2,
    }

    def _real_extract(self, url):
        """Return a playlist of the song pages linked from the album page."""
        album_id = self._match_id(url)

        webpage = self._download_webpage(
            url, album_id, note='Download album info',
            errnote='Unable to get album info')

        album_name = self._html_search_regex(
            r'<div[^>]+class="comm"[^<]+<h1[^>]+title="([^"]+)"', webpage,
            'album name')

        # The intro text starts with "<album name>简介:"; strip that prefix.
        intro_text = clean_html(get_element_by_id('intro', webpage))
        album_intro = remove_start(intro_text, '%s简介:' % album_name)

        song_urls = re.findall(
            r'<p[^>]+class="listen"><a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+/)"',
            webpage)
        entries = []
        for song_url in song_urls:
            entries.append(self.url_result(song_url, 'Kuwo'))

        return self.playlist_result(entries, album_id, album_name, album_intro)
+
+
class KuwoChartIE(InfoExtractor):
    """Playlist extractor for billboard (chart) pages on yinyue.kuwo.cn."""

    IE_NAME = 'kuwo:chart'
    IE_DESC = '酷我音乐 - 排行榜'
    # The dot before "htm" is escaped so that it only matches a literal '.'.
    _VALID_URL = r'http://yinyue\.kuwo\.cn/billboard_(?P<id>[^.]+)\.htm'
    _TEST = {
        'url': 'http://yinyue.kuwo.cn/billboard_香港中文龙虎榜.htm',
        'info_dict': {
            'id': '香港中文龙虎榜',
            'title': '香港中文龙虎榜',
            'description': 're:\d{4}第\d{2}期',
        },
        'playlist_mincount': 10,
    }

    def _real_extract(self, url):
        """Return a playlist of the song pages linked from the chart page."""
        chart_id = self._match_id(url)
        webpage = self._download_webpage(
            url, chart_id, note='Download chart info',
            errnote='Unable to get chart info')

        chart_name = self._html_search_regex(
            r'<h1[^>]+class="unDis">([^<]+)</h1>', webpage, 'chart name')

        chart_desc = self._html_search_regex(
            r'<p[^>]+class="tabDef">(\d{4}第\d{2}期)</p>', webpage, 'chart desc')

        # Capture the trailing slash as part of the URL: the song
        # extractor's _VALID_URL requires it, so URLs captured without it
        # would not match the extractor they are handed to.
        entries = [
            self.url_result(song_url, 'Kuwo') for song_url in re.findall(
                r'<a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+/)"', webpage)
        ]
        return self.playlist_result(entries, chart_id, chart_name, chart_desc)
+
+
class KuwoSingerIE(InfoExtractor):
    """Playlist extractor for singer pages on www.kuwo.cn."""

    IE_NAME = 'kuwo:singer'
    IE_DESC = '酷我音乐 - 歌手'
    _VALID_URL = r'http://www\.kuwo\.cn/mingxing/(?P<id>[^/]+)'
    _TESTS = [{
        'url': 'http://www.kuwo.cn/mingxing/bruno+mars/',
        'info_dict': {
            'id': 'bruno+mars',
            'title': 'Bruno Mars',
        },
        'playlist_count': 10,
    }, {
        'url': 'http://www.kuwo.cn/mingxing/Ali/music.htm',
        'info_dict': {
            'id': 'Ali',
            'title': 'Ali',
        },
        'playlist_mincount': 95,
        'skip': 'Regularly stalls travis build',  # See https://travis-ci.org/rg3/youtube-dl/jobs/78878540
    }]

    def _real_extract(self, url):
        """Return a playlist of a singer's songs.

        For a plain singer URL only the first 10 songs of the first list
        page are returned; for a ``/music.htm`` or ``/music_N.htm`` URL the
        list pages are followed until no "next page" link is found.
        """
        singer_id = self._match_id(url)
        webpage = self._download_webpage(
            url, singer_id, note='Download singer info',
            errnote='Unable to get singer info')

        singer_name = self._html_search_regex(
            r'<div class="title clearfix">\s*<h1>([^<]+)<span', webpage, 'singer name'
        )

        entries = []
        first_page_only = not re.search(r'/music(?:_\d+)?\.htm', url)
        for page_num in itertools.count(1):
            list_page = self._download_webpage(
                'http://www.kuwo.cn/mingxing/%s/music_%d.htm' % (singer_id, page_num),
                singer_id, note='Download song list page #%d' % page_num,
                errnote='Unable to get song list page #%d' % page_num)

            # Capture the trailing slash as part of the URL: the song
            # extractor's _VALID_URL requires it, so URLs captured without
            # it would not match the extractor they are handed to.
            song_urls = re.findall(
                r'<p[^>]+class="m_name"><a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+/)',
                list_page)
            if first_page_only:
                song_urls = song_urls[:10]
            entries.extend(
                self.url_result(song_url, 'Kuwo') for song_url in song_urls)

            if first_page_only or not re.search(r'<a[^>]+href="[^"]+">下一页</a>', list_page):
                break

        return self.playlist_result(entries, singer_id, singer_name)
+
+
class KuwoCategoryIE(InfoExtractor):
    """Playlist extractor for category pages on yinyue.kuwo.cn."""

    IE_NAME = 'kuwo:category'
    IE_DESC = '酷我音乐 - 分类'
    # The dot before "htm" is escaped so that it only matches a literal '.'.
    _VALID_URL = r'http://yinyue\.kuwo\.cn/yy/cinfo_(?P<id>\d+?)\.htm'
    _TEST = {
        'url': 'http://yinyue.kuwo.cn/yy/cinfo_86375.htm',
        'info_dict': {
            'id': '86375',
            'title': '八十年代精选',
            'description': '这些都是属于八十年代的回忆!',
        },
        'playlist_count': 30,
    }

    def _real_extract(self, url):
        """Return a playlist built from the song list embedded in the page.

        The songs come from the ``jsonm`` JavaScript variable; each entry's
        ``musicrid`` is turned into a song page URL.
        """
        category_id = self._match_id(url)
        webpage = self._download_webpage(
            url, category_id, note='Download category info',
            errnote='Unable to get category info')

        category_name = self._html_search_regex(
            r'<h1[^>]+title="([^<>]+?)">[^<>]+?</h1>', webpage, 'category name')

        # The description is optional metadata: a page without the intro
        # element should yield no description, not an AttributeError.
        intro = get_element_by_id('intro', webpage)
        category_desc = None
        if intro is not None:
            category_desc = remove_start(
                intro.strip(), '%s简介:' % category_name)

        jsonm = self._parse_json(self._html_search_regex(
            r'var\s+jsonm\s*=\s*([^;]+);', webpage, 'category songs'), category_id)

        entries = [
            self.url_result('http://www.kuwo.cn/yinyue/%s/' % song['musicrid'], 'Kuwo')
            for song in jsonm['musiclist']
        ]
        return self.playlist_result(entries, category_id, category_name, category_desc)
+
+
class KuwoMvIE(KuwoBaseIE):
    """Extractor for music video pages on www.kuwo.cn."""

    IE_NAME = 'kuwo:mv'
    IE_DESC = '酷我音乐 - MV'
    _VALID_URL = r'http://www\.kuwo\.cn/mv/(?P<id>\d+?)/'
    _TEST = {
        'url': 'http://www.kuwo.cn/mv/6480076/',
        'info_dict': {
            'id': '6480076',
            'ext': 'mkv',
            'title': '我们家MV',
            'creator': '2PM',
        },
    }
    # The audio formats of the base class plus two video containers.
    _FORMATS = KuwoBaseIE._FORMATS + [
        {'format': 'mkv', 'ext': 'mkv', 'preference': 250},
        {'format': 'mp4', 'ext': 'mp4', 'preference': 200},
    ]

    def _real_extract(self, url):
        """Extract song name, singer and formats of a music video.

        Raises ExtractorError when the heading carrying both names cannot
        be found in the page.
        """
        song_id = self._match_id(url)
        webpage = self._download_webpage(
            url, song_id, note='Download mv detail info: %s' % song_id,
            errnote='Unable to get mv detail info: %s' % song_id)

        names = re.search(
            r'<h1[^>]+title="(?P<song>[^"]+)">[^<]+<span[^>]+title="(?P<singer>[^"]+)"',
            webpage)
        if not names:
            raise ExtractorError('Unable to find song or singer names')

        return {
            'id': song_id,
            'title': names.group('song'),
            'creator': names.group('singer'),
            'formats': self._get_formats(song_id),
        }
diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py
index 2fd3b4699..b459559b0 100644
--- a/youtube_dl/extractor/laola1tv.py
+++ b/youtube_dl/extractor/laola1tv.py
@@ -1,31 +1,32 @@
+# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import random
import re
from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (
+ ExtractorError,
+ xpath_text,
+)
class Laola1TvIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?laola1\.tv/(?P<lang>[a-z]+)-(?P<portal>[a-z]+)/.*?/(?P<id>[0-9]+)\.html'
_TEST = {
- 'url': 'http://www.laola1.tv/de-de/live/bwf-bitburger-open-grand-prix-gold-court-1/250019.html',
+ 'url': 'http://www.laola1.tv/de-de/video/straubing-tigers-koelner-haie/227883.html',
'info_dict': {
- 'id': '250019',
+ 'id': '227883',
'ext': 'mp4',
- 'title': 'Bitburger Open Grand Prix Gold - Court 1',
- 'categories': ['Badminton'],
- 'uploader': 'BWF - Badminton World Federation',
- 'is_live': True,
+ 'title': 'Straubing Tigers - Kölner Haie',
+ 'categories': ['Eishockey'],
+ 'is_live': False,
},
'params': {
'skip_download': True,
}
}
- _BROKEN = True # Not really - extractor works fine, but f4m downloader does not support live streams yet.
-
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
@@ -43,15 +44,22 @@ class Laola1TvIE(InfoExtractor):
r'flashvars\.([_a-zA-Z0-9]+)\s*=\s*"([^"]*)";', iframe)
flashvars = dict((m[0], m[1]) for m in flashvars_m)
+ partner_id = self._search_regex(
+ r'partnerid\s*:\s*"([^"]+)"', iframe, 'partner id')
+
xml_url = ('http://www.laola1.tv/server/hd_video.php?' +
- 'play=%s&partner=1&portal=%s&v5ident=&lang=%s' % (
- video_id, portal, lang))
+ 'play=%s&partner=%s&portal=%s&v5ident=&lang=%s' % (
+ video_id, partner_id, portal, lang))
hd_doc = self._download_xml(xml_url, video_id)
- title = hd_doc.find('.//video/title').text
- flash_url = hd_doc.find('.//video/url').text
- categories = hd_doc.find('.//video/meta_sports').text.split(',')
- uploader = hd_doc.find('.//video/meta_organistation').text
+ title = xpath_text(hd_doc, './/video/title', fatal=True)
+ flash_url = xpath_text(hd_doc, './/video/url', fatal=True)
+ uploader = xpath_text(hd_doc, './/video/meta_organistation')
+ is_live = xpath_text(hd_doc, './/video/islive') == 'true'
+
+ categories = xpath_text(hd_doc, './/video/meta_sports')
+ if categories:
+ categories = categories.split(',')
ident = random.randint(10000000, 99999999)
token_url = '%s&ident=%s&klub=0&unikey=0&timestamp=%s&auth=%s' % (
@@ -60,15 +68,16 @@ class Laola1TvIE(InfoExtractor):
token_doc = self._download_xml(
token_url, video_id, note='Downloading token')
token_attrib = token_doc.find('.//token').attrib
- if token_attrib.get('auth') == 'blocked':
- raise ExtractorError('Token error: ' % token_attrib.get('comment'))
+ if token_attrib.get('auth') in ('blocked', 'restricted'):
+ raise ExtractorError(
+ 'Token error: %s' % token_attrib.get('comment'), expected=True)
video_url = '%s?hdnea=%s&hdcore=3.2.0' % (
token_attrib['url'], token_attrib['auth'])
return {
'id': video_id,
- 'is_live': True,
+ 'is_live': is_live,
'title': title,
'url': video_url,
'uploader': uploader,
diff --git a/youtube_dl/extractor/lecture2go.py b/youtube_dl/extractor/lecture2go.py
new file mode 100644
index 000000000..40a3d2346
--- /dev/null
+++ b/youtube_dl/extractor/lecture2go.py
@@ -0,0 +1,62 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ parse_duration,
+ int_or_none,
+)
+
+
class Lecture2GoIE(InfoExtractor):
    """Extractor for lecture recordings on lecture2go.uni-hamburg.de."""

    _VALID_URL = r'https?://lecture2go\.uni-hamburg\.de/veranstaltungen/-/v/(?P<id>\d+)'
    _TEST = {
        'url': 'https://lecture2go.uni-hamburg.de/veranstaltungen/-/v/17473',
        'md5': 'ac02b570883020d208d405d5a3fd2f7f',
        'info_dict': {
            'id': '17473',
            'ext': 'flv',
            'title': '2 - Endliche Automaten und reguläre Sprachen',
            'creator': 'Frank Heitmann',
            'duration': 5220,
        }
    }

    def _real_extract(self, url):
        """Collect every media source referenced by the page as a format."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        title = self._html_search_regex(r'<em[^>]+class="title">(.+)</em>', webpage, 'title')

        formats = []
        # De-duplicate the source URLs before expanding them.
        media_urls = set(re.findall(r'"src","([^"]+)"', webpage))
        for media_url in media_urls:
            ext = determine_ext(media_url)
            if ext == 'f4m':
                # Manifests expand into several formats each.
                formats.extend(self._extract_f4m_formats(media_url, video_id))
                continue
            if ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(media_url, video_id))
                continue
            formats.append({
                'url': media_url,
            })

        self._sort_formats(formats)

        creator = self._html_search_regex(
            r'<div[^>]+id="description">([^<]+)</div>', webpage, 'creator', fatal=False)
        duration_text = self._html_search_regex(
            r'Duration:\s*</em>\s*<em[^>]*>([^<]+)</em>', webpage, 'duration', fatal=False)
        views_text = self._html_search_regex(
            r'Views:\s*</em>\s*<em[^>]+>(\d+)</em>', webpage, 'view count', fatal=False)

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'creator': creator,
            'duration': parse_duration(duration_text),
            'view_count': int_or_none(views_text),
        }
diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py
new file mode 100644
index 000000000..be648000e
--- /dev/null
+++ b/youtube_dl/extractor/letv.py
@@ -0,0 +1,241 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import datetime
+import re
+import time
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse,
+ compat_ord,
+)
+from ..utils import (
+ determine_ext,
+ ExtractorError,
+ parse_iso8601,
+ sanitized_Request,
+ int_or_none,
+ encode_data_uri,
+)
+
+
class LetvIE(InfoExtractor):
    """Extractor for single video pages on www.letv.com."""

    IE_DESC = '乐视网'
    # The dot before "html" is escaped so that it only matches a literal '.'.
    _VALID_URL = r'http://www\.letv\.com/ptv/vplay/(?P<id>\d+)\.html'

    _TESTS = [{
        'url': 'http://www.letv.com/ptv/vplay/22005890.html',
        'md5': 'edadcfe5406976f42f9f266057ee5e40',
        'info_dict': {
            'id': '22005890',
            'ext': 'mp4',
            'title': '第87届奥斯卡颁奖礼完美落幕 《鸟人》成最大赢家',
            'description': 'md5:a9cb175fd753e2962176b7beca21a47c',
        },
        'params': {
            'hls_prefer_native': True,
        },
    }, {
        'url': 'http://www.letv.com/ptv/vplay/1415246.html',
        'info_dict': {
            'id': '1415246',
            'ext': 'mp4',
            'title': '美人天下01',
            'description': 'md5:f88573d9d7225ada1359eaf0dbf8bcda',
        },
        'params': {
            'hls_prefer_native': True,
        },
    }, {
        'note': 'This video is available only in Mainland China, thus a proxy is needed',
        'url': 'http://www.letv.com/ptv/vplay/1118082.html',
        'md5': '2424c74948a62e5f31988438979c5ad1',
        'info_dict': {
            'id': '1118082',
            'ext': 'mp4',
            'title': '与龙共舞 完整版',
            'description': 'md5:7506a5eeb1722bb9d4068f85024e3986',
        },
        'params': {
            'hls_prefer_native': True,
        },
        'skip': 'Only available in China',
    }]

    @staticmethod
    def urshift(val, n):
        """Shift ``val`` right by ``n`` bits, treating it as unsigned 32-bit.

        Negative values have 2**32 added before the shift.
        """
        return val >> n if val >= 0 else (val + 0x100000000) >> n

    # ror() and calc_time_key() are reversed from a embedded swf file in KLetvPlayer.swf
    def ror(self, param1, param2):
        """Rotate ``param1`` right by one bit, ``param2`` times (32-bit)."""
        for _ in range(param2):
            # The lowest bit wraps around to bit 31.
            param1 = self.urshift(param1, 1) + ((param1 & 1) << 31)
        return param1

    def calc_time_key(self, param1):
        """Derive the ``tkey`` request parameter from a Unix timestamp."""
        _loc2_ = 773625421
        _loc3_ = self.ror(param1, _loc2_ % 13)
        _loc3_ = _loc3_ ^ _loc2_
        _loc3_ = self.ror(_loc3_, _loc2_ % 17)
        return _loc3_

    # see M3U8Encryption class in KLetvPlayer.swf
    @staticmethod
    def decrypt_m3u8(encrypted_data):
        """Undo the ``vc_01`` obfuscation of a downloaded m3u8 playlist.

        Data that does not start with the (case-insensitive) ``vc_01``
        marker is returned unchanged. Otherwise every byte is split into
        two 4-bit halves, the last 11 halves are moved to the front, and
        the halves are paired back into bytes.

        Both passes index into pre-sized buffers, so the work is linear in
        the input size; repeatedly slicing the first element off the
        buffer would copy the remainder on every step.
        """
        if encrypted_data[:5].decode('utf-8').lower() != 'vc_01':
            return encrypted_data
        encrypted_data = encrypted_data[5:]

        byte_count = len(encrypted_data)
        nibbles = bytearray(2 * byte_count)
        for pos, val in enumerate(encrypted_data):
            b = compat_ord(val)
            nibbles[2 * pos] = b // 16
            nibbles[2 * pos + 1] = b & 0x0f

        idx = len(nibbles) - 11
        nibbles = nibbles[idx:] + nibbles[:idx]

        decrypted = bytearray(byte_count)
        for pos in range(byte_count):
            decrypted[pos] = nibbles[2 * pos] * 16 + nibbles[2 * pos + 1]

        return bytes(decrypted)

    def _real_extract(self, url):
        """Extract formats and metadata for one letv.com video page.

        Raises ExtractorError (expected) when the playJson response reports
        status 0.
        """
        media_id = self._match_id(url)
        page = self._download_webpage(url, media_id)
        params = {
            'id': media_id,
            'platid': 1,
            'splatid': 101,
            'format': 1,
            'tkey': self.calc_time_key(int(time.time())),
            'domain': 'www.letv.com'
        }
        play_json_req = sanitized_Request(
            'http://api.letv.com/mms/out/video/playJson?' + compat_urllib_parse.urlencode(params)
        )
        # Route this request through the configured proxy, if any.
        cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')
        if cn_verification_proxy:
            play_json_req.add_header('Ytdl-request-proxy', cn_verification_proxy)

        play_json = self._download_json(
            play_json_req,
            media_id, 'Downloading playJson data')

        # Check for errors
        playstatus = play_json['playstatus']
        if playstatus['status'] == 0:
            flag = playstatus['flag']
            if flag == 1:
                msg = 'Country %s auth error' % playstatus['country']
            else:
                msg = 'Generic error. flag = %d' % flag
            raise ExtractorError(msg, expected=True)

        playurl = play_json['playurl']
        dispatch = playurl['dispatch']

        formats = []
        for format_id in ['350', '1000', '1300', '720p', '1080p']:
            if format_id not in dispatch:
                continue

            media_url = playurl['domain'][0] + dispatch[format_id][0]
            media_url += '&' + compat_urllib_parse.urlencode({
                'm3v': 1,
                'format': 1,
                'expect': 3,
                'rateid': format_id,
            })

            nodes_data = self._download_json(
                media_url, media_id,
                'Download JSON metadata for format %s' % format_id)

            req = self._request_webpage(
                nodes_data['nodelist'][0]['location'], media_id,
                note='Downloading m3u8 information for format %s' % format_id)

            m3u8_data = self.decrypt_m3u8(req.read())

            # The decrypted playlist is embedded as a data: URI.
            url_info_dict = {
                'url': encode_data_uri(m3u8_data, 'application/vnd.apple.mpegurl'),
                'ext': determine_ext(dispatch[format_id][1]),
                'format_id': format_id,
                'protocol': 'm3u8',
            }

            # Ids such as '720p' carry the frame height.
            if format_id[-1:] == 'p':
                url_info_dict['height'] = int_or_none(format_id[:-1])

            formats.append(url_info_dict)

        # The scraped publish time is parsed with a fixed UTC+8 offset.
        publish_time = parse_iso8601(self._html_search_regex(
            r'发布时间&nbsp;([^<>]+) ', page, 'publish time', default=None),
            delimiter=' ', timezone=datetime.timedelta(hours=8))
        description = self._html_search_meta('description', page, fatal=False)

        return {
            'id': media_id,
            'formats': formats,
            'title': playurl['title'],
            'thumbnail': playurl['pic'],
            'description': description,
            'timestamp': publish_time,
        }
+
+
class LetvTvIE(InfoExtractor):
    """Playlist extractor for series pages on www.letv.com."""

    # Dots are escaped so that they only match a literal '.'.
    _VALID_URL = r'http://www\.letv\.com/tv/(?P<id>\d+)\.html'
    _TESTS = [{
        'url': 'http://www.letv.com/tv/46177.html',
        'info_dict': {
            'id': '46177',
            'title': '美人天下',
            'description': 'md5:395666ff41b44080396e59570dbac01c'
        },
        'playlist_count': 35
    }]

    def _real_extract(self, url):
        """Return a playlist of all video pages linked from the page.

        The title is the first entry of the ``keywords`` meta tag and is
        left as None when that tag is missing.
        """
        playlist_id = self._match_id(url)
        page = self._download_webpage(url, playlist_id)

        # De-duplicate the links; the set does not keep page order.
        media_urls = list(set(re.findall(
            r'http://www\.letv\.com/ptv/vplay/\d+\.html', page)))
        entries = [self.url_result(media_url, ie='Letv')
                   for media_url in media_urls]

        # The lookup is non-fatal and may return None, so guard the split.
        keywords = self._html_search_meta('keywords', page, fatal=False)
        title = keywords.split(',')[0] if keywords is not None else None
        description = self._html_search_meta('description', page, fatal=False)

        return self.playlist_result(entries, playlist_id, playlist_title=title,
                                    playlist_description=description)
+
+
class LetvPlaylistIE(LetvTvIE):
    """Playlist extractor for tv.letv.com index pages.

    Only the URL pattern and tests differ; extraction is inherited from
    LetvTvIE.
    """

    # Dots are escaped so that they only match a literal '.'.
    _VALID_URL = r'http://tv\.letv\.com/[a-z]+/(?P<id>[a-z]+)/index\.s?html'
    _TESTS = [{
        'url': 'http://tv.letv.com/izt/wuzetian/index.html',
        'info_dict': {
            'id': 'wuzetian',
            'title': '武媚娘传奇',
            'description': 'md5:e12499475ab3d50219e5bba00b3cb248'
        },
        # This playlist contains some extra videos other than the drama itself
        'playlist_mincount': 96
    }, {
        'url': 'http://tv.letv.com/pzt/lswjzzjc/index.shtml',
        'info_dict': {
            'id': 'lswjzzjc',
            # The title should be "劲舞青春", but I can't find a simple way to
            # determine the playlist title
            'title': '乐视午间自制剧场',
            'description': 'md5:b1eef244f45589a7b5b1af9ff25a4489'
        },
        'playlist_mincount': 7
    }]
diff --git a/youtube_dl/extractor/libsyn.py b/youtube_dl/extractor/libsyn.py
new file mode 100644
index 000000000..d375695f5
--- /dev/null
+++ b/youtube_dl/extractor/libsyn.py
@@ -0,0 +1,69 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import unified_strdate
+
+
+class LibsynIE(InfoExtractor):
+ _VALID_URL = r'(?P<mainurl>https?://html5-player\.libsyn\.com/embed/episode/id/(?P<id>[0-9]+))'
+
+ _TESTS = [{
+ 'url': 'http://html5-player.libsyn.com/embed/episode/id/3377616/',
+ 'md5': '443360ee1b58007bc3dcf09b41d093bb',
+ 'info_dict': {
+ 'id': '3377616',
+ 'ext': 'mp3',
+ 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
+ 'description': 'md5:601cb790edd05908957dae8aaa866465',
+ 'upload_date': '20150220',
+ 'thumbnail': 're:^https?://.*',
+ },
+ }, {
+ 'url': 'https://html5-player.libsyn.com/embed/episode/id/3727166/height/75/width/200/theme/standard/direction/no/autoplay/no/autonext/no/thumbnail/no/preload/no/no_addthis/no/',
+ 'md5': '6c5cb21acd622d754d3b1a92b582ce42',
+ 'info_dict': {
+ 'id': '3727166',
+ 'ext': 'mp3',
+ 'title': 'Clients From Hell Podcast - How a Sex Toy Company Kickstarted my Freelance Career',
+ 'upload_date': '20150818',
+ 'thumbnail': 're:^https?://.*',
+ }
+ }]
+
+ def _real_extract(self, url):
+ m = re.match(self._VALID_URL, url)
+ video_id = m.group('id')
+ url = m.group('mainurl')
+ webpage = self._download_webpage(url, video_id)
+
+ formats = [{
+ 'url': media_url,
+ } for media_url in set(re.findall('var\s+mediaURL(?:Libsyn)?\s*=\s*"([^"]+)"', webpage))]
+
+ podcast_title = self._search_regex(
+ r'<h2>([^<]+)</h2>', webpage, 'podcast title', default=None)
+ episode_title = self._search_regex(
+ r'(?:<div class="episode-title">|<h3>)([^<]+)</', webpage, 'episode title')
+
+ title = '%s - %s' % (podcast_title, episode_title) if podcast_title else episode_title
+
+ description = self._html_search_regex(
+ r'<div id="info_text_body">(.+?)</div>', webpage,
+ 'description', default=None)
+ thumbnail = self._search_regex(
+ r'<img[^>]+class="info-show-icon"[^>]+src="([^"]+)"',
+ webpage, 'thumbnail', fatal=False)
+ release_date = unified_strdate(self._search_regex(
+ r'<div class="release_date">Released: ([^<]+)<', webpage, 'release date', fatal=False))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'upload_date': release_date,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py
index 1dfe7f77f..f8cbca7b3 100644
--- a/youtube_dl/extractor/lifenews.py
+++ b/youtube_dl/extractor/lifenews.py
@@ -4,8 +4,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_urlparse
from ..utils import (
+ determine_ext,
int_or_none,
+ remove_end,
unified_strdate,
ExtractorError,
)
@@ -14,9 +17,9 @@ from ..utils import (
class LifeNewsIE(InfoExtractor):
IE_NAME = 'lifenews'
IE_DESC = 'LIFE | NEWS'
- _VALID_URL = r'http://lifenews\.ru/(?:mobile/)?news/(?P<id>\d+)'
+ _VALID_URL = r'http://lifenews\.ru/(?:mobile/)?(?P<section>news|video)/(?P<id>\d+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://lifenews.ru/news/126342',
'md5': 'e1b50a5c5fb98a6a544250f2e0db570a',
'info_dict': {
@@ -27,48 +30,139 @@ class LifeNewsIE(InfoExtractor):
'thumbnail': 're:http://.*\.jpg',
'upload_date': '20140130',
}
- }
+ }, {
+ # video in <iframe>
+ 'url': 'http://lifenews.ru/news/152125',
+ 'md5': '77d19a6f0886cd76bdbf44b4d971a273',
+ 'info_dict': {
+ 'id': '152125',
+ 'ext': 'mp4',
+ 'title': 'В Сети появилось видео захвата «Правым сектором» колхозных полей ',
+ 'description': 'Жители двух поселков Днепропетровской области не простили радикалам угрозу лишения плодородных земель и пошли в лобовую. ',
+ 'upload_date': '20150402',
+ }
+ }, {
+ 'url': 'http://lifenews.ru/news/153461',
+ 'md5': '9b6ef8bc0ffa25aebc8bdb40d89ab795',
+ 'info_dict': {
+ 'id': '153461',
+ 'ext': 'mp4',
+ 'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве',
+ 'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.',
+ 'upload_date': '20150505',
+ }
+ }, {
+ 'url': 'http://lifenews.ru/video/13035',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
+ section = mobj.group('section')
- webpage = self._download_webpage('http://lifenews.ru/news/%s' % video_id, video_id, 'Downloading page')
+ webpage = self._download_webpage(
+ 'http://lifenews.ru/%s/%s' % (section, video_id),
+ video_id, 'Downloading page')
videos = re.findall(r'<video.*?poster="(?P<poster>[^"]+)".*?src="(?P<video>[^"]+)".*?></video>', webpage)
- if not videos:
+ iframe_link = self._html_search_regex(
+ '<iframe[^>]+src=["\']([^"\']+)["\']', webpage, 'iframe link', default=None)
+ if not videos and not iframe_link:
raise ExtractorError('No media links available for %s' % video_id)
- title = self._og_search_title(webpage)
- TITLE_SUFFIX = ' - Первый по срочным новостям — LIFE | NEWS'
- if title.endswith(TITLE_SUFFIX):
- title = title[:-len(TITLE_SUFFIX)]
+ title = remove_end(
+ self._og_search_title(webpage),
+ ' - Первый по срочным новостям — LIFE | NEWS')
description = self._og_search_description(webpage)
view_count = self._html_search_regex(
- r'<div class=\'views\'>(\d+)</div>', webpage, 'view count', fatal=False)
+ r'<div class=\'views\'>\s*(\d+)\s*</div>', webpage, 'view count', fatal=False)
comment_count = self._html_search_regex(
- r'<div class=\'comments\'>\s*<span class=\'counter\'>(\d+)</span>', webpage, 'comment count', fatal=False)
+ r'=\'commentCount\'[^>]*>\s*(\d+)\s*<',
+ webpage, 'comment count', fatal=False)
upload_date = self._html_search_regex(
- r'<time datetime=\'([^\']+)\'>', webpage, 'upload date', fatal=False)
+ r'<time[^>]*datetime=\'([^\']+)\'', webpage, 'upload date', fatal=False)
if upload_date is not None:
upload_date = unified_strdate(upload_date)
+ common_info = {
+ 'description': description,
+ 'view_count': int_or_none(view_count),
+ 'comment_count': int_or_none(comment_count),
+ 'upload_date': upload_date,
+ }
+
def make_entry(video_id, media, video_number=None):
- return {
+ cur_info = dict(common_info)
+ cur_info.update({
'id': video_id,
'url': media[1],
'thumbnail': media[0],
'title': title if video_number is None else '%s-video%s' % (title, video_number),
- 'description': description,
- 'view_count': int_or_none(view_count),
- 'comment_count': int_or_none(comment_count),
- 'upload_date': upload_date,
- }
+ })
+ return cur_info
+
+ if iframe_link:
+ iframe_link = self._proto_relative_url(iframe_link, 'http:')
+ cur_info = dict(common_info)
+ cur_info.update({
+ '_type': 'url_transparent',
+ 'id': video_id,
+ 'title': title,
+ 'url': iframe_link,
+ })
+ return cur_info
if len(videos) == 1:
return make_entry(video_id, videos[0])
else:
return [make_entry(video_id, media, video_number + 1) for video_number, media in enumerate(videos)]
+
+
+class LifeEmbedIE(InfoExtractor):
+ IE_NAME = 'life:embed'
+ _VALID_URL = r'http://embed\.life\.ru/embed/(?P<id>[\da-f]{32})'
+
+ _TEST = {
+ 'url': 'http://embed.life.ru/embed/e50c2dec2867350528e2574c899b8291',
+ 'md5': 'b889715c9e49cb1981281d0e5458fbbe',
+ 'info_dict': {
+ 'id': 'e50c2dec2867350528e2574c899b8291',
+ 'ext': 'mp4',
+ 'title': 'e50c2dec2867350528e2574c899b8291',
+ 'thumbnail': 're:http://.*\.jpg',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ formats = []
+ for video_url in re.findall(r'"file"\s*:\s*"([^"]+)', webpage):
+ video_url = compat_urlparse.urljoin(url, video_url)
+ ext = determine_ext(video_url)
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ video_url, video_id, 'mp4', m3u8_id='m3u8'))
+ else:
+ formats.append({
+ 'url': video_url,
+ 'format_id': ext,
+ 'preference': 1,
+ })
+ self._sort_formats(formats)
+
+ thumbnail = self._search_regex(
+ r'"image"\s*:\s*"([^"]+)', webpage, 'thumbnail', default=None)
+
+ return {
+ 'id': video_id,
+ 'title': video_id,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py
new file mode 100644
index 000000000..fb03dd527
--- /dev/null
+++ b/youtube_dl/extractor/limelight.py
@@ -0,0 +1,229 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ float_or_none,
+ int_or_none,
+)
+
+
+class LimelightBaseIE(InfoExtractor):
+ _PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/%s/%s/%s'
+ _API_URL = 'http://api.video.limelight.com/rest/organizations/%s/%s/%s/%s.json'
+
+ def _call_playlist_service(self, item_id, method, fatal=True):
+ return self._download_json(
+ self._PLAYLIST_SERVICE_URL % (self._PLAYLIST_SERVICE_PATH, item_id, method),
+ item_id, 'Downloading PlaylistService %s JSON' % method, fatal=fatal)
+
+ def _call_api(self, organization_id, item_id, method):
+ return self._download_json(
+ self._API_URL % (organization_id, self._API_PATH, item_id, method),
+ item_id, 'Downloading API %s JSON' % method)
+
+ def _extract(self, item_id, pc_method, mobile_method, meta_method):
+ pc = self._call_playlist_service(item_id, pc_method)
+ metadata = self._call_api(pc['orgId'], item_id, meta_method)
+ mobile = self._call_playlist_service(item_id, mobile_method, fatal=False)
+ return pc, mobile, metadata
+
+ def _extract_info(self, streams, mobile_urls, properties):
+ video_id = properties['media_id']
+ formats = []
+
+ for stream in streams:
+ stream_url = stream.get('url')
+ if not stream_url:
+ continue
+ if '.f4m' in stream_url:
+ formats.extend(self._extract_f4m_formats(stream_url, video_id))
+ else:
+ fmt = {
+ 'url': stream_url,
+ 'abr': float_or_none(stream.get('audioBitRate')),
+ 'vbr': float_or_none(stream.get('videoBitRate')),
+ 'fps': float_or_none(stream.get('videoFrameRate')),
+ 'width': int_or_none(stream.get('videoWidthInPixels')),
+ 'height': int_or_none(stream.get('videoHeightInPixels')),
+ 'ext': determine_ext(stream_url)
+ }
+ rtmp = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>.+))/(?P<playpath>mp4:.+)$', stream_url)
+ if rtmp:
+ format_id = 'rtmp'
+ if stream.get('videoBitRate'):
+ format_id += '-%d' % int_or_none(stream['videoBitRate'])
+ fmt.update({
+ 'url': rtmp.group('url'),
+ 'play_path': rtmp.group('playpath'),
+ 'app': rtmp.group('app'),
+ 'ext': 'flv',
+ 'format_id': format_id,
+ })
+ formats.append(fmt)
+
+ for mobile_url in mobile_urls:
+ media_url = mobile_url.get('mobileUrl')
+ if not media_url:
+ continue
+ format_id = mobile_url.get('targetMediaPlatform')
+ if determine_ext(media_url) == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ media_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ preference=-1, m3u8_id=format_id))
+ else:
+ formats.append({
+ 'url': media_url,
+ 'format_id': format_id,
+ 'preference': -1,
+ })
+
+ self._sort_formats(formats)
+
+ title = properties['title']
+ description = properties.get('description')
+ timestamp = int_or_none(properties.get('publish_date') or properties.get('create_date'))
+ duration = float_or_none(properties.get('duration_in_milliseconds'), 1000)
+ filesize = int_or_none(properties.get('total_storage_in_bytes'))
+ categories = [properties.get('category')]
+ tags = properties.get('tags', [])
+ thumbnails = [{
+ 'url': thumbnail['url'],
+ 'width': int_or_none(thumbnail.get('width')),
+ 'height': int_or_none(thumbnail.get('height')),
+ } for thumbnail in properties.get('thumbnails', []) if thumbnail.get('url')]
+
+ subtitles = {}
+ for caption in properties.get('captions', {}):
+ lang = caption.get('language_code')
+ subtitles_url = caption.get('url')
+ if lang and subtitles_url:
+ subtitles[lang] = [{
+ 'url': subtitles_url,
+ }]
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'formats': formats,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ 'filesize': filesize,
+ 'categories': categories,
+ 'tags': tags,
+ 'thumbnails': thumbnails,
+ 'subtitles': subtitles,
+ }
+
+
+class LimelightMediaIE(LimelightBaseIE):
+ IE_NAME = 'limelight'
+ _VALID_URL = r'(?:limelight:media:|http://link\.videoplatform\.limelight\.com/media/\??\bmediaId=)(?P<id>[a-z0-9]{32})'
+ _TESTS = [{
+ 'url': 'http://link.videoplatform.limelight.com/media/?mediaId=3ffd040b522b4485b6d84effc750cd86',
+ 'info_dict': {
+ 'id': '3ffd040b522b4485b6d84effc750cd86',
+ 'ext': 'flv',
+ 'title': 'HaP and the HB Prince Trailer',
+ 'description': 'md5:8005b944181778e313d95c1237ddb640',
+ 'thumbnail': 're:^https?://.*\.jpeg$',
+ 'duration': 144.23,
+ 'timestamp': 1244136834,
+ 'upload_date': '20090604',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }, {
+ # video with subtitles
+ 'url': 'limelight:media:a3e00274d4564ec4a9b29b9466432335',
+ 'info_dict': {
+ 'id': 'a3e00274d4564ec4a9b29b9466432335',
+ 'ext': 'flv',
+ 'title': '3Play Media Overview Video',
+ 'description': '',
+ 'thumbnail': 're:^https?://.*\.jpeg$',
+ 'duration': 78.101,
+ 'timestamp': 1338929955,
+ 'upload_date': '20120605',
+ 'subtitles': 'mincount:9',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }]
+ _PLAYLIST_SERVICE_PATH = 'media'
+ _API_PATH = 'media'
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ pc, mobile, metadata = self._extract(
+ video_id, 'getPlaylistByMediaId', 'getMobilePlaylistByMediaId', 'properties')
+
+ return self._extract_info(
+ pc['playlistItems'][0].get('streams', []),
+ mobile['mediaList'][0].get('mobileUrls', []) if mobile else [],
+ metadata)
+
+
+class LimelightChannelIE(LimelightBaseIE):
+ IE_NAME = 'limelight:channel'
+ _VALID_URL = r'(?:limelight:channel:|http://link\.videoplatform\.limelight\.com/media/\??\bchannelId=)(?P<id>[a-z0-9]{32})'
+ _TEST = {
+ 'url': 'http://link.videoplatform.limelight.com/media/?channelId=ab6a524c379342f9b23642917020c082',
+ 'info_dict': {
+ 'id': 'ab6a524c379342f9b23642917020c082',
+ 'title': 'Javascript Sample Code',
+ },
+ 'playlist_mincount': 3,
+ }
+ _PLAYLIST_SERVICE_PATH = 'channel'
+ _API_PATH = 'channels'
+
+ def _real_extract(self, url):
+ channel_id = self._match_id(url)
+
+ pc, mobile, medias = self._extract(
+ channel_id, 'getPlaylistByChannelId',
+ 'getMobilePlaylistWithNItemsByChannelId?begin=0&count=-1', 'media')
+
+ entries = [
+ self._extract_info(
+ pc['playlistItems'][i].get('streams', []),
+ mobile['mediaList'][i].get('mobileUrls', []) if mobile else [],
+ medias['media_list'][i])
+ for i in range(len(medias['media_list']))]
+
+ return self.playlist_result(entries, channel_id, pc['title'])
+
+
+class LimelightChannelListIE(LimelightBaseIE):
+ IE_NAME = 'limelight:channel_list'
+ _VALID_URL = r'(?:limelight:channel_list:|http://link\.videoplatform\.limelight\.com/media/\?.*?\bchannelListId=)(?P<id>[a-z0-9]{32})'
+ _TEST = {
+ 'url': 'http://link.videoplatform.limelight.com/media/?channelListId=301b117890c4465c8179ede21fd92e2b',
+ 'info_dict': {
+ 'id': '301b117890c4465c8179ede21fd92e2b',
+ 'title': 'Website - Hero Player',
+ },
+ 'playlist_mincount': 2,
+ }
+ _PLAYLIST_SERVICE_PATH = 'channel_list'
+
+ def _real_extract(self, url):
+ channel_list_id = self._match_id(url)
+
+ channel_list = self._call_playlist_service(channel_list_id, 'getMobileChannelListById')
+
+ entries = [
+ self.url_result('limelight:channel:%s' % channel['id'], 'LimelightChannel')
+ for channel in channel_list['channelList']]
+
+ return self.playlist_result(entries, channel_list_id, channel_list['title'])
diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py
index 35822067f..857edfde2 100644
--- a/youtube_dl/extractor/liveleak.py
+++ b/youtube_dl/extractor/liveleak.py
@@ -40,6 +40,17 @@ class LiveLeakIE(InfoExtractor):
'title': 'Man is Fatally Struck by Reckless Car While Packing up a Moving Truck',
'age_limit': 18,
}
+ }, {
+ # Covers https://github.com/rg3/youtube-dl/pull/5983
+ 'url': 'http://www.liveleak.com/view?i=801_1409392012',
+ 'md5': '0b3bec2d888c20728ca2ad3642f0ef15',
+ 'info_dict': {
+ 'id': '801_1409392012',
+ 'ext': 'mp4',
+ 'description': "Happened on 27.7.2014. \r\nAt 0:53 you can see people still swimming at near beach.",
+ 'uploader': 'bony333',
+ 'title': 'Crazy Hungarian tourist films close call waterspout in Croatia'
+ }
}]
def _real_extract(self, url):
@@ -85,7 +96,10 @@ class LiveLeakIE(InfoExtractor):
'url': s['file'],
} for i, s in enumerate(sources)]
for i, s in enumerate(sources):
- orig_url = s['file'].replace('.h264_base.mp4', '')
+ # Removing '.h264_*.mp4' gives the raw video, which is essentially
+ # the same video without the LiveLeak logo at the top (see
+ # https://github.com/rg3/youtube-dl/pull/4768)
+ orig_url = re.sub(r'\.h264_.+?\.mp4', '', s['file'])
if s['file'] != orig_url:
formats.append({
'format_id': 'original-%s' % i,
diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py
index 5247c6f58..6d7733e41 100644
--- a/youtube_dl/extractor/livestream.py
+++ b/youtube_dl/extractor/livestream.py
@@ -2,6 +2,7 @@ from __future__ import unicode_literals
import re
import json
+import itertools
from .common import InfoExtractor
from ..compat import (
@@ -20,7 +21,7 @@ from ..utils import (
class LivestreamIE(InfoExtractor):
IE_NAME = 'livestream'
- _VALID_URL = r'https?://new\.livestream\.com/.*?/(?P<event_name>.*?)(/videos/(?P<id>[0-9]+)(?:/player)?)?/?(?:$|[?#])'
+ _VALID_URL = r'https?://(?:new\.)?livestream\.com/.*?/(?P<event_name>.*?)(/videos/(?P<id>[0-9]+)(?:/player)?)?/?(?:$|[?#])'
_TESTS = [{
'url': 'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370',
'md5': '53274c76ba7754fb0e8d072716f2292b',
@@ -37,11 +38,22 @@ class LivestreamIE(InfoExtractor):
'url': 'http://new.livestream.com/tedx/cityenglish',
'info_dict': {
'title': 'TEDCity2.0 (English)',
+ 'id': '2245590',
},
'playlist_mincount': 4,
}, {
+ 'url': 'http://new.livestream.com/chess24/tatasteelchess',
+ 'info_dict': {
+ 'title': 'Tata Steel Chess',
+ 'id': '3705884',
+ },
+ 'playlist_mincount': 60,
+ }, {
'url': 'https://new.livestream.com/accounts/362/events/3557232/videos/67864563/player?autoPlay=false&height=360&mute=false&width=640',
'only_matching': True,
+ }, {
+ 'url': 'http://livestream.com/bsww/concacafbeachsoccercampeonato2015',
+ 'only_matching': True,
}]
def _parse_smil(self, video_id, smil_url):
@@ -116,6 +128,30 @@ class LivestreamIE(InfoExtractor):
'view_count': video_data.get('views'),
}
+ def _extract_event(self, info):
+ event_id = compat_str(info['id'])
+ account = compat_str(info['owner_account_id'])
+ root_url = (
+ 'https://new.livestream.com/api/accounts/{account}/events/{event}/'
+ 'feed.json'.format(account=account, event=event_id))
+
+ def _extract_videos():
+ last_video = None
+ for i in itertools.count(1):
+ if last_video is None:
+ info_url = root_url
+ else:
+ info_url = '{root}?&id={id}&newer=-1&type=video'.format(
+ root=root_url, id=last_video)
+ videos_info = self._download_json(info_url, event_id, 'Downloading page {0}'.format(i))['data']
+ videos_info = [v['data'] for v in videos_info if v['type'] == 'video']
+ if not videos_info:
+ break
+ for v in videos_info:
+ yield self._extract_video_info(v)
+ last_video = videos_info[-1]['id']
+ return self.playlist_result(_extract_videos(), event_id, info['full_name'])
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
@@ -143,13 +179,13 @@ class LivestreamIE(InfoExtractor):
result = result and compat_str(vdata['data']['id']) == vid
return result
- videos = [self._extract_video_info(video_data['data'])
- for video_data in info['feed']['data']
- if is_relevant(video_data, video_id)]
if video_id is None:
# This is an event page:
- return self.playlist_result(videos, info['id'], info['full_name'])
+ return self._extract_event(info)
else:
+ videos = [self._extract_video_info(video_data['data'])
+ for video_data in info['feed']['data']
+ if is_relevant(video_data, video_id)]
if not videos:
raise ExtractorError('Cannot find video %s' % video_id)
return videos[0]
@@ -158,23 +194,19 @@ class LivestreamIE(InfoExtractor):
# The original version of Livestream uses a different system
class LivestreamOriginalIE(InfoExtractor):
IE_NAME = 'livestream:original'
- _VALID_URL = r'''(?x)https?://www\.livestream\.com/
+ _VALID_URL = r'''(?x)https?://original\.livestream\.com/
(?P<user>[^/]+)/(?P<type>video|folder)
(?:\?.*?Id=|/)(?P<id>.*?)(&|$)
'''
_TESTS = [{
- 'url': 'http://www.livestream.com/dealbook/video?clipId=pla_8aa4a3f1-ba15-46a4-893b-902210e138fb',
+ 'url': 'http://original.livestream.com/dealbook/video?clipId=pla_8aa4a3f1-ba15-46a4-893b-902210e138fb',
'info_dict': {
'id': 'pla_8aa4a3f1-ba15-46a4-893b-902210e138fb',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Spark 1 (BitCoin) with Cameron Winklevoss & Tyler Winklevoss of Winklevoss Capital',
},
- 'params': {
- # rtmp
- 'skip_download': True,
- },
}, {
- 'url': 'https://www.livestream.com/newplay/folder?dirId=a07bf706-d0e4-4e75-a747-b021d84f2fd3',
+ 'url': 'https://original.livestream.com/newplay/folder?dirId=a07bf706-d0e4-4e75-a747-b021d84f2fd3',
'info_dict': {
'id': 'a07bf706-d0e4-4e75-a747-b021d84f2fd3',
},
@@ -185,19 +217,17 @@ class LivestreamOriginalIE(InfoExtractor):
api_url = 'http://x{0}x.api.channel.livestream.com/2.0/clipdetails?extendedInfo=true&id={1}'.format(user, video_id)
info = self._download_xml(api_url, video_id)
+ # this url is used on mobile devices
+ stream_url = 'http://x{0}x.api.channel.livestream.com/3.0/getstream.json?id={1}'.format(user, video_id)
+ stream_info = self._download_json(stream_url, video_id)
item = info.find('channel').find('item')
ns = {'media': 'http://search.yahoo.com/mrss'}
thumbnail_url = item.find(xpath_with_ns('media:thumbnail', ns)).attrib['url']
- # Remove the extension and number from the path (like 1.jpg)
- path = self._search_regex(r'(user-files/.+)_.*?\.jpg$', thumbnail_url, 'path')
return {
'id': video_id,
'title': item.find('title').text,
- 'url': 'rtmp://extondemand.livestream.com/ondemand',
- 'play_path': 'trans/dv15/mogulus-{0}'.format(path),
- 'player_url': 'http://static.livestream.com/chromelessPlayer/v21/playerapi.swf?hash=5uetk&v=0803&classid=D27CDB6E-AE6D-11cf-96B8-444553540000&jsEnabled=false&wmode=opaque',
- 'ext': 'flv',
+ 'url': stream_info['progressiveUrl'],
'thumbnail': thumbnail_url,
}
diff --git a/youtube_dl/extractor/lrt.py b/youtube_dl/extractor/lrt.py
index 9c2fbdd96..e3236f7b5 100644
--- a/youtube_dl/extractor/lrt.py
+++ b/youtube_dl/extractor/lrt.py
@@ -52,6 +52,7 @@ class LRTIE(InfoExtractor):
'url': data['streamer'],
'play_path': 'mp4:%s' % data['file'],
'preference': -1,
+ 'rtmp_real_time': True,
})
else:
formats.extend(
diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py
index 762cefa34..d4e1ae99d 100644
--- a/youtube_dl/extractor/lynda.py
+++ b/youtube_dl/extractor/lynda.py
@@ -3,32 +3,104 @@ from __future__ import unicode_literals
import re
import json
-from .subtitles import SubtitlesInfoExtractor
from .common import InfoExtractor
from ..compat import (
compat_str,
compat_urllib_parse,
- compat_urllib_request,
)
from ..utils import (
ExtractorError,
+ clean_html,
int_or_none,
+ sanitized_Request,
)
-class LyndaIE(SubtitlesInfoExtractor):
+class LyndaBaseIE(InfoExtractor):
+ _LOGIN_URL = 'https://www.lynda.com/login/login.aspx'
+ _ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.'
+ _NETRC_MACHINE = 'lynda'
+
+ def _real_initialize(self):
+ self._login()
+
+ def _login(self):
+ username, password = self._get_login_info()
+ if username is None:
+ return
+
+ login_form = {
+ 'username': username.encode('utf-8'),
+ 'password': password.encode('utf-8'),
+ 'remember': 'false',
+ 'stayPut': 'false'
+ }
+ request = sanitized_Request(
+ self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
+ login_page = self._download_webpage(
+ request, None, 'Logging in as %s' % username)
+
+ # Not (yet) logged in
+ m = re.search(r'loginResultJson\s*=\s*\'(?P<json>[^\']+)\';', login_page)
+ if m is not None:
+ response = m.group('json')
+ response_json = json.loads(response)
+ state = response_json['state']
+
+ if state == 'notlogged':
+ raise ExtractorError(
+ 'Unable to login, incorrect username and/or password',
+ expected=True)
+
+ # This is when we get popup:
+ # > You're already logged in to lynda.com on two devices.
+ # > If you log in here, we'll log you out of another device.
+ # So, we need to confirm this.
+ if state == 'conflicted':
+ confirm_form = {
+ 'username': '',
+ 'password': '',
+ 'resolve': 'true',
+ 'remember': 'false',
+ 'stayPut': 'false',
+ }
+ request = sanitized_Request(
+ self._LOGIN_URL, compat_urllib_parse.urlencode(confirm_form).encode('utf-8'))
+ login_page = self._download_webpage(
+ request, None,
+ 'Confirming log in and log out from another device')
+
+ if all(not re.search(p, login_page) for p in ('isLoggedIn\s*:\s*true', r'logout\.aspx', r'>Log out<')):
+ if 'login error' in login_page:
+ mobj = re.search(
+ r'(?s)<h1[^>]+class="topmost">(?P<title>[^<]+)</h1>\s*<div>(?P<description>.+?)</div>',
+ login_page)
+ if mobj:
+ raise ExtractorError(
+ 'lynda returned error: %s - %s'
+ % (mobj.group('title'), clean_html(mobj.group('description'))),
+ expected=True)
+ raise ExtractorError('Unable to log in')
+
+ def _logout(self):
+ username, _ = self._get_login_info()
+ if username is None:
+ return
+
+ self._download_webpage(
+ 'http://www.lynda.com/ajax/logout.aspx', None,
+ 'Logging out', 'Unable to log out', fatal=False)
+
+
+class LyndaIE(LyndaBaseIE):
IE_NAME = 'lynda'
IE_DESC = 'lynda.com videos'
- _VALID_URL = r'https?://www\.lynda\.com/[^/]+/[^/]+/\d+/(\d+)-\d\.html'
- _LOGIN_URL = 'https://www.lynda.com/login/login.aspx'
+ _VALID_URL = r'https?://www\.lynda\.com/(?:[^/]+/[^/]+/\d+|player/embed)/(?P<id>\d+)'
_NETRC_MACHINE = 'lynda'
- _SUCCESSFUL_LOGIN_REGEX = r'isLoggedIn: true'
_TIMECODE_REGEX = r'\[(?P<timecode>\d+:\d+:\d+[\.,]\d+)\]'
- ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.'
-
- _TEST = {
+ _TESTS = [{
'url': 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html',
'md5': 'ecfc6862da89489161fb9cd5f5a6fac1',
'info_dict': {
@@ -37,62 +109,55 @@ class LyndaIE(SubtitlesInfoExtractor):
'title': 'Using the exercise files',
'duration': 68
}
- }
-
- def _real_initialize(self):
- self._login()
+ }, {
+ 'url': 'https://www.lynda.com/player/embed/133770?tr=foo=1;bar=g;fizz=rt&fs=0',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group(1)
+ video_id = self._match_id(url)
- page = self._download_webpage('http://www.lynda.com/ajax/player?videoId=%s&type=video' % video_id, video_id,
- 'Downloading video JSON')
- video_json = json.loads(page)
+ video = self._download_json(
+ 'http://www.lynda.com/ajax/player?videoId=%s&type=video' % video_id,
+ video_id, 'Downloading video JSON')
- if 'Status' in video_json:
- raise ExtractorError('lynda returned error: %s' % video_json['Message'], expected=True)
-
- if video_json['HasAccess'] is False:
+ if 'Status' in video:
raise ExtractorError(
- 'Video %s is only available for members. ' % video_id + self.ACCOUNT_CREDENTIALS_HINT, expected=True)
+ 'lynda returned error: %s' % video['Message'], expected=True)
+
+ if video.get('HasAccess') is False:
+ self.raise_login_required('Video %s is only available for members' % video_id)
- video_id = compat_str(video_json['ID'])
- duration = video_json['DurationInSeconds']
- title = video_json['Title']
+ video_id = compat_str(video.get('ID') or video_id)
+ duration = int_or_none(video.get('DurationInSeconds'))
+ title = video['Title']
formats = []
- fmts = video_json.get('Formats')
+ fmts = video.get('Formats')
if fmts:
- formats.extend([
- {
- 'url': fmt['Url'],
- 'ext': fmt['Extension'],
- 'width': fmt['Width'],
- 'height': fmt['Height'],
- 'filesize': fmt['FileSize'],
- 'format_id': str(fmt['Resolution'])
- } for fmt in fmts])
-
- prioritized_streams = video_json.get('PrioritizedStreams')
+ formats.extend([{
+ 'url': f['Url'],
+ 'ext': f.get('Extension'),
+ 'width': int_or_none(f.get('Width')),
+ 'height': int_or_none(f.get('Height')),
+ 'filesize': int_or_none(f.get('FileSize')),
+ 'format_id': compat_str(f.get('Resolution')) if f.get('Resolution') else None,
+ } for f in fmts if f.get('Url')])
+
+ prioritized_streams = video.get('PrioritizedStreams')
if prioritized_streams:
- formats.extend([
- {
+ for prioritized_stream_id, prioritized_stream in prioritized_streams.items():
+ formats.extend([{
'url': video_url,
'width': int_or_none(format_id),
- 'format_id': format_id,
- } for format_id, video_url in prioritized_streams['0'].items()
- ])
+ 'format_id': '%s-%s' % (prioritized_stream_id, format_id),
+ } for format_id, video_url in prioritized_stream.items()])
self._check_formats(formats, video_id)
self._sort_formats(formats)
- if self._downloader.params.get('listsubtitles', False):
- self._list_available_subtitles(video_id, page)
- return
-
- subtitles = self._fix_subtitles(self.extract_subtitles(video_id, page))
+ subtitles = self.extract_subtitles(video_id)
return {
'id': video_id,
@@ -102,83 +167,37 @@ class LyndaIE(SubtitlesInfoExtractor):
'formats': formats
}
- def _login(self):
- (username, password) = self._get_login_info()
- if username is None:
- return
-
- login_form = {
- 'username': username,
- 'password': password,
- 'remember': 'false',
- 'stayPut': 'false'
- }
- request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
- login_page = self._download_webpage(request, None, 'Logging in as %s' % username)
-
- # Not (yet) logged in
- m = re.search(r'loginResultJson = \'(?P<json>[^\']+)\';', login_page)
- if m is not None:
- response = m.group('json')
- response_json = json.loads(response)
- state = response_json['state']
-
- if state == 'notlogged':
- raise ExtractorError('Unable to login, incorrect username and/or password', expected=True)
-
- # This is when we get popup:
- # > You're already logged in to lynda.com on two devices.
- # > If you log in here, we'll log you out of another device.
- # So, we need to confirm this.
- if state == 'conflicted':
- confirm_form = {
- 'username': '',
- 'password': '',
- 'resolve': 'true',
- 'remember': 'false',
- 'stayPut': 'false',
- }
- request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(confirm_form))
- login_page = self._download_webpage(request, None, 'Confirming log in and log out from another device')
-
- if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None:
- raise ExtractorError('Unable to log in')
-
- def _fix_subtitles(self, subtitles):
- if subtitles is None:
- return subtitles # subtitles not requested
-
- fixed_subtitles = {}
- for k, v in subtitles.items():
- subs = json.loads(v)
- if len(subs) == 0:
+ def _fix_subtitles(self, subs):
+ srt = ''
+ seq_counter = 0
+ for pos in range(0, len(subs) - 1):
+ seq_current = subs[pos]
+ m_current = re.match(self._TIMECODE_REGEX, seq_current['Timecode'])
+ if m_current is None:
continue
- srt = ''
- for pos in range(0, len(subs) - 1):
- seq_current = subs[pos]
- m_current = re.match(self._TIMECODE_REGEX, seq_current['Timecode'])
- if m_current is None:
- continue
- seq_next = subs[pos + 1]
- m_next = re.match(self._TIMECODE_REGEX, seq_next['Timecode'])
- if m_next is None:
- continue
- appear_time = m_current.group('timecode')
- disappear_time = m_next.group('timecode')
- text = seq_current['Caption']
- srt += '%s\r\n%s --> %s\r\n%s' % (str(pos), appear_time, disappear_time, text)
- if srt:
- fixed_subtitles[k] = srt
- return fixed_subtitles
-
- def _get_available_subtitles(self, video_id, webpage):
+ seq_next = subs[pos + 1]
+ m_next = re.match(self._TIMECODE_REGEX, seq_next['Timecode'])
+ if m_next is None:
+ continue
+ appear_time = m_current.group('timecode')
+ disappear_time = m_next.group('timecode')
+ text = seq_current['Caption'].strip()
+ if text:
+ seq_counter += 1
+ srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (seq_counter, appear_time, disappear_time, text)
+ if srt:
+ return srt
+
+ def _get_subtitles(self, video_id):
url = 'http://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id
- sub = self._download_webpage(url, None, False)
- sub_json = json.loads(sub)
- return {'en': url} if len(sub_json) > 0 else {}
+ subs = self._download_json(url, None, False)
+ if subs:
+ return {'en': [{'ext': 'srt', 'data': self._fix_subtitles(subs)}]}
+ else:
+ return {}
-class LyndaCourseIE(InfoExtractor):
+class LyndaCourseIE(LyndaBaseIE):
IE_NAME = 'lynda:course'
IE_DESC = 'lynda.com online courses'
@@ -191,37 +210,41 @@ class LyndaCourseIE(InfoExtractor):
course_path = mobj.group('coursepath')
course_id = mobj.group('courseid')
- page = self._download_webpage('http://www.lynda.com/ajax/player?courseId=%s&type=course' % course_id,
- course_id, 'Downloading course JSON')
- course_json = json.loads(page)
+ course = self._download_json(
+ 'http://www.lynda.com/ajax/player?courseId=%s&type=course' % course_id,
+ course_id, 'Downloading course JSON')
- if 'Status' in course_json and course_json['Status'] == 'NotFound':
- raise ExtractorError('Course %s does not exist' % course_id, expected=True)
+ self._logout()
+
+ if course.get('Status') == 'NotFound':
+ raise ExtractorError(
+ 'Course %s does not exist' % course_id, expected=True)
unaccessible_videos = 0
videos = []
- (username, _) = self._get_login_info()
# Might want to extract videos right here from video['Formats'] as it seems 'Formats' is not provided
# by single video API anymore
- for chapter in course_json['Chapters']:
- for video in chapter['Videos']:
- if username is None and video['HasAccess'] is False:
+ for chapter in course['Chapters']:
+ for video in chapter.get('Videos', []):
+ if video.get('HasAccess') is False:
unaccessible_videos += 1
continue
- videos.append(video['ID'])
+ if video.get('ID'):
+ videos.append(video['ID'])
if unaccessible_videos > 0:
- self._downloader.report_warning('%s videos are only available for members and will not be downloaded. '
- % unaccessible_videos + LyndaIE.ACCOUNT_CREDENTIALS_HINT)
+ self._downloader.report_warning(
+ '%s videos are only available for members (or paid members) and will not be downloaded. '
+ % unaccessible_videos + self._ACCOUNT_CREDENTIALS_HINT)
entries = [
- self.url_result('http://www.lynda.com/%s/%s-4.html' %
- (course_path, video_id),
- 'Lynda')
+ self.url_result(
+ 'http://www.lynda.com/%s/%s-4.html' % (course_path, video_id),
+ 'Lynda')
for video_id in videos]
- course_title = course_json['Title']
+ course_title = course.get('Title')
return self.playlist_result(entries, course_id, course_title)
diff --git a/youtube_dl/extractor/mailru.py b/youtube_dl/extractor/mailru.py
index 54a14cb94..ab1300185 100644
--- a/youtube_dl/extractor/mailru.py
+++ b/youtube_dl/extractor/mailru.py
@@ -25,6 +25,7 @@ class MailRuIE(InfoExtractor):
'uploader_id': 'sonypicturesrus@mail.ru',
'duration': 184,
},
+ 'skip': 'Not accessible from Travis CI server',
},
{
'url': 'http://my.mail.ru/corp/hitech/video/news_hi-tech_mail_ru/1263.html',
@@ -39,6 +40,7 @@ class MailRuIE(InfoExtractor):
'uploader_id': 'hitech@corp.mail.ru',
'duration': 245,
},
+ 'skip': 'Not accessible from Travis CI server',
},
]
diff --git a/youtube_dl/extractor/malemotion.py b/youtube_dl/extractor/malemotion.py
index 0b85a59d1..92511a671 100644
--- a/youtube_dl/extractor/malemotion.py
+++ b/youtube_dl/extractor/malemotion.py
@@ -2,9 +2,7 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
-)
+from ..compat import compat_urllib_parse_unquote
class MalemotionIE(InfoExtractor):
@@ -24,7 +22,7 @@ class MalemotionIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- video_url = compat_urllib_parse.unquote(self._search_regex(
+ video_url = compat_urllib_parse_unquote(self._search_regex(
r'<source type="video/mp4" src="(.+?)"', webpage, 'video URL'))
video_title = self._html_search_regex(
r'<title>(.*?)</title', webpage, 'title')
diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py
index 5fdd19027..88334889e 100644
--- a/youtube_dl/extractor/mdr.py
+++ b/youtube_dl/extractor/mdr.py
@@ -1,64 +1,169 @@
+# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+ determine_ext,
+ int_or_none,
+ parse_duration,
+ parse_iso8601,
+ xpath_text,
+)
class MDRIE(InfoExtractor):
- _VALID_URL = r'^(?P<domain>https?://(?:www\.)?mdr\.de)/(?:.*)/(?P<type>video|audio)(?P<video_id>[^/_]+)(?:_|\.html)'
+ IE_DESC = 'MDR.DE and KiKA'
+ _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z]+(?P<id>\d+)(?:_.+?)?\.html'
- # No tests, MDR regularily deletes its videos
- _TEST = {
+ _TESTS = [{
+ # MDR regularily deletes its videos
'url': 'http://www.mdr.de/fakt/video189002.html',
'only_matching': True,
- }
+ }, {
+ # audio
+ 'url': 'http://www.mdr.de/kultur/audio1312272_zc-15948bad_zs-86171fdd.html',
+ 'md5': '64c4ee50f0a791deb9479cd7bbe9d2fa',
+ 'info_dict': {
+ 'id': '1312272',
+ 'ext': 'mp3',
+ 'title': 'Feuilleton vom 30. Oktober 2015',
+ 'duration': 250,
+ 'uploader': 'MITTELDEUTSCHER RUNDFUNK',
+ },
+ }, {
+ 'url': 'http://www.kika.de/baumhaus/videos/video19636.html',
+ 'md5': '4930515e36b06c111213e80d1e4aad0e',
+ 'info_dict': {
+ 'id': '19636',
+ 'ext': 'mp4',
+ 'title': 'Baumhaus vom 30. Oktober 2015',
+ 'duration': 134,
+ 'uploader': 'KIKA',
+ },
+ }, {
+ 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html',
+ 'md5': '5fe9c4dd7d71e3b238f04b8fdd588357',
+ 'info_dict': {
+ 'id': '8182',
+ 'ext': 'mp4',
+ 'title': 'Beutolomäus und der geheime Weihnachtswunsch',
+ 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd',
+ 'timestamp': 1419047100,
+ 'upload_date': '20141220',
+ 'duration': 4628,
+ 'uploader': 'KIKA',
+ },
+ }, {
+ 'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
- m = re.match(self._VALID_URL, url)
- video_id = m.group('video_id')
- domain = m.group('domain')
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ data_url = self._search_regex(
+ r'dataURL\s*:\s*(["\'])(?P<url>/.+/(?:video|audio)[0-9]+-avCustom\.xml)\1',
+ webpage, 'data url', group='url')
- # determine title and media streams from webpage
- html = self._download_webpage(url, video_id)
+ doc = self._download_xml(
+ compat_urlparse.urljoin(url, data_url), video_id)
- title = self._html_search_regex(r'<h[12]>(.*?)</h[12]>', html, 'title')
- xmlurl = self._search_regex(
- r'dataURL:\'(/(?:.+)/(?:video|audio)[0-9]+-avCustom.xml)', html, 'XML URL')
+ title = xpath_text(doc, ['./title', './broadcast/broadcastName'], 'title', fatal=True)
- doc = self._download_xml(domain + xmlurl, video_id)
formats = []
- for a in doc.findall('./assets/asset'):
- url_el = a.find('.//progressiveDownloadUrl')
- if url_el is None:
- continue
- abr = int(a.find('bitrateAudio').text) // 1000
- media_type = a.find('mediaType').text
- format = {
- 'abr': abr,
- 'filesize': int(a.find('fileSize').text),
- 'url': url_el.text,
- }
-
- vbr_el = a.find('bitrateVideo')
- if vbr_el is None:
- format.update({
- 'vcodec': 'none',
- 'format_id': '%s-%d' % (media_type, abr),
- })
- else:
- vbr = int(vbr_el.text) // 1000
- format.update({
- 'vbr': vbr,
- 'width': int(a.find('frameWidth').text),
- 'height': int(a.find('frameHeight').text),
- 'format_id': '%s-%d' % (media_type, vbr),
- })
- formats.append(format)
+ processed_urls = []
+ for asset in doc.findall('./assets/asset'):
+ for source in (
+ 'progressiveDownload',
+ 'dynamicHttpStreamingRedirector',
+ 'adaptiveHttpStreamingRedirector'):
+ url_el = asset.find('./%sUrl' % source)
+ if url_el is None:
+ continue
+
+ video_url = url_el.text
+ if video_url in processed_urls:
+ continue
+
+ processed_urls.append(video_url)
+
+ vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000)
+ abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000)
+
+ ext = determine_ext(url_el.text)
+ if ext == 'm3u8':
+ url_formats = self._extract_m3u8_formats(
+ video_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ preference=0, m3u8_id='HLS', fatal=False)
+ elif ext == 'f4m':
+ url_formats = self._extract_f4m_formats(
+ video_url + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id,
+ preference=0, f4m_id='HDS', fatal=False)
+ else:
+ media_type = xpath_text(asset, './mediaType', 'media type', default='MP4')
+ vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000)
+ abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000)
+ filesize = int_or_none(xpath_text(asset, './fileSize', 'file size'))
+
+ f = {
+ 'url': video_url,
+ 'format_id': '%s-%d' % (media_type, vbr or abr),
+ 'filesize': filesize,
+ 'abr': abr,
+ 'preference': 1,
+ }
+
+ if vbr:
+ width = int_or_none(xpath_text(asset, './frameWidth', 'width'))
+ height = int_or_none(xpath_text(asset, './frameHeight', 'height'))
+ f.update({
+ 'vbr': vbr,
+ 'width': width,
+ 'height': height,
+ })
+
+ url_formats = [f]
+
+ if not url_formats:
+ continue
+
+ if not vbr:
+ for f in url_formats:
+ abr = f.get('tbr') or abr
+ if 'tbr' in f:
+ del f['tbr']
+ f.update({
+ 'abr': abr,
+ 'vcodec': 'none',
+ })
+
+ formats.extend(url_formats)
+
self._sort_formats(formats)
+ description = xpath_text(doc, './broadcast/broadcastDescription', 'description')
+ timestamp = parse_iso8601(
+ xpath_text(
+ doc, [
+ './broadcast/broadcastDate',
+ './broadcast/broadcastStartDate',
+ './broadcast/broadcastEndDate'],
+ 'timestamp', default=None))
+ duration = parse_duration(xpath_text(doc, './duration', 'duration'))
+ uploader = xpath_text(doc, './rights', 'uploader')
+
return {
'id': video_id,
'title': title,
+ 'description': description,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ 'uploader': uploader,
'formats': formats,
}
diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py
index 8bc333b02..3c786a36d 100644
--- a/youtube_dl/extractor/metacafe.py
+++ b/youtube_dl/extractor/metacafe.py
@@ -6,12 +6,13 @@ from .common import InfoExtractor
from ..compat import (
compat_parse_qs,
compat_urllib_parse,
- compat_urllib_request,
+ compat_urllib_parse_unquote,
)
from ..utils import (
determine_ext,
ExtractorError,
int_or_none,
+ sanitized_Request,
)
@@ -116,7 +117,7 @@ class MetacafeIE(InfoExtractor):
'filters': '0',
'submit': "Continue - I'm over 18",
}
- request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
+ request = sanitized_Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
self.report_age_confirmation()
self._download_webpage(request, None, False, 'Unable to confirm age')
@@ -141,7 +142,7 @@ class MetacafeIE(InfoExtractor):
return self.url_result('theplatform:%s' % ext_id, 'ThePlatform')
# Retrieve video webpage to extract further information
- req = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
+ req = sanitized_Request('http://www.metacafe.com/watch/%s/' % video_id)
# AnyClip videos require the flashversion cookie so that we get the link
# to the mp4 file
@@ -155,7 +156,7 @@ class MetacafeIE(InfoExtractor):
video_url = None
mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
if mobj is not None:
- mediaURL = compat_urllib_parse.unquote(mobj.group(1))
+ mediaURL = compat_urllib_parse_unquote(mobj.group(1))
video_ext = mediaURL[-3:]
# Extract gdaKey if available
diff --git a/youtube_dl/extractor/minhateca.py b/youtube_dl/extractor/minhateca.py
index 14934b7ec..e46b23a6f 100644
--- a/youtube_dl/extractor/minhateca.py
+++ b/youtube_dl/extractor/minhateca.py
@@ -2,14 +2,12 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
- compat_urllib_request,
-)
+from ..compat import compat_urllib_parse
from ..utils import (
int_or_none,
parse_duration,
parse_filesize,
+ sanitized_Request,
)
@@ -39,7 +37,7 @@ class MinhatecaIE(InfoExtractor):
('fileId', video_id),
('__RequestVerificationToken', token),
]
- req = compat_urllib_request.Request(
+ req = sanitized_Request(
'http://minhateca.com.br/action/License/Download',
data=compat_urllib_parse.urlencode(token_data))
req.add_header('Content-Type', 'application/x-www-form-urlencoded')
diff --git a/youtube_dl/extractor/miomio.py b/youtube_dl/extractor/miomio.py
new file mode 100644
index 000000000..170ebd9eb
--- /dev/null
+++ b/youtube_dl/extractor/miomio.py
@@ -0,0 +1,107 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import random
+
+from .common import InfoExtractor
+from ..utils import (
+ xpath_text,
+ int_or_none,
+ ExtractorError,
+ sanitized_Request,
+)
+
+
+class MioMioIE(InfoExtractor):
+ IE_NAME = 'miomio.tv'
+ _VALID_URL = r'https?://(?:www\.)?miomio\.tv/watch/cc(?P<id>[0-9]+)'
+ _TESTS = [{
+ # "type=video" in flashvars
+ 'url': 'http://www.miomio.tv/watch/cc88912/',
+ 'md5': '317a5f7f6b544ce8419b784ca8edae65',
+ 'info_dict': {
+ 'id': '88912',
+ 'ext': 'flv',
+ 'title': '【SKY】字幕 铠武昭和VS平成 假面骑士大战FEAT战队 魔星字幕组 字幕',
+ 'duration': 5923,
+ },
+ }, {
+ 'url': 'http://www.miomio.tv/watch/cc184024/',
+ 'info_dict': {
+ 'id': '43729',
+ 'title': '《动漫同人插画绘制》',
+ },
+ 'playlist_mincount': 86,
+ 'skip': 'This video takes time too long for retrieving the URL',
+ }, {
+ 'url': 'http://www.miomio.tv/watch/cc173113/',
+ 'info_dict': {
+ 'id': '173113',
+ 'title': 'The New Macbook 2015 上手试玩与简评'
+ },
+ 'playlist_mincount': 2,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._html_search_meta(
+ 'description', webpage, 'title', fatal=True)
+
+ mioplayer_path = self._search_regex(
+ r'src="(/mioplayer/[^"]+)"', webpage, 'ref_path')
+
+ http_headers = {'Referer': 'http://www.miomio.tv%s' % mioplayer_path}
+
+ xml_config = self._search_regex(
+ r'flashvars="type=(?:sina|video)&amp;(.+?)&amp;',
+ webpage, 'xml config')
+
+ # skipping the following page causes lags and eventually connection drop-outs
+ self._request_webpage(
+ 'http://www.miomio.tv/mioplayer/mioplayerconfigfiles/xml.php?id=%s&r=%s' % (id, random.randint(100, 999)),
+ video_id)
+
+ vid_config_request = sanitized_Request(
+ 'http://www.miomio.tv/mioplayer/mioplayerconfigfiles/sina.php?{0}'.format(xml_config),
+ headers=http_headers)
+
+ # the following xml contains the actual configuration information on the video file(s)
+ vid_config = self._download_xml(vid_config_request, video_id)
+
+ if not int_or_none(xpath_text(vid_config, 'timelength')):
+ raise ExtractorError('Unable to load videos!', expected=True)
+
+ entries = []
+ for f in vid_config.findall('./durl'):
+ segment_url = xpath_text(f, 'url', 'video url')
+ if not segment_url:
+ continue
+ order = xpath_text(f, 'order', 'order')
+ segment_id = video_id
+ segment_title = title
+ if order:
+ segment_id += '-%s' % order
+ segment_title += ' part %s' % order
+ entries.append({
+ 'id': segment_id,
+ 'url': segment_url,
+ 'title': segment_title,
+ 'duration': int_or_none(xpath_text(f, 'length', 'duration'), 1000),
+ 'http_headers': http_headers,
+ })
+
+ if len(entries) == 1:
+ segment = entries[0]
+ segment['id'] = video_id
+ segment['title'] = title
+ return segment
+
+ return {
+ '_type': 'multi_video',
+ 'id': video_id,
+ 'entries': entries,
+ 'title': title,
+ 'http_headers': http_headers,
+ }
diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py
index 3c61a850f..29ca45778 100644
--- a/youtube_dl/extractor/mit.py
+++ b/youtube_dl/extractor/mit.py
@@ -5,9 +5,6 @@ import json
from .common import InfoExtractor
from .youtube import YoutubeIE
-from ..compat import (
- compat_urlparse,
-)
from ..utils import (
clean_html,
ExtractorError,
@@ -21,12 +18,12 @@ class TechTVMITIE(InfoExtractor):
_TEST = {
'url': 'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set',
- 'md5': '1f8cb3e170d41fd74add04d3c9330e5f',
+ 'md5': '00a3a27ee20d44bcaa0933ccec4a2cf7',
'info_dict': {
'id': '25418',
'ext': 'mp4',
- 'title': 'MIT DNA Learning Center Set',
- 'description': 'md5:82313335e8a8a3f243351ba55bc1b474',
+ 'title': 'MIT DNA and Protein Sets',
+ 'description': 'md5:46f5c69ce434f0a97e7c628cc142802d',
},
}
@@ -36,8 +33,8 @@ class TechTVMITIE(InfoExtractor):
'http://techtv.mit.edu/videos/%s' % video_id, video_id)
clean_page = re.compile(r'<!--.*?-->', re.S).sub('', raw_page)
- base_url = self._search_regex(
- r'ipadUrl: \'(.+?cloudfront.net/)', raw_page, 'base url')
+ base_url = self._proto_relative_url(self._search_regex(
+ r'ipadUrl: \'(.+?cloudfront.net/)', raw_page, 'base url'), 'http:')
formats_json = self._search_regex(
r'bitrates: (\[.+?\])', raw_page, 'video formats')
formats_mit = json.loads(formats_json)
@@ -89,7 +86,7 @@ class MITIE(TechTVMITIE):
webpage = self._download_webpage(url, page_title)
embed_url = self._search_regex(
r'<iframe .*?src="(.+?)"', webpage, 'embed url')
- return self.url_result(embed_url, ie='TechTVMIT')
+ return self.url_result(embed_url)
class OCWMITIE(InfoExtractor):
@@ -108,7 +105,6 @@ class OCWMITIE(InfoExtractor):
'upload_date': '20121109',
'uploader_id': 'MIT',
'uploader': 'MIT OpenCourseWare',
- # 'subtitles': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/MIT6_041F11_lec07_300k.mp4.srt'
}
},
{
@@ -121,7 +117,6 @@ class OCWMITIE(InfoExtractor):
'uploader_id': 'MIT',
'uploader': 'MIT OpenCourseWare',
'description': 'This section contains lecture video excerpts, lecture notes, an interactive mathlet with supporting documents, and problem solving videos.',
- # 'subtitles': 'http://ocw.mit.edu//courses/mathematics/18-01sc-single-variable-calculus-fall-2010/ocw-18.01-f07-lec01_300k.SRT'
}
}
]
@@ -140,7 +135,6 @@ class OCWMITIE(InfoExtractor):
metadata = re.sub(r'[\'"]', '', embed_chapter_media.group(1))
metadata = re.split(r', ?', metadata)
yt = metadata[1]
- subs = compat_urlparse.urljoin(self._BASE_URL, metadata[7])
else:
# search for call to ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, captions_file)
embed_media = re.search(r'ocw_embed_media\((.+?)\)', webpage)
@@ -148,7 +142,6 @@ class OCWMITIE(InfoExtractor):
metadata = re.sub(r'[\'"]', '', embed_media.group(1))
metadata = re.split(r', ?', metadata)
yt = metadata[1]
- subs = compat_urlparse.urljoin(self._BASE_URL, metadata[5])
else:
raise ExtractorError('Unable to find embedded YouTube video.')
video_id = YoutubeIE.extract_id(yt)
@@ -159,7 +152,5 @@ class OCWMITIE(InfoExtractor):
'title': title,
'description': description,
'url': yt,
- 'url_transparent'
- 'subtitles': subs,
'ie_key': 'Youtube',
}
diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py
index 256758323..c595f2077 100644
--- a/youtube_dl/extractor/mitele.py
+++ b/youtube_dl/extractor/mitele.py
@@ -1,68 +1,89 @@
from __future__ import unicode_literals
-import json
-
from .common import InfoExtractor
from ..compat import (
compat_urllib_parse,
compat_urlparse,
)
from ..utils import (
+ encode_dict,
get_element_by_attribute,
- parse_duration,
- strip_jsonp,
+ int_or_none,
)
class MiTeleIE(InfoExtractor):
- IE_NAME = 'mitele.es'
+ IE_DESC = 'mitele.es'
_VALID_URL = r'http://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<id>[^/]+)/'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/',
- 'md5': '6a75fe9d0d3275bead0cb683c616fddb',
+ 'md5': '0ff1a13aebb35d9bc14081ff633dd324',
'info_dict': {
- 'id': '0fce117d',
- 'ext': 'mp4',
- 'title': 'Programa 144 - Tor, la web invisible',
- 'description': 'md5:3b6fce7eaa41b2d97358726378d9369f',
+ 'id': '0NF1jJnxS1Wu3pHrmvFyw2',
'display_id': 'programa-144',
+ 'ext': 'flv',
+ 'title': 'Tor, la web invisible',
+ 'description': 'md5:3b6fce7eaa41b2d97358726378d9369f',
+ 'thumbnail': 're:(?i)^https?://.*\.jpg$',
'duration': 2913,
},
- }
+ }]
def _real_extract(self, url):
- episode = self._match_id(url)
- webpage = self._download_webpage(url, episode)
- embed_data_json = self._search_regex(
- r'(?s)MSV\.embedData\[.*?\]\s*=\s*({.*?});', webpage, 'embed data',
- ).replace('\'', '"')
- embed_data = json.loads(embed_data_json)
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+
+ config_url = self._search_regex(
+ r'data-config\s*=\s*"([^"]+)"', webpage, 'data config url')
+ config_url = compat_urlparse.urljoin(url, config_url)
+
+ config = self._download_json(
+ config_url, display_id, 'Downloading config JSON')
+
+ mmc = self._download_json(
+ config['services']['mmc'], display_id, 'Downloading mmc JSON')
+
+ formats = []
+ for location in mmc['locations']:
+ gat = self._proto_relative_url(location.get('gat'), 'http:')
+ bas = location.get('bas')
+ loc = location.get('loc')
+ ogn = location.get('ogn')
+ if None in (gat, bas, loc, ogn):
+ continue
+ token_data = {
+ 'bas': bas,
+ 'icd': loc,
+ 'ogn': ogn,
+ 'sta': '0',
+ }
+ media = self._download_json(
+ '%s/?%s' % (gat, compat_urllib_parse.urlencode(encode_dict(token_data))),
+ display_id, 'Downloading %s JSON' % location['loc'])
+ file_ = media.get('file')
+ if not file_:
+ continue
+ formats.extend(self._extract_f4m_formats(
+ file_ + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
+ display_id, f4m_id=loc))
- domain = embed_data['mediaUrl']
- if not domain.startswith('http'):
- # only happens in telecinco.es videos
- domain = 'http://' + domain
- info_url = compat_urlparse.urljoin(
- domain,
- compat_urllib_parse.unquote(embed_data['flashvars']['host'])
- )
- info_el = self._download_xml(info_url, episode).find('./video/info')
+ title = self._search_regex(
+ r'class="Destacado-text"[^>]*>\s*<strong>([^<]+)</strong>', webpage, 'title')
- video_link = info_el.find('videoUrl/link').text
- token_query = compat_urllib_parse.urlencode({'id': video_link})
- token_info = self._download_json(
- embed_data['flashvars']['ov_tk'] + '?' + token_query,
- episode,
- transform_source=strip_jsonp
- )
+ video_id = self._search_regex(
+ r'data-media-id\s*=\s*"([^"]+)"', webpage,
+ 'data media id', default=None) or display_id
+ thumbnail = config.get('poster', {}).get('imageUrl')
+ duration = int_or_none(mmc.get('duration'))
return {
- 'id': embed_data['videoId'],
- 'display_id': episode,
- 'title': info_el.find('title').text,
- 'url': token_info['tokenizedUrl'],
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
'description': get_element_by_attribute('class', 'text', webpage),
- 'thumbnail': info_el.find('thumb').text,
- 'duration': parse_duration(info_el.find('duration').text),
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'formats': formats,
}
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index 1831c6749..d47aeceda 100644
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -3,14 +3,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
-)
+from ..compat import compat_urllib_parse_unquote
from ..utils import (
ExtractorError,
HEADRequest,
str_to_int,
- parse_iso8601,
)
@@ -27,8 +24,6 @@ class MixcloudIE(InfoExtractor):
'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',
'uploader': 'Daniel Holbach',
'uploader_id': 'dholbach',
- 'upload_date': '20111115',
- 'timestamp': 1321359578,
'thumbnail': 're:https?://.*\.jpg',
'view_count': int,
'like_count': int,
@@ -37,55 +32,46 @@ class MixcloudIE(InfoExtractor):
'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/',
'info_dict': {
'id': 'gillespeterson-caribou-7-inch-vinyl-mix-chat',
- 'ext': 'm4a',
- 'title': 'Electric Relaxation vol. 3',
+ 'ext': 'mp3',
+ 'title': 'Caribou 7 inch Vinyl Mix & Chat',
'description': 'md5:2b8aec6adce69f9d41724647c65875e8',
- 'uploader': 'Daniel Drumz',
+ 'uploader': 'Gilles Peterson Worldwide',
'uploader_id': 'gillespeterson',
- 'thumbnail': 're:https?://.*\.jpg',
+ 'thumbnail': 're:https?://.*/images/',
'view_count': int,
'like_count': int,
},
}]
- def _get_url(self, track_id, template_url):
- server_count = 30
- for i in range(server_count):
- url = template_url % i
- try:
- # We only want to know if the request succeed
- # don't download the whole file
- self._request_webpage(
- HEADRequest(url), track_id,
- 'Checking URL %d/%d ...' % (i + 1, server_count + 1))
- return url
- except ExtractorError:
- pass
-
- return None
+ def _check_url(self, url, track_id, ext):
+ try:
+ # We only want to know if the request succeed
+ # don't download the whole file
+ self._request_webpage(
+ HEADRequest(url), track_id,
+ 'Trying %s URL' % ext)
+ return True
+ except ExtractorError:
+ return False
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
uploader = mobj.group(1)
cloudcast_name = mobj.group(2)
- track_id = compat_urllib_parse.unquote('-'.join((uploader, cloudcast_name)))
+ track_id = compat_urllib_parse_unquote('-'.join((uploader, cloudcast_name)))
webpage = self._download_webpage(url, track_id)
preview_url = self._search_regex(
r'\s(?:data-preview-url|m-preview)="([^"]+)"', webpage, 'preview url')
song_url = preview_url.replace('/previews/', '/c/originals/')
- template_url = re.sub(r'(stream\d*)', 'stream%d', song_url)
- final_song_url = self._get_url(track_id, template_url)
- if final_song_url is None:
- self.to_screen('Trying with m4a extension')
- template_url = template_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/')
- final_song_url = self._get_url(track_id, template_url)
- if final_song_url is None:
- raise ExtractorError('Unable to extract track url')
+ if not self._check_url(song_url, track_id, 'mp3'):
+ song_url = song_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/')
+ if not self._check_url(song_url, track_id, 'm4a'):
+ raise ExtractorError('Unable to extract track url')
PREFIX = (
- r'<span class="play-button[^"]*?"'
+ r'm-play-on-spacebar[^>]+'
r'(?:\s+[a-zA-Z0-9-]+(?:="[^"]+")?)*?\s+')
title = self._html_search_regex(
PREFIX + r'm-title="([^"]+)"', webpage, 'title')
@@ -99,26 +85,21 @@ class MixcloudIE(InfoExtractor):
r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False)
description = self._og_search_description(webpage)
like_count = str_to_int(self._search_regex(
- [r'<meta itemprop="interactionCount" content="UserLikes:([0-9]+)"',
- r'/favorites/?">([0-9]+)<'],
+ r'\bbutton-favorite\b[^>]+m-ajax-toggle-count="([^"]+)"',
webpage, 'like count', fatal=False))
view_count = str_to_int(self._search_regex(
[r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"',
r'/listeners/?">([0-9,.]+)</a>'],
webpage, 'play count', fatal=False))
- timestamp = parse_iso8601(self._search_regex(
- r'<time itemprop="dateCreated" datetime="([^"]+)">',
- webpage, 'upload date', default=None))
return {
'id': track_id,
'title': title,
- 'url': final_song_url,
+ 'url': song_url,
'description': description,
'thumbnail': thumbnail,
'uploader': uploader,
'uploader_id': uploader_id,
- 'timestamp': timestamp,
'view_count': view_count,
'like_count': like_count,
}
diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py
index 1a241aca7..e242b897f 100644
--- a/youtube_dl/extractor/mlb.py
+++ b/youtube_dl/extractor/mlb.py
@@ -10,7 +10,21 @@ from ..utils import (
class MLBIE(InfoExtractor):
- _VALID_URL = r'https?://m(?:lb)?\.mlb\.com/(?:(?:.*?/)?video/(?:topic/[\da-z_-]+/)?v|(?:shared/video/embed/embed\.html|[^/]+/video/play\.jsp)\?.*?\bcontent_id=)(?P<id>n?\d+)'
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:[\da-z_-]+\.)*mlb\.com/
+ (?:
+ (?:
+ (?:.*?/)?video/(?:topic/[\da-z_-]+/)?v|
+ (?:
+ shared/video/embed/(?:embed|m-internal-embed)\.html|
+ (?:[^/]+/)+(?:play|index)\.jsp|
+ )\?.*?\bcontent_id=
+ )
+ (?P<id>n?\d+)|
+ (?:[^/]+/)*(?P<path>[^/]+)
+ )
+ '''
_TESTS = [
{
'url': 'http://m.mlb.com/sea/video/topic/51231442/v34698933/nymsea-ackley-robs-a-home-run-with-an-amazing-catch/?c_id=sea',
@@ -69,6 +83,18 @@ class MLBIE(InfoExtractor):
},
},
{
+ 'url': 'http://m.mlb.com/news/article/118550098/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer',
+ 'md5': 'b190e70141fb9a1552a85426b4da1b5d',
+ 'info_dict': {
+ 'id': '75609783',
+ 'ext': 'mp4',
+ 'title': 'Must C: Pillar climbs for catch',
+ 'description': '4/15/15: Blue Jays outfielder Kevin Pillar continues his defensive dominance by climbing the wall in left to rob Tim Beckham of a home run',
+ 'timestamp': 1429124820,
+ 'upload_date': '20150415',
+ }
+ },
+ {
'url': 'http://m.mlb.com/shared/video/embed/embed.html?content_id=35692085&topic_id=6479266&width=400&height=224&property=mlb',
'only_matching': True,
},
@@ -80,12 +106,31 @@ class MLBIE(InfoExtractor):
'url': 'http://mlb.mlb.com/es/video/play.jsp?content_id=36599553',
'only_matching': True,
},
+ {
+ 'url': 'http://m.cardinals.mlb.com/stl/video/v51175783/atlstl-piscotty-makes-great-sliding-catch-on-line/?partnerId=as_mlb_20150321_42500876&adbid=579409712979910656&adbpl=tw&adbpr=52847728',
+ 'only_matching': True,
+ },
+ {
+ # From http://m.mlb.com/news/article/118550098/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer
+ 'url': 'http://mlb.mlb.com/shared/video/embed/m-internal-embed.html?content_id=75609783&property=mlb&autoplay=true&hashmode=false&siteSection=mlb/multimedia/article_118550098/article_embed&club=mlb',
+ 'only_matching': True,
+ },
+ {
+ 'url': 'http://washington.nationals.mlb.com/mlb/gameday/index.jsp?c_id=was&gid=2015_05_09_atlmlb_wasmlb_1&lang=en&content_id=108309983&mode=video#',
+ 'only_matching': True,
+ }
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
+ if not video_id:
+ video_path = mobj.group('path')
+ webpage = self._download_webpage(url, video_path)
+ video_id = self._search_regex(
+ [r'data-video-?id="(\d+)"', r'content_id=(\d+)'], webpage, 'video id')
+
detail = self._download_xml(
'http://m.mlb.com/gen/multimedia/detail/%s/%s/%s/%s.xml'
% (video_id[-3], video_id[-2], video_id[-1], video_id), video_id)
diff --git a/youtube_dl/extractor/moevideo.py b/youtube_dl/extractor/moevideo.py
index 5a66302f6..d930b9634 100644
--- a/youtube_dl/extractor/moevideo.py
+++ b/youtube_dl/extractor/moevideo.py
@@ -5,13 +5,11 @@ import json
import re
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
- compat_urllib_request,
-)
+from ..compat import compat_urllib_parse
from ..utils import (
ExtractorError,
int_or_none,
+ sanitized_Request,
)
@@ -80,7 +78,7 @@ class MoeVideoIE(InfoExtractor):
]
r_json = json.dumps(r)
post = compat_urllib_parse.urlencode({'r': r_json})
- req = compat_urllib_request.Request(self._API_URL, post)
+ req = sanitized_Request(self._API_URL, post)
req.add_header('Content-type', 'application/x-www-form-urlencoded')
response = self._download_json(req, video_id)
diff --git a/youtube_dl/extractor/mofosex.py b/youtube_dl/extractor/mofosex.py
index 2cec12d35..f8226cbb2 100644
--- a/youtube_dl/extractor/mofosex.py
+++ b/youtube_dl/extractor/mofosex.py
@@ -5,10 +5,10 @@ import re
from .common import InfoExtractor
from ..compat import (
+ compat_urllib_parse_unquote,
compat_urllib_parse_urlparse,
- compat_urllib_request,
- compat_urllib_parse,
)
+from ..utils import sanitized_Request
class MofosexIE(InfoExtractor):
@@ -29,12 +29,12 @@ class MofosexIE(InfoExtractor):
video_id = mobj.group('id')
url = 'http://www.' + mobj.group('url')
- req = compat_urllib_request.Request(url)
+ req = sanitized_Request(url)
req.add_header('Cookie', 'age_verified=1')
webpage = self._download_webpage(req, video_id)
video_title = self._html_search_regex(r'<h1>(.+?)<', webpage, 'title')
- video_url = compat_urllib_parse.unquote(self._html_search_regex(r'flashvars.video_url = \'([^\']+)', webpage, 'video_url'))
+ video_url = compat_urllib_parse_unquote(self._html_search_regex(r'flashvars.video_url = \'([^\']+)', webpage, 'video_url'))
path = compat_urllib_parse_urlparse(video_url).path
extension = os.path.splitext(path)[1][1:]
format = path.split('/')[5].split('_')[:2]
diff --git a/youtube_dl/extractor/moniker.py b/youtube_dl/extractor/moniker.py
index 5de719bdc..f6bf94f2f 100644
--- a/youtube_dl/extractor/moniker.py
+++ b/youtube_dl/extractor/moniker.py
@@ -5,15 +5,17 @@ import os.path
import re
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
- compat_urllib_request,
+from ..compat import compat_urllib_parse
+from ..utils import (
+ ExtractorError,
+ remove_start,
+ sanitized_Request,
)
class MonikerIE(InfoExtractor):
IE_DESC = 'allmyvideos.net and vidspot.net'
- _VALID_URL = r'https?://(?:www\.)?(?:allmyvideos|vidspot)\.net/(?P<id>[a-zA-Z0-9_-]+)'
+ _VALID_URL = r'https?://(?:www\.)?(?:allmyvideos|vidspot)\.net/(?:(?:2|v)/v-)?(?P<id>[a-zA-Z0-9_-]+)'
_TESTS = [{
'url': 'http://allmyvideos.net/jih3nce3x6wn',
@@ -24,6 +26,14 @@ class MonikerIE(InfoExtractor):
'title': 'youtube-dl test video',
},
}, {
+ 'url': 'http://allmyvideos.net/embed-jih3nce3x6wn',
+ 'md5': '710883dee1bfc370ecf9fa6a89307c88',
+ 'info_dict': {
+ 'id': 'jih3nce3x6wn',
+ 'ext': 'mp4',
+ 'title': 'youtube-dl test video',
+ },
+ }, {
'url': 'http://vidspot.net/l2ngsmhs8ci5',
'md5': '710883dee1bfc370ecf9fa6a89307c88',
'info_dict': {
@@ -34,24 +44,60 @@ class MonikerIE(InfoExtractor):
}, {
'url': 'https://www.vidspot.net/l2ngsmhs8ci5',
'only_matching': True,
+ }, {
+ 'url': 'http://vidspot.net/2/v-ywDf99',
+ 'md5': '5f8254ce12df30479428b0152fb8e7ba',
+ 'info_dict': {
+ 'id': 'ywDf99',
+ 'ext': 'mp4',
+ 'title': 'IL FAIT LE MALIN EN PORSHE CAYENNE ( mais pas pour longtemps)',
+ 'description': 'IL FAIT LE MALIN EN PORSHE CAYENNE.',
+ },
+ }, {
+ 'url': 'http://allmyvideos.net/v/v-HXZm5t',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- video_id = self._match_id(url)
+ orig_video_id = self._match_id(url)
+ video_id = remove_start(orig_video_id, 'embed-')
+ url = url.replace(orig_video_id, video_id)
+ assert re.match(self._VALID_URL, url) is not None
orig_webpage = self._download_webpage(url, video_id)
- fields = re.findall(r'type="hidden" name="(.+?)"\s* value="?(.+?)">', orig_webpage)
- data = dict(fields)
+ if '>File Not Found<' in orig_webpage:
+ raise ExtractorError('Video %s does not exist' % video_id, expected=True)
- post = compat_urllib_parse.urlencode(data)
- headers = {
- b'Content-Type': b'application/x-www-form-urlencoded',
- }
- req = compat_urllib_request.Request(url, post, headers)
- webpage = self._download_webpage(
- req, video_id, note='Downloading video page ...')
+ error = self._search_regex(
+ r'class="err">([^<]+)<', orig_webpage, 'error', default=None)
+ if error:
+ raise ExtractorError(
+ '%s returned error: %s' % (self.IE_NAME, error), expected=True)
+
+ builtin_url = self._search_regex(
+ r'<iframe[^>]+src=(["\'])(?P<url>.+?/builtin-.+?)\1',
+ orig_webpage, 'builtin URL', default=None, group='url')
+
+ if builtin_url:
+ req = sanitized_Request(builtin_url)
+ req.add_header('Referer', url)
+ webpage = self._download_webpage(req, video_id, 'Downloading builtin page')
+ title = self._og_search_title(orig_webpage).strip()
+ description = self._og_search_description(orig_webpage).strip()
+ else:
+ fields = re.findall(r'type="hidden" name="(.+?)"\s* value="?(.+?)">', orig_webpage)
+ data = dict(fields)
+
+ post = compat_urllib_parse.urlencode(data)
+ headers = {
+ b'Content-Type': b'application/x-www-form-urlencoded',
+ }
+ req = sanitized_Request(url, post, headers)
+ webpage = self._download_webpage(
+ req, video_id, note='Downloading video page ...')
- title = os.path.splitext(data['fname'])[0]
+ title = os.path.splitext(data['fname'])[0]
+ description = None
# Could be several links with different quality
links = re.findall(r'"file" : "?(.+?)",', webpage)
@@ -65,5 +111,6 @@ class MonikerIE(InfoExtractor):
return {
'id': video_id,
'title': title,
+ 'description': description,
'formats': formats,
}
diff --git a/youtube_dl/extractor/mooshare.py b/youtube_dl/extractor/mooshare.py
index 7603af5e2..7cc7f054f 100644
--- a/youtube_dl/extractor/mooshare.py
+++ b/youtube_dl/extractor/mooshare.py
@@ -3,12 +3,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_request,
- compat_urllib_parse,
-)
+from ..compat import compat_urllib_parse
from ..utils import (
ExtractorError,
+ sanitized_Request,
)
@@ -59,7 +57,7 @@ class MooshareIE(InfoExtractor):
'hash': hash_key,
}
- request = compat_urllib_request.Request(
+ request = sanitized_Request(
'http://mooshare.biz/%s' % video_id, compat_urllib_parse.urlencode(download_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
diff --git a/youtube_dl/extractor/movieclips.py b/youtube_dl/extractor/movieclips.py
index 04e17d055..1564cb71f 100644
--- a/youtube_dl/extractor/movieclips.py
+++ b/youtube_dl/extractor/movieclips.py
@@ -1,80 +1,40 @@
+# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..compat import (
- compat_str,
-)
-from ..utils import (
- ExtractorError,
- clean_html,
-)
+from ..utils import sanitized_Request
class MovieClipsIE(InfoExtractor):
- _VALID_URL = r'https?://movieclips\.com/(?P<id>[\da-zA-Z]+)(?:-(?P<display_id>[\da-z-]+))?'
+ _VALID_URL = r'https?://(?:www.)?movieclips\.com/videos/(?P<id>[^/?#]+)'
_TEST = {
- 'url': 'http://movieclips.com/Wy7ZU-my-week-with-marilyn-movie-do-you-love-me/',
+ 'url': 'http://www.movieclips.com/videos/warcraft-trailer-1-561180739597?autoPlay=true&playlistId=5',
'info_dict': {
- 'id': 'Wy7ZU',
- 'display_id': 'my-week-with-marilyn-movie-do-you-love-me',
+ 'id': 'pKIGmG83AqD9',
+ 'display_id': 'warcraft-trailer-1-561180739597',
'ext': 'mp4',
- 'title': 'My Week with Marilyn - Do You Love Me?',
- 'description': 'md5:e86795bd332fe3cff461e7c8dc542acb',
+ 'title': 'Warcraft Trailer 1',
+ 'description': 'Watch Trailer 1 from Warcraft (2016). Legendary’s WARCRAFT is a 3D epic adventure of world-colliding conflict based.',
'thumbnail': 're:^https?://.*\.jpg$',
},
- 'params': {
- # rtmp download
- 'skip_download': True,
- }
+ 'add_ie': ['ThePlatform'],
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- display_id = mobj.group('display_id')
- show_id = display_id or video_id
-
- config = self._download_xml(
- 'http://config.movieclips.com/player/config/%s' % video_id,
- show_id, 'Downloading player config')
-
- if config.find('./country-region').text == 'false':
- raise ExtractorError(
- '%s said: %s' % (self.IE_NAME, config.find('./region_alert').text), expected=True)
-
- properties = config.find('./video/properties')
- smil_file = properties.attrib['smil_file']
+ display_id = self._match_id(url)
- smil = self._download_xml(smil_file, show_id, 'Downloading SMIL')
- base_url = smil.find('./head/meta').attrib['base']
-
- formats = []
- for video in smil.findall('./body/switch/video'):
- vbr = int(video.attrib['system-bitrate']) / 1000
- src = video.attrib['src']
- formats.append({
- 'url': base_url,
- 'play_path': src,
- 'ext': src.split(':')[0],
- 'vbr': vbr,
- 'format_id': '%dk' % vbr,
- })
-
- self._sort_formats(formats)
-
- title = '%s - %s' % (properties.attrib['clip_movie_title'], properties.attrib['clip_title'])
- description = clean_html(compat_str(properties.attrib['clip_description']))
- thumbnail = properties.attrib['image']
- categories = properties.attrib['clip_categories'].split(',')
+ req = sanitized_Request(url)
+ # it doesn't work if it thinks the browser it's too old
+ req.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/43.0 (Chrome)')
+ webpage = self._download_webpage(req, display_id)
+ theplatform_link = self._html_search_regex(r'src="(http://player.theplatform.com/p/.*?)"', webpage, 'theplatform link')
+ title = self._html_search_regex(r'<title[^>]*>([^>]+)-\s*\d+\s*|\s*Movieclips.com</title>', webpage, 'title')
+ description = self._html_search_meta('description', webpage)
return {
- 'id': video_id,
- 'display_id': display_id,
+ '_type': 'url_transparent',
+ 'url': theplatform_link,
'title': title,
+ 'display_id': display_id,
'description': description,
- 'thumbnail': thumbnail,
- 'categories': categories,
- 'formats': formats,
}
diff --git a/youtube_dl/extractor/mpora.py b/youtube_dl/extractor/mpora.py
index 6db3c67a5..5a1bee5c8 100644
--- a/youtube_dl/extractor/mpora.py
+++ b/youtube_dl/extractor/mpora.py
@@ -5,7 +5,7 @@ from ..utils import int_or_none
class MporaIE(InfoExtractor):
- _VALID_URL = r'https?://(www\.)?mpora\.(?:com|de)/videos/(?P<id>[^?#/]+)'
+ _VALID_URL = r'https?://(?:www\.)?mpora\.(?:com|de)/videos/(?P<id>[^?#/]+)'
IE_NAME = 'MPORA'
_TEST = {
@@ -25,7 +25,9 @@ class MporaIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
data_json = self._search_regex(
- r"new FM\.Player\('[^']+',\s*(\{.*?)\).player;", webpage, 'json')
+ [r"new FM\.Player\('[^']+',\s*(\{.*?)\).player;",
+ r"new\s+FM\.Kaltura\.Player\('[^']+'\s*,\s*({.+?})\);"],
+ webpage, 'json')
data = self._parse_json(data_json, video_id)
uploader = data['info_overlay'].get('username')
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
index bc7f49ebb..d887583e6 100644
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -2,10 +2,9 @@ from __future__ import unicode_literals
import re
-from .subtitles import SubtitlesInfoExtractor
+from .common import InfoExtractor
from ..compat import (
compat_urllib_parse,
- compat_urllib_request,
compat_str,
)
from ..utils import (
@@ -13,6 +12,7 @@ from ..utils import (
find_xpath_attr,
fix_xml_ampersands,
HEADRequest,
+ sanitized_Request,
unescapeHTML,
url_basename,
RegexNotFoundError,
@@ -23,8 +23,9 @@ def _media_xml_tag(tag):
return '{http://search.yahoo.com/mrss/}%s' % tag
-class MTVServicesInfoExtractor(SubtitlesInfoExtractor):
+class MTVServicesInfoExtractor(InfoExtractor):
_MOBILE_TEMPLATE = None
+ _LANG = None
@staticmethod
def _id_from_uri(uri):
@@ -52,7 +53,7 @@ class MTVServicesInfoExtractor(SubtitlesInfoExtractor):
def _extract_mobile_video_formats(self, mtvn_id):
webpage_url = self._MOBILE_TEMPLATE % mtvn_id
- req = compat_urllib_request.Request(webpage_url)
+ req = sanitized_Request(webpage_url)
# Otherwise we get a webpage that would execute some javascript
req.add_header('User-Agent', 'curl/7')
webpage = self._download_webpage(req, mtvn_id,
@@ -66,7 +67,7 @@ class MTVServicesInfoExtractor(SubtitlesInfoExtractor):
return [{'url': url, 'ext': 'mp4'}]
def _extract_video_formats(self, mdoc, mtvn_id):
- if re.match(r'.*/(error_country_block\.swf|geoblock\.mp4)$', mdoc.find('.//src').text) is not None:
+ if re.match(r'.*/(error_country_block\.swf|geoblock\.mp4|copyright_error\.flv(?:\?geo\b.+?)?)$', mdoc.find('.//src').text) is not None:
if mtvn_id is not None and self._MOBILE_TEMPLATE is not None:
self.to_screen('The normal version is not available from your '
'country, trying with the mobile version')
@@ -95,25 +96,15 @@ class MTVServicesInfoExtractor(SubtitlesInfoExtractor):
def _extract_subtitles(self, mdoc, mtvn_id):
subtitles = {}
- FORMATS = {
- 'scc': 'cea-608',
- 'eia-608': 'cea-608',
- 'xml': 'ttml',
- }
- subtitles_format = FORMATS.get(
- self._downloader.params.get('subtitlesformat'), 'ttml')
for transcript in mdoc.findall('.//transcript'):
if transcript.get('kind') != 'captions':
continue
lang = transcript.get('srclang')
- for typographic in transcript.findall('./typographic'):
- captions_format = typographic.get('format')
- if captions_format == subtitles_format:
- subtitles[lang] = compat_str(typographic.get('src'))
- break
- if self._downloader.params.get('listsubtitles', False):
- self._list_available_subtitles(mtvn_id, subtitles)
- return self.extract_subtitles(mtvn_id, subtitles)
+ subtitles[lang] = [{
+ 'url': compat_str(typographic.get('src')),
+ 'ext': typographic.get('format')
+ } for typographic in transcript.findall('./typographic')]
+ return subtitles
def _get_video_info(self, itemdoc):
uri = itemdoc.find('guid').text
@@ -123,11 +114,20 @@ class MTVServicesInfoExtractor(SubtitlesInfoExtractor):
# Remove the templates, like &device={device}
mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', mediagen_url)
if 'acceptMethods' not in mediagen_url:
- mediagen_url += '&acceptMethods=fms'
+ mediagen_url += '&' if '?' in mediagen_url else '?'
+ mediagen_url += 'acceptMethods=fms'
mediagen_doc = self._download_xml(mediagen_url, video_id,
'Downloading video urls')
+ item = mediagen_doc.find('./video/item')
+ if item is not None and item.get('type') == 'text':
+ message = '%s returned error: ' % self.IE_NAME
+ if item.get('code') is not None:
+ message += '%s - ' % item.get('code')
+ message += item.text
+ raise ExtractorError(message, expected=True)
+
description_node = itemdoc.find('description')
if description_node is not None:
description = description_node.text.strip()
@@ -142,7 +142,7 @@ class MTVServicesInfoExtractor(SubtitlesInfoExtractor):
if title_el is None:
title_el = itemdoc.find('.//{http://search.yahoo.com/mrss/}title')
if title_el is None:
- title_el = itemdoc.find('.//title')
+ title_el = itemdoc.find('.//title') or itemdoc.find('./title')
if title_el.text is None:
title_el = None
@@ -171,8 +171,15 @@ class MTVServicesInfoExtractor(SubtitlesInfoExtractor):
video_id = self._id_from_uri(uri)
feed_url = self._get_feed_url(uri)
data = compat_urllib_parse.urlencode({'uri': uri})
+ info_url = feed_url + '?'
+ if self._LANG:
+ info_url += 'lang=%s&' % self._LANG
+ info_url += data
+ return self._get_videos_info_from_url(info_url, video_id)
+
+ def _get_videos_info_from_url(self, url, video_id):
idoc = self._download_xml(
- feed_url + '?' + data, video_id,
+ url, video_id,
'Downloading info', transform_source=fix_xml_ampersands)
return self.playlist_result(
[self._get_video_info(item) for item in idoc.findall('.//item')])
@@ -193,11 +200,15 @@ class MTVServicesInfoExtractor(SubtitlesInfoExtractor):
if mgid is None or ':' not in mgid:
mgid = self._search_regex(
[r'data-mgid="(.*?)"', r'swfobject.embedSWF\(".*?(mgid:.*?)"'],
- webpage, 'mgid')
+ webpage, 'mgid', default=None)
+
+ if not mgid:
+ sm4_embed = self._html_search_meta(
+ 'sm4:video:embed', webpage, 'sm4 embed', default='')
+ mgid = self._search_regex(
+ r'embed/(mgid:.+?)["\'&?/]', sm4_embed, 'mgid')
videos_info = self._get_videos_info(mgid)
- if self._downloader.params.get('listsubtitles', False):
- return
return videos_info
@@ -217,6 +228,13 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor):
},
}
+ @staticmethod
+ def _extract_url(webpage):
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//media.mtvnservices.com/embed/.+?)\1', webpage)
+ if mobj:
+ return mobj.group('url')
+
def _get_feed_url(self, uri):
video_id = self._id_from_uri(uri)
site_id = uri.replace(video_id, '')
@@ -287,3 +305,65 @@ class MTVIggyIE(MTVServicesInfoExtractor):
}
}
_FEED_URL = 'http://all.mtvworldverticals.com/feed-xml/'
+
+
+class MTVDEIE(MTVServicesInfoExtractor):
+ IE_NAME = 'mtv.de'
+ _VALID_URL = r'https?://(?:www\.)?mtv\.de/(?:artists|shows|news)/(?:[^/]+/)*(?P<id>\d+)-[^/#?]+/*(?:[#?].*)?$'
+ _TESTS = [{
+ 'url': 'http://www.mtv.de/artists/10571-cro/videos/61131-traum',
+ 'info_dict': {
+ 'id': 'music_video-a50bc5f0b3aa4b3190aa',
+ 'ext': 'mp4',
+ 'title': 'MusicVideo_cro-traum',
+ 'description': 'Cro - Traum',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }, {
+ # mediagen URL without query (e.g. http://videos.mtvnn.com/mediagen/e865da714c166d18d6f80893195fcb97)
+ 'url': 'http://www.mtv.de/shows/933-teen-mom-2/staffeln/5353/folgen/63565-enthullungen',
+ 'info_dict': {
+ 'id': 'local_playlist-f5ae778b9832cc837189',
+ 'ext': 'mp4',
+ 'title': 'Episode_teen-mom-2_shows_season-5_episode-1_full-episode_part1',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }, {
+ # single video in pagePlaylist with different id
+ 'url': 'http://www.mtv.de/news/77491-mtv-movies-spotlight-pixels-teil-3',
+ 'info_dict': {
+ 'id': 'local_playlist-4e760566473c4c8c5344',
+ 'ext': 'mp4',
+ 'title': 'Article_mtv-movies-spotlight-pixels-teil-3_short-clips_part1',
+ 'description': 'MTV Movies Supercut',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ playlist = self._parse_json(
+ self._search_regex(
+ r'window\.pagePlaylist\s*=\s*(\[.+?\]);\n', webpage, 'page playlist'),
+ video_id)
+
+ # news pages contain single video in playlist with different id
+ if len(playlist) == 1:
+ return self._get_videos_info_from_url(playlist[0]['mrss'], video_id)
+
+ for item in playlist:
+ item_id = item.get('id')
+ if item_id and compat_str(item_id) == video_id:
+ return self._get_videos_info_from_url(item['mrss'], video_id)
diff --git a/youtube_dl/extractor/musicvault.py b/youtube_dl/extractor/musicvault.py
deleted file mode 100644
index ebb1eb8e9..000000000
--- a/youtube_dl/extractor/musicvault.py
+++ /dev/null
@@ -1,76 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import (
- parse_duration,
- unified_strdate,
-)
-
-
-class MusicVaultIE(InfoExtractor):
- _VALID_URL = r'https?://www\.musicvault\.com/(?P<uploader_id>[^/?#]*)/video/(?P<display_id>[^/?#]*)_(?P<id>[0-9]+)\.html'
- _TEST = {
- 'url': 'http://www.musicvault.com/the-allman-brothers-band/video/straight-from-the-heart_1010863.html',
- 'md5': '2cdbb3ae75f7fb3519821507d2fb3c15',
- 'info_dict': {
- 'id': '1010863',
- 'ext': 'mp4',
- 'uploader_id': 'the-allman-brothers-band',
- 'title': 'Straight from the Heart',
- 'duration': 244,
- 'uploader': 'The Allman Brothers Band',
- 'thumbnail': 're:^https?://.*/thumbnail/.*',
- 'upload_date': '19811216',
- 'location': 'Capitol Theatre (Passaic, NJ)',
- 'description': 'Listen to The Allman Brothers Band perform Straight from the Heart at Capitol Theatre (Passaic, NJ) on Dec 16, 1981',
- }
- }
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- display_id = mobj.group('display_id')
- webpage = self._download_webpage(url, display_id)
-
- thumbnail = self._search_regex(
- r'<meta itemprop="thumbnail" content="([^"]+)"',
- webpage, 'thumbnail', fatal=False)
-
- data_div = self._search_regex(
- r'(?s)<div class="data">(.*?)</div>', webpage, 'data fields')
- uploader = self._html_search_regex(
- r'<h1.*?>(.*?)</h1>', data_div, 'uploader', fatal=False)
- title = self._html_search_regex(
- r'<h2.*?>(.*?)</h2>', data_div, 'title')
- upload_date = unified_strdate(self._html_search_regex(
- r'<h3.*?>(.*?)</h3>', data_div, 'uploader', fatal=False))
- location = self._html_search_regex(
- r'<h4.*?>(.*?)</h4>', data_div, 'location', fatal=False)
-
- duration = parse_duration(self._html_search_meta('duration', webpage))
-
- VIDEO_URL_TEMPLATE = 'http://cdnapi.kaltura.com/p/%(uid)s/sp/%(wid)s/playManifest/entryId/%(entry_id)s/format/url/protocol/http'
- kaltura_id = self._search_regex(
- r'<div id="video-detail-player" data-kaltura-id="([^"]+)"',
- webpage, 'kaltura ID')
- video_url = VIDEO_URL_TEMPLATE % {
- 'entry_id': kaltura_id,
- 'wid': self._search_regex(r'/wid/_([0-9]+)/', webpage, 'wid'),
- 'uid': self._search_regex(r'uiconf_id/([0-9]+)/', webpage, 'uid'),
- }
-
- return {
- 'id': mobj.group('id'),
- 'url': video_url,
- 'ext': 'mp4',
- 'display_id': display_id,
- 'uploader_id': mobj.group('uploader_id'),
- 'thumbnail': thumbnail,
- 'description': self._html_search_meta('description', webpage),
- 'upload_date': upload_date,
- 'location': location,
- 'title': title,
- 'uploader': uploader,
- 'duration': duration,
- }
diff --git a/youtube_dl/extractor/mwave.py b/youtube_dl/extractor/mwave.py
new file mode 100644
index 000000000..66b523197
--- /dev/null
+++ b/youtube_dl/extractor/mwave.py
@@ -0,0 +1,58 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ int_or_none,
+ parse_duration,
+)
+
+
+class MwaveIE(InfoExtractor):
+ _VALID_URL = r'https?://mwave\.interest\.me/mnettv/videodetail\.m\?searchVideoDetailVO\.clip_id=(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://mwave.interest.me/mnettv/videodetail.m?searchVideoDetailVO.clip_id=168859',
+ 'md5': 'c930e27b7720aaa3c9d0018dfc8ff6cc',
+ 'info_dict': {
+ 'id': '168859',
+ 'ext': 'flv',
+ 'title': '[M COUNTDOWN] SISTAR - SHAKE IT',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'uploader': 'M COUNTDOWN',
+ 'duration': 206,
+ 'view_count': int,
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ vod_info = self._download_json(
+ 'http://mwave.interest.me/onair/vod_info.m?vodtype=CL&sectorid=&endinfo=Y&id=%s' % video_id,
+ video_id, 'Download vod JSON')
+
+ formats = []
+ for num, cdn_info in enumerate(vod_info['cdn']):
+ stream_url = cdn_info.get('url')
+ if not stream_url:
+ continue
+ stream_name = cdn_info.get('name') or compat_str(num)
+ f4m_stream = self._download_json(
+ stream_url, video_id,
+ 'Download %s stream JSON' % stream_name)
+ f4m_url = f4m_stream.get('fileurl')
+ if not f4m_url:
+ continue
+ formats.extend(
+ self._extract_f4m_formats(f4m_url + '&hdcore=3.0.3', video_id, f4m_id=stream_name))
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': vod_info['title'],
+ 'thumbnail': vod_info.get('cover'),
+ 'uploader': vod_info.get('program_title'),
+ 'duration': parse_duration(vod_info.get('time')),
+ 'view_count': int_or_none(vod_info.get('hit')),
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/myspass.py b/youtube_dl/extractor/myspass.py
index 5b9b9fbcd..4557a2b13 100644
--- a/youtube_dl/extractor/myspass.py
+++ b/youtube_dl/extractor/myspass.py
@@ -35,7 +35,8 @@ class MySpassIE(InfoExtractor):
# get metadata
metadata_url = META_DATA_URL_TEMPLATE % video_id
- metadata = self._download_xml(metadata_url, video_id)
+ metadata = self._download_xml(
+ metadata_url, video_id, transform_source=lambda s: s.strip())
# extract values from metadata
url_flv_el = metadata.find('url_flv')
diff --git a/youtube_dl/extractor/myvi.py b/youtube_dl/extractor/myvi.py
new file mode 100644
index 000000000..4c65be122
--- /dev/null
+++ b/youtube_dl/extractor/myvi.py
@@ -0,0 +1,60 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .vimple import SprutoBaseIE
+
+
+class MyviIE(SprutoBaseIE):
+ _VALID_URL = r'''(?x)
+ https?://
+ myvi\.(?:ru/player|tv)/
+ (?:
+ (?:
+ embed/html|
+ flash|
+ api/Video/Get
+ )/|
+ content/preloader\.swf\?.*\bid=
+ )
+ (?P<id>[\da-zA-Z_-]+)
+ '''
+ _TESTS = [{
+ 'url': 'http://myvi.ru/player/embed/html/oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wObeRTZaCATzucDQIDph8hQU0',
+ 'md5': '571bbdfba9f9ed229dc6d34cc0f335bf',
+ 'info_dict': {
+ 'id': 'f16b2bbd-cde8-481c-a981-7cd48605df43',
+ 'ext': 'mp4',
+ 'title': 'хозяин жизни',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 25,
+ },
+ }, {
+ 'url': 'http://myvi.ru/player/content/preloader.swf?id=oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wOYf1WFpPfc_bWTKGVf_Zafr0',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://myvi.ru/player/api/Video/Get/oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wObeRTZaCATzucDQIDph8hQU0',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://myvi.tv/embed/html/oTGTNWdyz4Zwy_u1nraolwZ1odenTd9WkTnRfIL9y8VOgHYqOHApE575x4_xxS9Vn0?ap=0',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://myvi.ru/player/flash/ocp2qZrHI-eZnHKQBK4cZV60hslH8LALnk0uBfKsB-Q4WnY26SeGoYPi8HWHxu0O30',
+ 'only_matching': True,
+ }]
+
+ @classmethod
+ def _extract_url(cls, webpage):
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//myvi\.(?:ru/player|tv)/(?:embed/html|flash)/[^"]+)\1', webpage)
+ if mobj:
+ return mobj.group('url')
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ spruto = self._download_json(
+ 'http://myvi.ru/player/api/Video/Get/%s?sig' % video_id, video_id)['sprutoData']
+
+ return self._extract_spruto(spruto, video_id)
diff --git a/youtube_dl/extractor/myvideo.py b/youtube_dl/extractor/myvideo.py
index 5e754fcff..36ab388b2 100644
--- a/youtube_dl/extractor/myvideo.py
+++ b/youtube_dl/extractor/myvideo.py
@@ -10,10 +10,11 @@ from .common import InfoExtractor
from ..compat import (
compat_ord,
compat_urllib_parse,
- compat_urllib_request,
+ compat_urllib_parse_unquote,
)
from ..utils import (
ExtractorError,
+ sanitized_Request,
)
@@ -82,7 +83,7 @@ class MyVideoIE(InfoExtractor):
mobj = re.search(r'data-video-service="/service/data/video/%s/config' % video_id, webpage)
if mobj is not None:
- request = compat_urllib_request.Request('http://www.myvideo.de/service/data/video/%s/config' % video_id, '')
+ request = sanitized_Request('http://www.myvideo.de/service/data/video/%s/config' % video_id, '')
response = self._download_webpage(request, video_id,
'Downloading video info')
info = json.loads(base64.b64decode(response).decode('utf-8'))
@@ -107,7 +108,7 @@ class MyVideoIE(InfoExtractor):
if not a == '_encxml':
params[a] = b
else:
- encxml = compat_urllib_parse.unquote(b)
+ encxml = compat_urllib_parse_unquote(b)
if not params.get('domain'):
params['domain'] = 'www.myvideo.de'
xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
@@ -135,7 +136,7 @@ class MyVideoIE(InfoExtractor):
video_url = None
mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
if mobj:
- video_url = compat_urllib_parse.unquote(mobj.group(1))
+ video_url = compat_urllib_parse_unquote(mobj.group(1))
if 'myvideo2flash' in video_url:
self.report_warning(
'Rewriting URL to use unencrypted rtmp:// ...',
@@ -147,10 +148,10 @@ class MyVideoIE(InfoExtractor):
mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
if mobj is None:
raise ExtractorError('unable to extract url')
- video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
+ video_url = compat_urllib_parse_unquote(mobj.group(1)) + compat_urllib_parse_unquote(mobj.group(2))
video_file = self._search_regex('source=\'(.*?)\'', dec_data, 'video file')
- video_file = compat_urllib_parse.unquote(video_file)
+ video_file = compat_urllib_parse_unquote(video_file)
if not video_file.endswith('f4m'):
ppath, prefix = video_file.split('.')
@@ -159,7 +160,7 @@ class MyVideoIE(InfoExtractor):
video_playpath = ''
video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, 'swfobj')
- video_swfobj = compat_urllib_parse.unquote(video_swfobj)
+ video_swfobj = compat_urllib_parse_unquote(video_swfobj)
video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
webpage, 'title')
diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py
new file mode 100644
index 000000000..6fc9e7b05
--- /dev/null
+++ b/youtube_dl/extractor/nationalgeographic.py
@@ -0,0 +1,54 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ smuggle_url,
+ url_basename,
+)
+
+
+class NationalGeographicIE(InfoExtractor):
+ _VALID_URL = r'http://video\.nationalgeographic\.com/.*?'
+
+ _TESTS = [
+ {
+ 'url': 'http://video.nationalgeographic.com/video/news/150210-news-crab-mating-vin?source=featuredvideo',
+ 'info_dict': {
+ 'id': '4DmDACA6Qtk_',
+ 'ext': 'flv',
+ 'title': 'Mating Crabs Busted by Sharks',
+ 'description': 'md5:16f25aeffdeba55aaa8ec37e093ad8b3',
+ },
+ 'add_ie': ['ThePlatform'],
+ },
+ {
+ 'url': 'http://video.nationalgeographic.com/wild/when-sharks-attack/the-real-jaws',
+ 'info_dict': {
+ 'id': '_JeBD_D7PlS5',
+ 'ext': 'flv',
+ 'title': 'The Real Jaws',
+ 'description': 'md5:8d3e09d9d53a85cd397b4b21b2c77be6',
+ },
+ 'add_ie': ['ThePlatform'],
+ },
+ ]
+
+ def _real_extract(self, url):
+ name = url_basename(url)
+
+ webpage = self._download_webpage(url, name)
+ feed_url = self._search_regex(
+ r'data-feed-url="([^"]+)"', webpage, 'feed url')
+ guid = self._search_regex(
+ r'id="(?:videoPlayer|player-container)"[^>]+data-guid="([^"]+)"',
+ webpage, 'guid')
+
+ feed = self._download_xml('%s?byGuid=%s' % (feed_url, guid), name)
+ content = feed.find('.//{http://search.yahoo.com/mrss/}content')
+ theplatform_id = url_basename(content.attrib.get('url'))
+
+ return self.url_result(smuggle_url(
+ 'http://link.theplatform.com/s/ngs/%s?format=SMIL&formats=MPEG4&manifest=f4m' % theplatform_id,
+ # For some reason, the normal links don't work and we must force
+ # the use of f4m
+ {'force_smil_url': True}))
diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py
index c10405f04..1f5fc2145 100644
--- a/youtube_dl/extractor/naver.py
+++ b/youtube_dl/extractor/naver.py
@@ -6,17 +6,17 @@ import re
from .common import InfoExtractor
from ..compat import (
compat_urllib_parse,
+ compat_urlparse,
)
from ..utils import (
ExtractorError,
- clean_html,
)
class NaverIE(InfoExtractor):
_VALID_URL = r'https?://(?:m\.)?tvcast\.naver\.com/v/(?P<id>\d+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://tvcast.naver.com/v/81652',
'info_dict': {
'id': '81652',
@@ -25,7 +25,18 @@ class NaverIE(InfoExtractor):
'description': '합격불변의 법칙 메가스터디 | 메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.',
'upload_date': '20130903',
},
- }
+ }, {
+ 'url': 'http://tvcast.naver.com/v/395837',
+ 'md5': '638ed4c12012c458fefcddfd01f173cd',
+ 'info_dict': {
+ 'id': '395837',
+ 'ext': 'mp4',
+ 'title': '9년이 지나도 아픈 기억, 전효성의 아버지',
+ 'description': 'md5:5bf200dcbf4b66eb1b350d1eb9c753f7',
+ 'upload_date': '20150519',
+ },
+ 'skip': 'Georestricted',
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -34,11 +45,11 @@ class NaverIE(InfoExtractor):
m_id = re.search(r'var rmcPlayer = new nhn.rmcnmv.RMCVideoPlayer\("(.+?)", "(.+?)"',
webpage)
if m_id is None:
- m_error = re.search(
- r'(?s)<div class="nation_error">\s*(?:<!--.*?-->)?\s*<p class="[^"]+">(?P<msg>.+?)</p>\s*</div>',
- webpage)
- if m_error:
- raise ExtractorError(clean_html(m_error.group('msg')), expected=True)
+ error = self._html_search_regex(
+ r'(?s)<div class="(?:nation_error|nation_box|error_box)">\s*(?:<!--.*?-->)?\s*<p class="[^"]+">(?P<msg>.+?)</p>\s*</div>',
+ webpage, 'error', default=None)
+ if error:
+ raise ExtractorError(error, expected=True)
raise ExtractorError('couldn\'t extract vid and key')
vid = m_id.group(1)
key = m_id.group(2)
@@ -58,14 +69,18 @@ class NaverIE(InfoExtractor):
formats = []
for format_el in urls.findall('EncodingOptions/EncodingOption'):
domain = format_el.find('Domain').text
+ uri = format_el.find('uri').text
f = {
- 'url': domain + format_el.find('uri').text,
+ 'url': compat_urlparse.urljoin(domain, uri),
'ext': 'mp4',
'width': int(format_el.find('width').text),
'height': int(format_el.find('height').text),
}
if domain.startswith('rtmp'):
+ # urlparse does not support custom schemes
+ # https://bugs.python.org/issue18828
f.update({
+ 'url': domain + uri,
'ext': 'flv',
'rtmp_protocol': '1', # rtmpt
})
diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py
index 862b706bf..944096e1c 100644
--- a/youtube_dl/extractor/nba.py
+++ b/youtube_dl/extractor/nba.py
@@ -22,6 +22,18 @@ class NBAIE(InfoExtractor):
}, {
'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/',
'only_matching': True,
+ }, {
+ 'url': 'http://watch.nba.com/nba/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba',
+ 'info_dict': {
+ 'id': '0041400301-cle-atl-recap.nba',
+ 'ext': 'mp4',
+ 'title': 'NBA GAME TIME | Video: Hawks vs. Cavaliers Game 1',
+ 'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d',
+ 'duration': 228,
+ },
+ 'params': {
+ 'skip_download': True,
+ }
}]
def _real_extract(self, url):
@@ -35,8 +47,12 @@ class NBAIE(InfoExtractor):
self._og_search_title(webpage, default=shortened_video_id), ' : NBA.com')
description = self._og_search_description(webpage)
- duration = parse_duration(
- self._html_search_meta('duration', webpage, 'duration'))
+ duration_str = self._html_search_meta(
+ 'duration', webpage, 'duration', default=None)
+ if not duration_str:
+ duration_str = self._html_search_regex(
+ r'Duration:</b>\s*(\d+:\d+)', webpage, 'duration', fatal=False)
+ duration = parse_duration(duration_str)
return {
'id': shortened_video_id,
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py
index f840f6532..e683d24c4 100644
--- a/youtube_dl/extractor/nbc.py
+++ b/youtube_dl/extractor/nbc.py
@@ -1,7 +1,6 @@
from __future__ import unicode_literals
import re
-import json
from .common import InfoExtractor
from ..compat import (
@@ -11,21 +10,23 @@ from ..compat import (
from ..utils import (
ExtractorError,
find_xpath_attr,
+ lowercase_escape,
+ unescapeHTML,
)
class NBCIE(InfoExtractor):
- _VALID_URL = r'http://www\.nbc\.com/(?:[^/]+/)+(?P<id>n?\d+)'
+ _VALID_URL = r'https?://www\.nbc\.com/(?:[^/]+/)+(?P<id>n?\d+)'
_TESTS = [
{
- 'url': 'http://www.nbc.com/chicago-fire/video/i-am-a-firefighter/2734188',
+ 'url': 'http://www.nbc.com/the-tonight-show/segments/112966',
# md5 checksum is not stable
'info_dict': {
- 'id': 'bTmnLCvIbaaH',
+ 'id': 'c9xnCo0YPOPH',
'ext': 'flv',
- 'title': 'I Am a Firefighter',
- 'description': 'An emergency puts Dawson\'sf irefighter skills to the ultimate test in this four-part digital series.',
+ 'title': 'Jimmy Fallon Surprises Fans at Ben & Jerry\'s',
+ 'description': 'Jimmy gives out free scoops of his new "Tonight Dough" ice cream flavor by surprising customers at the Ben & Jerry\'s scoop shop.',
},
},
{
@@ -38,23 +39,92 @@ class NBCIE(InfoExtractor):
},
'skip': 'Only works from US',
},
+ {
+ 'url': 'http://www.nbc.com/saturday-night-live/video/star-wars-teaser/2832821',
+ 'info_dict': {
+ 'id': '8iUuyzWDdYUZ',
+ 'ext': 'flv',
+ 'title': 'Star Wars Teaser',
+ 'description': 'md5:0b40f9cbde5b671a7ff62fceccc4f442',
+ },
+ 'skip': 'Only works from US',
+ },
+ {
+ # This video has expired but with an escaped embedURL
+ 'url': 'http://www.nbc.com/parenthood/episode-guide/season-5/just-like-at-home/515',
+ 'skip': 'Expired'
+ }
]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- theplatform_url = self._search_regex(
- '(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"',
- webpage, 'theplatform url').replace('_no_endcard', '')
+ theplatform_url = unescapeHTML(lowercase_escape(self._html_search_regex(
+ [
+ r'(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"',
+ r'"embedURL"\s*:\s*"([^"]+)"'
+ ],
+ webpage, 'theplatform url').replace('_no_endcard', '').replace('\\/', '/')))
if theplatform_url.startswith('//'):
theplatform_url = 'http:' + theplatform_url
return self.url_result(theplatform_url)
+class NBCSportsVPlayerIE(InfoExtractor):
+ _VALID_URL = r'https?://vplayer\.nbcsports\.com/(?:[^/]+/)+(?P<id>[0-9a-zA-Z_]+)'
+
+ _TESTS = [{
+ 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_share/select/9CsDKds0kvHI',
+ 'info_dict': {
+ 'id': '9CsDKds0kvHI',
+ 'ext': 'flv',
+ 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d',
+ 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson',
+ }
+ }, {
+ 'url': 'http://vplayer.nbcsports.com/p/BxmELC/nbc_embedshare/select/_hqLjQ95yx8Z',
+ 'only_matching': True,
+ }]
+
+ @staticmethod
+ def _extract_url(webpage):
+ iframe_m = re.search(
+ r'<iframe[^>]+src="(?P<url>https?://vplayer\.nbcsports\.com/[^"]+)"', webpage)
+ if iframe_m:
+ return iframe_m.group('url')
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ theplatform_url = self._og_search_video_url(webpage)
+ return self.url_result(theplatform_url, 'ThePlatform')
+
+
+class NBCSportsIE(InfoExtractor):
+ # Does not include https becuase its certificate is invalid
+ _VALID_URL = r'http://www\.nbcsports\.com//?(?:[^/]+/)+(?P<id>[0-9a-z-]+)'
+
+ _TEST = {
+ 'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke',
+ 'info_dict': {
+ 'id': 'PHJSaFWbrTY9',
+ 'ext': 'flv',
+ 'title': 'Tom Izzo, Michigan St. has \'so much respect\' for Duke',
+ 'description': 'md5:ecb459c9d59e0766ac9c7d5d0eda8113',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ return self.url_result(
+ NBCSportsVPlayerIE._extract_url(webpage), 'NBCSportsVPlayer')
+
+
class NBCNewsIE(InfoExtractor):
- _VALID_URL = r'''(?x)https?://www\.nbcnews\.com/
- ((video/.+?/(?P<id>\d+))|
- (feature/[^/]+/(?P<title>.+)))
+ _VALID_URL = r'''(?x)https?://(?:www\.)?nbcnews\.com/
+ (?:video/.+?/(?P<id>\d+)|
+ (?:watch|feature|nightly-news)/[^/]+/(?P<title>.+))
'''
_TESTS = [
@@ -89,6 +159,20 @@ class NBCNewsIE(InfoExtractor):
'description': 'md5:757988edbaae9d7be1d585eb5d55cc04',
},
},
+ {
+ 'url': 'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844',
+ 'md5': 'b5dda8cddd8650baa0dcb616dd2cf60d',
+ 'info_dict': {
+ 'id': 'sekXqyTVnmN3',
+ 'ext': 'mp4',
+ 'title': 'Nightly News with Brian Williams Full Broadcast (February 4)',
+ 'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5',
+ },
+ },
+ {
+ 'url': 'http://www.nbcnews.com/watch/dateline/full-episode--deadly-betrayal-386250819952',
+ 'only_matching': True,
+ },
]
def _real_extract(self, url):
@@ -107,13 +191,13 @@ class NBCNewsIE(InfoExtractor):
'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text,
}
else:
- # "feature" pages use theplatform.com
+ # "feature" and "nightly-news" pages use theplatform.com
title = mobj.group('title')
webpage = self._download_webpage(url, title)
bootstrap_json = self._search_regex(
- r'var bootstrapJson = ({.+})\s*$', webpage, 'bootstrap json',
- flags=re.MULTILINE)
- bootstrap = json.loads(bootstrap_json)
+ r'var\s+(?:bootstrapJson|playlistData)\s*=\s*({.+});?\s*$',
+ webpage, 'bootstrap json', flags=re.MULTILINE)
+ bootstrap = self._parse_json(bootstrap_json, video_id)
info = bootstrap['results'][0]['video']
mpxid = info['mpxId']
@@ -152,3 +236,28 @@ class NBCNewsIE(InfoExtractor):
'url': info['videoAssets'][-1]['publicUrl'],
'ie_key': 'ThePlatform',
}
+
+
+class MSNBCIE(InfoExtractor):
+ # https URLs redirect to corresponding http ones
+ _VALID_URL = r'http://www\.msnbc\.com/[^/]+/watch/(?P<id>[^/]+)'
+ _TEST = {
+ 'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924',
+ 'md5': '6d236bf4f3dddc226633ce6e2c3f814d',
+ 'info_dict': {
+ 'id': 'n_hayes_Aimm_140801_272214',
+ 'ext': 'mp4',
+ 'title': 'The chaotic GOP immigration vote',
+ 'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'timestamp': 1406937606,
+ 'upload_date': '20140802',
+ 'categories': ['MSNBC/Topics/Franchise/Best of last night', 'MSNBC/Topics/General/Congress'],
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ embed_url = self._html_search_meta('embedURL', webpage)
+ return self.url_result(embed_url)
diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py
index f49c66690..16213eed9 100644
--- a/youtube_dl/extractor/ndr.py
+++ b/youtube_dl/extractor/ndr.py
@@ -1,94 +1,387 @@
-# encoding: utf-8
+# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
- ExtractorError,
+ determine_ext,
int_or_none,
+ parse_iso8601,
qualities,
)
-class NDRIE(InfoExtractor):
+class NDRBaseIE(InfoExtractor):
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ display_id = next(group for group in mobj.groups() if group)
+ webpage = self._download_webpage(url, display_id)
+ return self._extract_embed(webpage, display_id)
+
+
+class NDRIE(NDRBaseIE):
IE_NAME = 'ndr'
- IE_DESC = 'NDR.de - Mediathek'
- _VALID_URL = r'https?://www\.ndr\.de/.+?(?P<id>\d+)\.html'
-
- _TESTS = [
- {
- 'url': 'http://www.ndr.de/fernsehen/sendungen/nordmagazin/Kartoffeltage-in-der-Lewitz,nordmagazin25866.html',
- 'md5': '5bc5f5b92c82c0f8b26cddca34f8bb2c',
- 'note': 'Video file',
- 'info_dict': {
- 'id': '25866',
- 'ext': 'mp4',
- 'title': 'Kartoffeltage in der Lewitz',
- 'description': 'md5:48c4c04dde604c8a9971b3d4e3b9eaa8',
- 'duration': 166,
- }
- },
- {
- 'url': 'http://www.ndr.de/info/audio51535.html',
- 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8',
- 'note': 'Audio file',
- 'info_dict': {
- 'id': '51535',
- 'ext': 'mp3',
- 'title': 'La Valette entgeht der Hinrichtung',
- 'description': 'md5:22f9541913a40fe50091d5cdd7c9f536',
- 'duration': 884,
- }
+ IE_DESC = 'NDR.de - Norddeutscher Rundfunk'
+ _VALID_URL = r'https?://www\.ndr\.de/(?:[^/]+/)*(?P<id>[^/?#]+),[\da-z]+\.html'
+ _TESTS = [{
+ # httpVideo, same content id
+ 'url': 'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html',
+ 'md5': '6515bc255dc5c5f8c85bbc38e035a659',
+ 'info_dict': {
+ 'id': 'hafengeburtstag988',
+ 'display_id': 'Party-Poette-und-Parade',
+ 'ext': 'mp4',
+ 'title': 'Party, Pötte und Parade',
+ 'description': 'md5:ad14f9d2f91d3040b6930c697e5f6b4c',
+ 'uploader': 'ndrtv',
+ 'timestamp': 1431108900,
+ 'upload_date': '20150510',
+ 'duration': 3498,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # httpVideo, different content id
+ 'url': 'http://www.ndr.de/sport/fussball/40-Osnabrueck-spielt-sich-in-einen-Rausch,osna270.html',
+ 'md5': '1043ff203eab307f0c51702ec49e9a71',
+ 'info_dict': {
+ 'id': 'osna272',
+ 'display_id': '40-Osnabrueck-spielt-sich-in-einen-Rausch',
+ 'ext': 'mp4',
+ 'title': 'Osnabrück - Wehen Wiesbaden: Die Highlights',
+ 'description': 'md5:32e9b800b3d2d4008103752682d5dc01',
+ 'uploader': 'ndrtv',
+ 'timestamp': 1442059200,
+ 'upload_date': '20150912',
+ 'duration': 510,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # httpAudio, same content id
+ 'url': 'http://www.ndr.de/info/La-Valette-entgeht-der-Hinrichtung,audio51535.html',
+ 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8',
+ 'info_dict': {
+ 'id': 'audio51535',
+ 'display_id': 'La-Valette-entgeht-der-Hinrichtung',
+ 'ext': 'mp3',
+ 'title': 'La Valette entgeht der Hinrichtung',
+ 'description': 'md5:22f9541913a40fe50091d5cdd7c9f536',
+ 'uploader': 'ndrinfo',
+ 'timestamp': 1290626100,
+ 'upload_date': '20140729',
+ 'duration': 884,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://www.ndr.de/Fettes-Brot-Ferris-MC-und-Thees-Uhlmann-live-on-stage,festivalsommer116.html',
+ 'only_matching': True,
+ }]
+
+ def _extract_embed(self, webpage, display_id):
+ embed_url = self._html_search_meta(
+ 'embedURL', webpage, 'embed URL', fatal=True)
+ description = self._search_regex(
+ r'<p[^>]+itemprop="description">([^<]+)</p>',
+ webpage, 'description', fatal=False)
+ timestamp = parse_iso8601(
+ self._search_regex(
+ r'<span itemprop="datePublished" content="([^"]+)">',
+ webpage, 'upload date', fatal=False))
+ return {
+ '_type': 'url_transparent',
+ 'url': embed_url,
+ 'display_id': display_id,
+ 'description': description,
+ 'timestamp': timestamp,
+ }
+
+
+class NJoyIE(NDRBaseIE):
+ IE_NAME = 'njoy'
+ IE_DESC = 'N-JOY'
+ _VALID_URL = r'https?://www\.n-joy\.de/(?:[^/]+/)*(?:(?P<display_id>[^/?#]+),)?(?P<id>[\da-z]+)\.html'
+ _TESTS = [{
+ # httpVideo, same content id
+ 'url': 'http://www.n-joy.de/entertainment/comedy/comedy_contest/Benaissa-beim-NDR-Comedy-Contest,comedycontest2480.html',
+ 'md5': 'cb63be60cd6f9dd75218803146d8dc67',
+ 'info_dict': {
+ 'id': 'comedycontest2480',
+ 'display_id': 'Benaissa-beim-NDR-Comedy-Contest',
+ 'ext': 'mp4',
+ 'title': 'Benaissa beim NDR Comedy Contest',
+ 'description': 'md5:f057a6c4e1c728b10d33b5ffd36ddc39',
+ 'uploader': 'ndrtv',
+ 'upload_date': '20141129',
+ 'duration': 654,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # httpVideo, different content id
+ 'url': 'http://www.n-joy.de/musik/Das-frueheste-DJ-Set-des-Nordens-live-mit-Felix-Jaehn-,felixjaehn168.html',
+ 'md5': '417660fffa90e6df2fda19f1b40a64d8',
+ 'info_dict': {
+ 'id': 'dockville882',
+ 'display_id': 'Das-frueheste-DJ-Set-des-Nordens-live-mit-Felix-Jaehn-',
+ 'ext': 'mp4',
+ 'title': '"Ich hab noch nie" mit Felix Jaehn',
+ 'description': 'md5:85dd312d53be1b99e1f998a16452a2f3',
+ 'uploader': 'njoy',
+ 'upload_date': '20150822',
+ 'duration': 211,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://www.n-joy.de/radio/webradio/morningshow209.html',
+ 'only_matching': True,
+ }]
+
+ def _extract_embed(self, webpage, display_id):
+ video_id = self._search_regex(
+ r'<iframe[^>]+id="pp_([\da-z]+)"', webpage, 'embed id')
+ description = self._search_regex(
+ r'<div[^>]+class="subline"[^>]*>[^<]+</div>\s*<p>([^<]+)</p>',
+ webpage, 'description', fatal=False)
+ return {
+ '_type': 'url_transparent',
+ 'ie_key': 'NDREmbedBase',
+ 'url': 'ndr:%s' % video_id,
+ 'display_id': display_id,
+ 'description': description,
}
- ]
+
+
+class NDREmbedBaseIE(InfoExtractor):
+ IE_NAME = 'ndr:embed:base'
+ _VALID_URL = r'(?:ndr:(?P<id_s>[\da-z]+)|https?://www\.ndr\.de/(?P<id>[\da-z]+)-ppjson\.json)'
+ _TESTS = [{
+ 'url': 'ndr:soundcheck3366',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.ndr.de/soundcheck3366-ppjson.json',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
- page = self._download_webpage(url, video_id, 'Downloading page')
+ video_id = mobj.group('id') or mobj.group('id_s')
- title = self._og_search_title(page).strip()
- description = self._og_search_description(page)
- if description:
- description = description.strip()
+ ppjson = self._download_json(
+ 'http://www.ndr.de/%s-ppjson.json' % video_id, video_id)
- duration = int_or_none(self._html_search_regex(r'duration: (\d+),\n', page, 'duration', fatal=False))
+ playlist = ppjson['playlist']
formats = []
+ quality_key = qualities(('xs', 's', 'm', 'l', 'xl'))
- mp3_url = re.search(r'''\{src:'(?P<audio>[^']+)', type:"audio/mp3"},''', page)
- if mp3_url:
- formats.append({
- 'url': mp3_url.group('audio'),
- 'format_id': 'mp3',
- })
+ for format_id, f in playlist.items():
+ src = f.get('src')
+ if not src:
+ continue
+ ext = determine_ext(src, None)
+ if ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ src + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, f4m_id='hds'))
+ elif ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ src, video_id, m3u8_id='hls', entry_protocol='m3u8_native'))
+ else:
+ quality = f.get('quality')
+ ff = {
+ 'url': src,
+ 'format_id': quality or format_id,
+ 'quality': quality_key(quality),
+ }
+ type_ = f.get('type')
+ if type_ and type_.split('/')[0] == 'audio':
+ ff['vcodec'] = 'none'
+ ff['ext'] = ext or 'mp3'
+ formats.append(ff)
+ self._sort_formats(formats)
- thumbnail = None
+ config = playlist['config']
- video_url = re.search(r'''3: \{src:'(?P<video>.+?)\.(lo|hi|hq)\.mp4', type:"video/mp4"},''', page)
- if video_url:
- thumbnails = re.findall(r'''\d+: \{src: "([^"]+)"(?: \|\| '[^']+')?, quality: '([^']+)'}''', page)
- if thumbnails:
- quality_key = qualities(['xs', 's', 'm', 'l', 'xl'])
- largest = max(thumbnails, key=lambda thumb: quality_key(thumb[1]))
- thumbnail = 'http://www.ndr.de' + largest[0]
+ live = playlist.get('config', {}).get('streamType') in ['httpVideoLive', 'httpAudioLive']
+ title = config['title']
+ if live:
+ title = self._live_title(title)
+ uploader = ppjson.get('config', {}).get('branding')
+ upload_date = ppjson.get('config', {}).get('publicationDate')
+ duration = int_or_none(config.get('duration'))
- for format_id in 'lo', 'hi', 'hq':
- formats.append({
- 'url': '%s.%s.mp4' % (video_url.group('video'), format_id),
- 'format_id': format_id,
- })
-
- if not formats:
- raise ExtractorError('No media links available for %s' % video_id)
+ thumbnails = [{
+ 'id': thumbnail.get('quality') or thumbnail_id,
+ 'url': thumbnail['src'],
+ 'preference': quality_key(thumbnail.get('quality')),
+ } for thumbnail_id, thumbnail in config.get('poster', {}).items() if thumbnail.get('src')]
return {
'id': video_id,
'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
+ 'is_live': live,
+ 'uploader': uploader if uploader != '-' else None,
+ 'upload_date': upload_date[0:8] if upload_date else None,
'duration': duration,
+ 'thumbnails': thumbnails,
'formats': formats,
}
+
+
+class NDREmbedIE(NDREmbedBaseIE):
+ IE_NAME = 'ndr:embed'
+ _VALID_URL = r'https?://www\.ndr\.de/(?:[^/]+/)*(?P<id>[\da-z]+)-(?:player|externalPlayer)\.html'
+ _TESTS = [{
+ 'url': 'http://www.ndr.de/fernsehen/sendungen/ndr_aktuell/ndraktuell28488-player.html',
+ 'md5': '8b9306142fe65bbdefb5ce24edb6b0a9',
+ 'info_dict': {
+ 'id': 'ndraktuell28488',
+ 'ext': 'mp4',
+ 'title': 'Norddeutschland begrüßt Flüchtlinge',
+ 'is_live': False,
+ 'uploader': 'ndrtv',
+ 'upload_date': '20150907',
+ 'duration': 132,
+ },
+ }, {
+ 'url': 'http://www.ndr.de/ndr2/events/soundcheck/soundcheck3366-player.html',
+ 'md5': '002085c44bae38802d94ae5802a36e78',
+ 'info_dict': {
+ 'id': 'soundcheck3366',
+ 'ext': 'mp4',
+ 'title': 'Ella Henderson braucht Vergleiche nicht zu scheuen',
+ 'is_live': False,
+ 'uploader': 'ndr2',
+ 'upload_date': '20150912',
+ 'duration': 3554,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://www.ndr.de/info/audio51535-player.html',
+ 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8',
+ 'info_dict': {
+ 'id': 'audio51535',
+ 'ext': 'mp3',
+ 'title': 'La Valette entgeht der Hinrichtung',
+ 'is_live': False,
+ 'uploader': 'ndrinfo',
+ 'upload_date': '20140729',
+ 'duration': 884,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://www.ndr.de/fernsehen/sendungen/visite/visite11010-externalPlayer.html',
+ 'md5': 'ae57f80511c1e1f2fd0d0d3d31aeae7c',
+ 'info_dict': {
+ 'id': 'visite11010',
+ 'ext': 'mp4',
+ 'title': 'Visite - die ganze Sendung',
+ 'is_live': False,
+ 'uploader': 'ndrtv',
+ 'upload_date': '20150902',
+ 'duration': 3525,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # httpVideoLive
+ 'url': 'http://www.ndr.de/fernsehen/livestream/livestream217-externalPlayer.html',
+ 'info_dict': {
+ 'id': 'livestream217',
+ 'ext': 'flv',
+ 'title': 're:^NDR Fernsehen Niedersachsen \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
+ 'is_live': True,
+ 'upload_date': '20150910',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://www.ndr.de/ndrkultur/audio255020-player.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.ndr.de/fernsehen/sendungen/nordtour/nordtour7124-player.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.ndr.de/kultur/film/videos/videoimport10424-player.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.ndr.de/fernsehen/sendungen/hamburg_journal/hamj43006-player.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.ndr.de/fernsehen/sendungen/weltbilder/weltbilder4518-player.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.ndr.de/fernsehen/doku952-player.html',
+ 'only_matching': True,
+ }]
+
+
+class NJoyEmbedIE(NDREmbedBaseIE):
+ IE_NAME = 'njoy:embed'
+ _VALID_URL = r'https?://www\.n-joy\.de/(?:[^/]+/)*(?P<id>[\da-z]+)-(?:player|externalPlayer)_[^/]+\.html'
+ _TESTS = [{
+ # httpVideo
+ 'url': 'http://www.n-joy.de/events/reeperbahnfestival/doku948-player_image-bc168e87-5263-4d6d-bd27-bb643005a6de_theme-n-joy.html',
+ 'md5': '8483cbfe2320bd4d28a349d62d88bd74',
+ 'info_dict': {
+ 'id': 'doku948',
+ 'ext': 'mp4',
+ 'title': 'Zehn Jahre Reeperbahn Festival - die Doku',
+ 'is_live': False,
+ 'upload_date': '20150807',
+ 'duration': 1011,
+ },
+ }, {
+ # httpAudio
+ 'url': 'http://www.n-joy.de/news_wissen/stefanrichter100-player_image-d5e938b1-f21a-4b9a-86b8-aaba8bca3a13_theme-n-joy.html',
+ 'md5': 'd989f80f28ac954430f7b8a48197188a',
+ 'info_dict': {
+ 'id': 'stefanrichter100',
+ 'ext': 'mp3',
+ 'title': 'Interview mit einem Augenzeugen',
+ 'is_live': False,
+ 'uploader': 'njoy',
+ 'upload_date': '20150909',
+ 'duration': 140,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # httpAudioLive, no explicit ext
+ 'url': 'http://www.n-joy.de/news_wissen/webradioweltweit100-player_image-3fec0484-2244-4565-8fb8-ed25fd28b173_theme-n-joy.html',
+ 'info_dict': {
+ 'id': 'webradioweltweit100',
+ 'ext': 'mp3',
+ 'title': 're:^N-JOY Weltweit \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
+ 'is_live': True,
+ 'uploader': 'njoy',
+ 'upload_date': '20150810',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://www.n-joy.de/musik/dockville882-player_image-3905259e-0803-4764-ac72-8b7de077d80a_theme-n-joy.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.n-joy.de/radio/sendungen/morningshow/urlaubsfotos190-player_image-066a5df1-5c95-49ec-a323-941d848718db_theme-n-joy.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.n-joy.de/entertainment/comedy/krudetv290-player_image-ab261bfe-51bf-4bf3-87ba-c5122ee35b3d_theme-n-joy.html',
+ 'only_matching': True,
+ }]
diff --git a/youtube_dl/extractor/neteasemusic.py b/youtube_dl/extractor/neteasemusic.py
new file mode 100644
index 000000000..15eca825a
--- /dev/null
+++ b/youtube_dl/extractor/neteasemusic.py
@@ -0,0 +1,459 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from hashlib import md5
+from base64 import b64encode
+from datetime import datetime
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse,
+ compat_str,
+ compat_itertools_count,
+)
+from ..utils import sanitized_Request
+
+
+class NetEaseMusicBaseIE(InfoExtractor):
+ _FORMATS = ['bMusic', 'mMusic', 'hMusic']
+ _NETEASE_SALT = '3go8&$8*3*3h0k(2)2'
+ _API_BASE = 'http://music.163.com/api/'
+
+ @classmethod
+ def _encrypt(cls, dfsid):
+ salt_bytes = bytearray(cls._NETEASE_SALT.encode('utf-8'))
+ string_bytes = bytearray(compat_str(dfsid).encode('ascii'))
+ salt_len = len(salt_bytes)
+ for i in range(len(string_bytes)):
+ string_bytes[i] = string_bytes[i] ^ salt_bytes[i % salt_len]
+ m = md5()
+ m.update(bytes(string_bytes))
+ result = b64encode(m.digest()).decode('ascii')
+ return result.replace('/', '_').replace('+', '-')
+
+ @classmethod
+ def extract_formats(cls, info):
+ formats = []
+ for song_format in cls._FORMATS:
+ details = info.get(song_format)
+ if not details:
+ continue
+ formats.append({
+ 'url': 'http://m5.music.126.net/%s/%s.%s' %
+ (cls._encrypt(details['dfsId']), details['dfsId'],
+ details['extension']),
+ 'ext': details.get('extension'),
+ 'abr': details.get('bitrate', 0) / 1000,
+ 'format_id': song_format,
+ 'filesize': details.get('size'),
+ 'asr': details.get('sr')
+ })
+ return formats
+
+ @classmethod
+ def convert_milliseconds(cls, ms):
+ return int(round(ms / 1000.0))
+
+ def query_api(self, endpoint, video_id, note):
+ req = sanitized_Request('%s%s' % (self._API_BASE, endpoint))
+ req.add_header('Referer', self._API_BASE)
+ return self._download_json(req, video_id, note)
+
+
+class NetEaseMusicIE(NetEaseMusicBaseIE):
+ IE_NAME = 'netease:song'
+ IE_DESC = '网易云音乐'
+ _VALID_URL = r'https?://music\.163\.com/(#/)?song\?id=(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'http://music.163.com/#/song?id=32102397',
+ 'md5': 'f2e97280e6345c74ba9d5677dd5dcb45',
+ 'info_dict': {
+ 'id': '32102397',
+ 'ext': 'mp3',
+ 'title': 'Bad Blood (feat. Kendrick Lamar)',
+ 'creator': 'Taylor Swift / Kendrick Lamar',
+ 'upload_date': '20150517',
+ 'timestamp': 1431878400,
+ 'description': 'md5:a10a54589c2860300d02e1de821eb2ef',
+ },
+ }, {
+ 'note': 'No lyrics translation.',
+ 'url': 'http://music.163.com/#/song?id=29822014',
+ 'info_dict': {
+ 'id': '29822014',
+ 'ext': 'mp3',
+ 'title': '听见下雨的声音',
+ 'creator': '周杰伦',
+ 'upload_date': '20141225',
+ 'timestamp': 1419523200,
+ 'description': 'md5:a4d8d89f44656af206b7b2555c0bce6c',
+ },
+ }, {
+ 'note': 'No lyrics.',
+ 'url': 'http://music.163.com/song?id=17241424',
+ 'info_dict': {
+ 'id': '17241424',
+ 'ext': 'mp3',
+ 'title': 'Opus 28',
+ 'creator': 'Dustin O\'Halloran',
+ 'upload_date': '20080211',
+ 'timestamp': 1202745600,
+ },
+ }, {
+ 'note': 'Has translated name.',
+ 'url': 'http://music.163.com/#/song?id=22735043',
+ 'info_dict': {
+ 'id': '22735043',
+ 'ext': 'mp3',
+ 'title': '소원을 말해봐 (Genie)',
+ 'creator': '少女时代',
+ 'description': 'md5:79d99cc560e4ca97e0c4d86800ee4184',
+ 'upload_date': '20100127',
+ 'timestamp': 1264608000,
+ 'alt_title': '说出愿望吧(Genie)',
+ }
+ }]
+
+ def _process_lyrics(self, lyrics_info):
+ original = lyrics_info.get('lrc', {}).get('lyric')
+ translated = lyrics_info.get('tlyric', {}).get('lyric')
+
+ if not translated:
+ return original
+
+ lyrics_expr = r'(\[[0-9]{2}:[0-9]{2}\.[0-9]{2,}\])([^\n]+)'
+ original_ts_texts = re.findall(lyrics_expr, original)
+ translation_ts_dict = dict(
+ (time_stamp, text) for time_stamp, text in re.findall(lyrics_expr, translated)
+ )
+ lyrics = '\n'.join([
+ '%s%s / %s' % (time_stamp, text, translation_ts_dict.get(time_stamp, ''))
+ for time_stamp, text in original_ts_texts
+ ])
+ return lyrics
+
+ def _real_extract(self, url):
+ song_id = self._match_id(url)
+
+ params = {
+ 'id': song_id,
+ 'ids': '[%s]' % song_id
+ }
+ info = self.query_api(
+ 'song/detail?' + compat_urllib_parse.urlencode(params),
+ song_id, 'Downloading song info')['songs'][0]
+
+ formats = self.extract_formats(info)
+ self._sort_formats(formats)
+
+ lyrics_info = self.query_api(
+ 'song/lyric?id=%s&lv=-1&tv=-1' % song_id,
+ song_id, 'Downloading lyrics data')
+ lyrics = self._process_lyrics(lyrics_info)
+
+ alt_title = None
+ if info.get('transNames'):
+ alt_title = '/'.join(info.get('transNames'))
+
+ return {
+ 'id': song_id,
+ 'title': info['name'],
+ 'alt_title': alt_title,
+ 'creator': ' / '.join([artist['name'] for artist in info.get('artists', [])]),
+ 'timestamp': self.convert_milliseconds(info.get('album', {}).get('publishTime')),
+ 'thumbnail': info.get('album', {}).get('picUrl'),
+ 'duration': self.convert_milliseconds(info.get('duration', 0)),
+ 'description': lyrics,
+ 'formats': formats,
+ }
+
+
+class NetEaseMusicAlbumIE(NetEaseMusicBaseIE):
+ IE_NAME = 'netease:album'
+ IE_DESC = '网易云音乐 - 专辑'
+ _VALID_URL = r'https?://music\.163\.com/(#/)?album\?id=(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://music.163.com/#/album?id=220780',
+ 'info_dict': {
+ 'id': '220780',
+ 'title': 'B\'day',
+ },
+ 'playlist_count': 23,
+ }
+
+ def _real_extract(self, url):
+ album_id = self._match_id(url)
+
+ info = self.query_api(
+ 'album/%s?id=%s' % (album_id, album_id),
+ album_id, 'Downloading album data')['album']
+
+ name = info['name']
+ desc = info.get('description')
+ entries = [
+ self.url_result('http://music.163.com/#/song?id=%s' % song['id'],
+ 'NetEaseMusic', song['id'])
+ for song in info['songs']
+ ]
+ return self.playlist_result(entries, album_id, name, desc)
+
+
+class NetEaseMusicSingerIE(NetEaseMusicBaseIE):
+ IE_NAME = 'netease:singer'
+ IE_DESC = '网易云音乐 - 歌手'
+ _VALID_URL = r'https?://music\.163\.com/(#/)?artist\?id=(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'note': 'Singer has aliases.',
+ 'url': 'http://music.163.com/#/artist?id=10559',
+ 'info_dict': {
+ 'id': '10559',
+ 'title': '张惠妹 - aMEI;阿密特',
+ },
+ 'playlist_count': 50,
+ }, {
+ 'note': 'Singer has translated name.',
+ 'url': 'http://music.163.com/#/artist?id=124098',
+ 'info_dict': {
+ 'id': '124098',
+ 'title': '李昇基 - 이승기',
+ },
+ 'playlist_count': 50,
+ }]
+
+ def _real_extract(self, url):
+ singer_id = self._match_id(url)
+
+ info = self.query_api(
+ 'artist/%s?id=%s' % (singer_id, singer_id),
+ singer_id, 'Downloading singer data')
+
+ name = info['artist']['name']
+ if info['artist']['trans']:
+ name = '%s - %s' % (name, info['artist']['trans'])
+ if info['artist']['alias']:
+ name = '%s - %s' % (name, ';'.join(info['artist']['alias']))
+
+ entries = [
+ self.url_result('http://music.163.com/#/song?id=%s' % song['id'],
+ 'NetEaseMusic', song['id'])
+ for song in info['hotSongs']
+ ]
+ return self.playlist_result(entries, singer_id, name)
+
+
+class NetEaseMusicListIE(NetEaseMusicBaseIE):
+ IE_NAME = 'netease:playlist'
+ IE_DESC = '网易云音乐 - 歌单'
+ _VALID_URL = r'https?://music\.163\.com/(#/)?(playlist|discover/toplist)\?id=(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'http://music.163.com/#/playlist?id=79177352',
+ 'info_dict': {
+ 'id': '79177352',
+ 'title': 'Billboard 2007 Top 100',
+ 'description': 'md5:12fd0819cab2965b9583ace0f8b7b022'
+ },
+ 'playlist_count': 99,
+ }, {
+ 'note': 'Toplist/Charts sample',
+ 'url': 'http://music.163.com/#/discover/toplist?id=3733003',
+ 'info_dict': {
+ 'id': '3733003',
+ 'title': 're:韩国Melon排行榜周榜 [0-9]{4}-[0-9]{2}-[0-9]{2}',
+ 'description': 'md5:73ec782a612711cadc7872d9c1e134fc',
+ },
+ 'playlist_count': 50,
+ }]
+
+ def _real_extract(self, url):
+ list_id = self._match_id(url)
+
+ info = self.query_api(
+ 'playlist/detail?id=%s&lv=-1&tv=-1' % list_id,
+ list_id, 'Downloading playlist data')['result']
+
+ name = info['name']
+ desc = info.get('description')
+
+ if info.get('specialType') == 10: # is a chart/toplist
+ datestamp = datetime.fromtimestamp(
+ self.convert_milliseconds(info['updateTime'])).strftime('%Y-%m-%d')
+ name = '%s %s' % (name, datestamp)
+
+ entries = [
+ self.url_result('http://music.163.com/#/song?id=%s' % song['id'],
+ 'NetEaseMusic', song['id'])
+ for song in info['tracks']
+ ]
+ return self.playlist_result(entries, list_id, name, desc)
+
+
+class NetEaseMusicMvIE(NetEaseMusicBaseIE):
+ IE_NAME = 'netease:mv'
+ IE_DESC = '网易云音乐 - MV'
+ _VALID_URL = r'https?://music\.163\.com/(#/)?mv\?id=(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://music.163.com/#/mv?id=415350',
+ 'info_dict': {
+ 'id': '415350',
+ 'ext': 'mp4',
+ 'title': '이럴거면 그러지말지',
+ 'description': '白雅言自作曲唱甜蜜爱情',
+ 'creator': '白雅言',
+ 'upload_date': '20150520',
+ },
+ }
+
+ def _real_extract(self, url):
+ mv_id = self._match_id(url)
+
+ info = self.query_api(
+ 'mv/detail?id=%s&type=mp4' % mv_id,
+ mv_id, 'Downloading mv info')['data']
+
+ formats = [
+ {'url': mv_url, 'ext': 'mp4', 'format_id': '%sp' % brs, 'height': int(brs)}
+ for brs, mv_url in info['brs'].items()
+ ]
+ self._sort_formats(formats)
+
+ return {
+ 'id': mv_id,
+ 'title': info['name'],
+ 'description': info.get('desc') or info.get('briefDesc'),
+ 'creator': info['artistName'],
+ 'upload_date': info['publishTime'].replace('-', ''),
+ 'formats': formats,
+ 'thumbnail': info.get('cover'),
+ 'duration': self.convert_milliseconds(info.get('duration', 0)),
+ }
+
+
+class NetEaseMusicProgramIE(NetEaseMusicBaseIE):
+ IE_NAME = 'netease:program'
+ IE_DESC = '网易云音乐 - 电台节目'
+ _VALID_URL = r'https?://music\.163\.com/(#/?)program\?id=(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'http://music.163.com/#/program?id=10109055',
+ 'info_dict': {
+ 'id': '10109055',
+ 'ext': 'mp3',
+ 'title': '不丹足球背后的故事',
+ 'description': '喜马拉雅人的足球梦 ...',
+ 'creator': '大话西藏',
+ 'timestamp': 1434179342,
+ 'upload_date': '20150613',
+ 'duration': 900,
+ },
+ }, {
+ 'note': 'This program has accompanying songs.',
+ 'url': 'http://music.163.com/#/program?id=10141022',
+ 'info_dict': {
+ 'id': '10141022',
+ 'title': '25岁,你是自在如风的少年<27°C>',
+ 'description': 'md5:8d594db46cc3e6509107ede70a4aaa3b',
+ },
+ 'playlist_count': 4,
+ }, {
+ 'note': 'This program has accompanying songs.',
+ 'url': 'http://music.163.com/#/program?id=10141022',
+ 'info_dict': {
+ 'id': '10141022',
+ 'ext': 'mp3',
+ 'title': '25岁,你是自在如风的少年<27°C>',
+ 'description': 'md5:8d594db46cc3e6509107ede70a4aaa3b',
+ 'timestamp': 1434450841,
+ 'upload_date': '20150616',
+ },
+ 'params': {
+ 'noplaylist': True
+ }
+ }]
+
+ def _real_extract(self, url):
+ program_id = self._match_id(url)
+
+ info = self.query_api(
+ 'dj/program/detail?id=%s' % program_id,
+ program_id, 'Downloading program info')['program']
+
+ name = info['name']
+ description = info['description']
+
+ if not info['songs'] or self._downloader.params.get('noplaylist'):
+ if info['songs']:
+ self.to_screen(
+ 'Downloading just the main audio %s because of --no-playlist'
+ % info['mainSong']['id'])
+
+ formats = self.extract_formats(info['mainSong'])
+ self._sort_formats(formats)
+
+ return {
+ 'id': program_id,
+ 'title': name,
+ 'description': description,
+ 'creator': info['dj']['brand'],
+ 'timestamp': self.convert_milliseconds(info['createTime']),
+ 'thumbnail': info['coverUrl'],
+ 'duration': self.convert_milliseconds(info.get('duration', 0)),
+ 'formats': formats,
+ }
+
+ self.to_screen(
+ 'Downloading playlist %s - add --no-playlist to just download the main audio %s'
+ % (program_id, info['mainSong']['id']))
+
+ song_ids = [info['mainSong']['id']]
+ song_ids.extend([song['id'] for song in info['songs']])
+ entries = [
+ self.url_result('http://music.163.com/#/song?id=%s' % song_id,
+ 'NetEaseMusic', song_id)
+ for song_id in song_ids
+ ]
+ return self.playlist_result(entries, program_id, name, description)
+
+
+class NetEaseMusicDjRadioIE(NetEaseMusicBaseIE):
+ IE_NAME = 'netease:djradio'
+ IE_DESC = '网易云音乐 - 电台'
+ _VALID_URL = r'https?://music\.163\.com/(#/)?djradio\?id=(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://music.163.com/#/djradio?id=42',
+ 'info_dict': {
+ 'id': '42',
+ 'title': '声音蔓延',
+ 'description': 'md5:766220985cbd16fdd552f64c578a6b15'
+ },
+ 'playlist_mincount': 40,
+ }
+ _PAGE_SIZE = 1000
+
+ def _real_extract(self, url):
+ dj_id = self._match_id(url)
+
+ name = None
+ desc = None
+ entries = []
+ for offset in compat_itertools_count(start=0, step=self._PAGE_SIZE):
+ info = self.query_api(
+ 'dj/program/byradio?asc=false&limit=%d&radioId=%s&offset=%d'
+ % (self._PAGE_SIZE, dj_id, offset),
+ dj_id, 'Downloading dj programs - %d' % offset)
+
+ entries.extend([
+ self.url_result(
+ 'http://music.163.com/#/program?id=%s' % program['id'],
+ 'NetEaseMusicProgram', program['id'])
+ for program in info['programs']
+ ])
+
+ if name is None:
+ radio = info['programs'][0]['radio']
+ name = radio['name']
+ desc = radio['desc']
+
+ if not info['more']:
+ break
+
+ return self.playlist_result(entries, dj_id, name, desc)
diff --git a/youtube_dl/extractor/netzkino.py b/youtube_dl/extractor/netzkino.py
index 93567d1e3..0d165a82a 100644
--- a/youtube_dl/extractor/netzkino.py
+++ b/youtube_dl/extractor/netzkino.py
@@ -29,6 +29,9 @@ class NetzkinoIE(InfoExtractor):
'timestamp': 1344858571,
'age_limit': 12,
},
+ 'params': {
+ 'skip_download': 'Download only works from Germany',
+ }
}
def _real_extract(self, url):
@@ -46,7 +49,7 @@ class NetzkinoIE(InfoExtractor):
'http://www.netzkino.de/beta/dist/production.min.js', video_id,
note='Downloading player code')
avo_js = self._search_regex(
- r'window\.avoCore\s*=.*?urlTemplate:\s*(\{.*?"\})',
+ r'var urlTemplate=(\{.*?"\})',
production_js, 'URL templates')
templates = self._parse_json(
avo_js, video_id, transform_source=js_to_json)
diff --git a/youtube_dl/extractor/newstube.py b/youtube_dl/extractor/newstube.py
index 85fcad06b..5a9e73cd6 100644
--- a/youtube_dl/extractor/newstube.py
+++ b/youtube_dl/extractor/newstube.py
@@ -31,7 +31,7 @@ class NewstubeIE(InfoExtractor):
page = self._download_webpage(url, video_id, 'Downloading page')
video_guid = self._html_search_regex(
- r'<meta property="og:video" content="https?://(?:www\.)?newstube\.ru/freshplayer\.swf\?guid=(?P<guid>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',
+ r'<meta property="og:video:url" content="https?://(?:www\.)?newstube\.ru/freshplayer\.swf\?guid=(?P<guid>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',
page, 'video GUID')
player = self._download_xml(
diff --git a/youtube_dl/extractor/nextmedia.py b/youtube_dl/extractor/nextmedia.py
index 02dba4ef6..d1688457f 100644
--- a/youtube_dl/extractor/nextmedia.py
+++ b/youtube_dl/extractor/nextmedia.py
@@ -6,6 +6,7 @@ from ..utils import parse_iso8601
class NextMediaIE(InfoExtractor):
+ IE_DESC = '蘋果日報'
_VALID_URL = r'http://hk.apple.nextmedia.com/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)'
_TESTS = [{
'url': 'http://hk.apple.nextmedia.com/realtime/news/20141108/53109199',
@@ -66,6 +67,7 @@ class NextMediaIE(InfoExtractor):
class NextMediaActionNewsIE(NextMediaIE):
+ IE_DESC = '蘋果日報 - 動新聞'
_VALID_URL = r'http://hk.dv.nextmedia.com/actionnews/[^/]+/(?P<date>\d+)/(?P<id>\d+)/\d+'
_TESTS = [{
'url': 'http://hk.dv.nextmedia.com/actionnews/hit/20150121/19009428/20061460',
@@ -89,8 +91,9 @@ class NextMediaActionNewsIE(NextMediaIE):
return self._extract_from_nextmedia_page(news_id, url, article_page)
-class AppleDailyRealtimeNewsIE(NextMediaIE):
- _VALID_URL = r'http://(www|ent).appledaily.com.tw/(realtimenews|enews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?'
+class AppleDailyIE(NextMediaIE):
+ IE_DESC = '臺灣蘋果日報'
+ _VALID_URL = r'http://(www|ent).appledaily.com.tw/(?:animation|appledaily|enews|realtimenews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?'
_TESTS = [{
'url': 'http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694',
'md5': 'a843ab23d150977cc55ef94f1e2c1e4d',
@@ -99,7 +102,7 @@ class AppleDailyRealtimeNewsIE(NextMediaIE):
'ext': 'mp4',
'title': '周亭羽走過摩鐵陰霾2男陪吃 九把刀孤寒看醫生',
'thumbnail': 're:^https?://.*\.jpg$',
- 'description': 'md5:b23787119933404ce515c6356a8c355c',
+ 'description': 'md5:2acd430e59956dc47cd7f67cb3c003f4',
'upload_date': '20150128',
}
}, {
@@ -110,26 +113,10 @@ class AppleDailyRealtimeNewsIE(NextMediaIE):
'ext': 'mp4',
'title': '不滿被踩腳 山東兩大媽一路打下車',
'thumbnail': 're:^https?://.*\.jpg$',
- 'description': 'md5:2648aaf6fc4f401f6de35a91d111aa1d',
+ 'description': 'md5:175b4260c1d7c085993474217e4ab1b4',
'upload_date': '20150128',
}
- }]
-
- _URL_PATTERN = r'\{url: \'(.+)\'\}'
-
- def _fetch_title(self, page):
- return self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title')
-
- def _fetch_thumbnail(self, page):
- return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False)
-
- def _fetch_timestamp(self, page):
- return None
-
-
-class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE):
- _VALID_URL = 'http://www.appledaily.com.tw/animation/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?'
- _TESTS = [{
+ }, {
'url': 'http://www.appledaily.com.tw/animation/realtimenews/new/20150128/5003671',
'md5': '03df296d95dedc2d5886debbb80cb43f',
'info_dict': {
@@ -139,7 +126,8 @@ class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE):
'thumbnail': 're:^https?://.*\.jpg$',
'description': 'md5:23c0aac567dc08c9c16a3161a2c2e3cd',
'upload_date': '20150128',
- }
+ },
+ 'skip': 'redirect to http://www.appledaily.com.tw/animation/',
}, {
# No thumbnail
'url': 'http://www.appledaily.com.tw/animation/realtimenews/new/20150128/5003673/',
@@ -153,11 +141,32 @@ class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE):
},
'expected_warnings': [
'video thumbnail',
- ]
+ ],
+ 'skip': 'redirect to http://www.appledaily.com.tw/animation/',
+ }, {
+ 'url': 'http://www.appledaily.com.tw/appledaily/article/supplement/20140417/35770334/',
+ 'md5': 'eaa20e6b9df418c912d7f5dec2ba734d',
+ 'info_dict': {
+ 'id': '35770334',
+ 'ext': 'mp4',
+ 'title': '咖啡占卜測 XU裝熟指數',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'description': 'md5:7b859991a6a4fedbdf3dd3b66545c748',
+ 'upload_date': '20140417',
+ },
}]
+ _URL_PATTERN = r'\{url: \'(.+)\'\}'
+
def _fetch_title(self, page):
- return self._html_search_meta('description', page, 'news title')
+ return (self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title', default=None) or
+ self._html_search_meta('description', page, 'news title'))
+
+ def _fetch_thumbnail(self, page):
+ return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False)
+
+ def _fetch_timestamp(self, page):
+ return None
def _fetch_description(self, page):
return self._html_search_meta('description', page, 'news description')
diff --git a/youtube_dl/extractor/nfb.py b/youtube_dl/extractor/nfb.py
index ea077254b..5bd15f7a7 100644
--- a/youtube_dl/extractor/nfb.py
+++ b/youtube_dl/extractor/nfb.py
@@ -1,10 +1,8 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_request,
- compat_urllib_parse,
-)
+from ..compat import compat_urllib_parse
+from ..utils import sanitized_Request
class NFBIE(InfoExtractor):
@@ -40,8 +38,9 @@ class NFBIE(InfoExtractor):
uploader = self._html_search_regex(r'<em class="director-name" itemprop="name">([^<]+)</em>',
page, 'director name', fatal=False)
- request = compat_urllib_request.Request('https://www.nfb.ca/film/%s/player_config' % video_id,
- compat_urllib_parse.urlencode({'getConfig': 'true'}).encode('ascii'))
+ request = sanitized_Request(
+ 'https://www.nfb.ca/film/%s/player_config' % video_id,
+ compat_urllib_parse.urlencode({'getConfig': 'true'}).encode('ascii'))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
request.add_header('X-NFB-Referer', 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf')
diff --git a/youtube_dl/extractor/nfl.py b/youtube_dl/extractor/nfl.py
index 2684dd250..200874d68 100644
--- a/youtube_dl/extractor/nfl.py
+++ b/youtube_dl/extractor/nfl.py
@@ -16,49 +16,118 @@ from ..utils import (
class NFLIE(InfoExtractor):
IE_NAME = 'nfl.com'
- _VALID_URL = r'''(?x)https?://
- (?P<host>(?:www\.)?(?:nfl\.com|.*?\.clubs\.nfl\.com))/
- (?:.+?/)*
- (?P<id>(?:\d[a-z]{2}\d{13}|\w{8}\-(?:\w{4}\-){3}\w{12}))'''
- _TESTS = [
- {
- 'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights',
- 'md5': '394ef771ddcd1354f665b471d78ec4c6',
- 'info_dict': {
- 'id': '0ap3000000398478',
- 'ext': 'mp4',
- 'title': 'Week 3: Redskins vs. Eagles highlights',
- 'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478',
- 'upload_date': '20140921',
- 'timestamp': 1411337580,
- 'thumbnail': 're:^https?://.*\.jpg$',
- }
+ _VALID_URL = r'''(?x)
+ https?://
+ (?P<host>
+ (?:www\.)?
+ (?:
+ (?:
+ nfl|
+ buffalobills|
+ miamidolphins|
+ patriots|
+ newyorkjets|
+ baltimoreravens|
+ bengals|
+ clevelandbrowns|
+ steelers|
+ houstontexans|
+ colts|
+ jaguars|
+ titansonline|
+ denverbroncos|
+ kcchiefs|
+ raiders|
+ chargers|
+ dallascowboys|
+ giants|
+ philadelphiaeagles|
+ redskins|
+ chicagobears|
+ detroitlions|
+ packers|
+ vikings|
+ atlantafalcons|
+ panthers|
+ neworleanssaints|
+ buccaneers|
+ azcardinals|
+ stlouisrams|
+ 49ers|
+ seahawks
+ )\.com|
+ .+?\.clubs\.nfl\.com
+ )
+ )/
+ (?:.+?/)*
+ (?P<id>[^/#?&]+)
+ '''
+ _TESTS = [{
+ 'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights',
+ 'md5': '394ef771ddcd1354f665b471d78ec4c6',
+ 'info_dict': {
+ 'id': '0ap3000000398478',
+ 'ext': 'mp4',
+ 'title': 'Week 3: Redskins vs. Eagles highlights',
+ 'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478',
+ 'upload_date': '20140921',
+ 'timestamp': 1411337580,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ }
+ }, {
+ 'url': 'http://prod.www.steelers.clubs.nfl.com/video-and-audio/videos/LIVE_Post_Game_vs_Browns/9d72f26a-9e2b-4718-84d3-09fb4046c266',
+ 'md5': 'cf85bdb4bc49f6e9d3816d130c78279c',
+ 'info_dict': {
+ 'id': '9d72f26a-9e2b-4718-84d3-09fb4046c266',
+ 'ext': 'mp4',
+ 'title': 'LIVE: Post Game vs. Browns',
+ 'description': 'md5:6a97f7e5ebeb4c0e69a418a89e0636e8',
+ 'upload_date': '20131229',
+ 'timestamp': 1388354455,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ }
+ }, {
+ 'url': 'http://www.nfl.com/news/story/0ap3000000467586/article/patriots-seahawks-involved-in-lategame-skirmish',
+ 'info_dict': {
+ 'id': '0ap3000000467607',
+ 'ext': 'mp4',
+ 'title': 'Frustrations flare on the field',
+ 'description': 'Emotions ran high at the end of the Super Bowl on both sides of the ball after a dramatic finish.',
+ 'timestamp': 1422850320,
+ 'upload_date': '20150202',
+ },
+ }, {
+ 'url': 'http://www.patriots.com/video/2015/09/18/10-days-gillette',
+ 'md5': '4c319e2f625ffd0b481b4382c6fc124c',
+ 'info_dict': {
+ 'id': 'n-238346',
+ 'ext': 'mp4',
+ 'title': '10 Days at Gillette',
+ 'description': 'md5:8cd9cd48fac16de596eadc0b24add951',
+ 'timestamp': 1442618809,
+ 'upload_date': '20150918',
},
- {
- 'url': 'http://prod.www.steelers.clubs.nfl.com/video-and-audio/videos/LIVE_Post_Game_vs_Browns/9d72f26a-9e2b-4718-84d3-09fb4046c266',
- 'md5': 'cf85bdb4bc49f6e9d3816d130c78279c',
- 'info_dict': {
- 'id': '9d72f26a-9e2b-4718-84d3-09fb4046c266',
- 'ext': 'mp4',
- 'title': 'LIVE: Post Game vs. Browns',
- 'description': 'md5:6a97f7e5ebeb4c0e69a418a89e0636e8',
- 'upload_date': '20131229',
- 'timestamp': 1388354455,
- 'thumbnail': 're:^https?://.*\.jpg$',
- }
+ }, {
+ # lowercase data-contentid
+ 'url': 'http://www.steelers.com/news/article-1/Tomlin-on-Ben-getting-Vick-ready/56399c96-4160-48cf-a7ad-1d17d4a3aef7',
+ 'info_dict': {
+ 'id': '12693586-6ea9-4743-9c1c-02c59e4a5ef2',
+ 'ext': 'mp4',
+ 'title': 'Tomlin looks ahead to Ravens on a short week',
+ 'description': 'md5:32f3f7b139f43913181d5cbb24ecad75',
+ 'timestamp': 1443459651,
+ 'upload_date': '20150928',
},
- {
- 'url': 'http://www.nfl.com/news/story/0ap3000000467586/article/patriots-seahawks-involved-in-lategame-skirmish',
- 'info_dict': {
- 'id': '0ap3000000467607',
- 'ext': 'mp4',
- 'title': 'Frustrations flare on the field',
- 'description': 'Emotions ran high at the end of the Super Bowl on both sides of the ball after a dramatic finish.',
- 'timestamp': 1422850320,
- 'upload_date': '20150202',
- },
+ 'params': {
+ 'skip_download': True,
},
- ]
+ }, {
+ 'url': 'http://www.nfl.com/videos/nfl-network-top-ten/09000d5d810a6bd4/Top-10-Gutsiest-Performances-Jack-Youngblood',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.buffalobills.com/video/videos/Rex_Ryan_Show_World_Wide_Rex/b1dcfab2-3190-4bb1-bfc0-d6e603d6601a',
+ 'only_matching': True,
+ }]
@staticmethod
def prepend_host(host, url):
@@ -91,13 +160,14 @@ class NFLIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
config_url = NFLIE.prepend_host(host, self._search_regex(
- r'(?:config|configURL)\s*:\s*"([^"]+)"', webpage, 'config URL',
- default='static/content/static/config/video/config.json'))
+ r'(?:(?:config|configURL)\s*:\s*|<nflcs:avplayer[^>]+data-config\s*=\s*)(["\'])(?P<config>.+?)\1',
+ webpage, 'config URL', default='static/content/static/config/video/config.json',
+ group='config'))
# For articles, the id in the url is not the video id
video_id = self._search_regex(
- r'contentId\s*:\s*"([^"]+)"', webpage, 'video id', default=video_id)
- config = self._download_json(config_url, video_id,
- note='Downloading player config')
+ r'(?:<nflcs:avplayer[^>]+data-content[Ii]d\s*=\s*|content[Ii]d\s*:\s*)(["\'])(?P<id>.+?)\1',
+ webpage, 'video id', default=video_id, group='id')
+ config = self._download_json(config_url, video_id, 'Downloading player config')
url_template = NFLIE.prepend_host(
host, '{contentURLTemplate:}'.format(**config))
video_data = self._download_json(
diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py
index 407465998..e98a5ef89 100644
--- a/youtube_dl/extractor/nhl.py
+++ b/youtube_dl/extractor/nhl.py
@@ -21,6 +21,9 @@ class NHLBaseInfoExtractor(InfoExtractor):
return json_string.replace('\\\'', '\'')
def _real_extract_video(self, video_id):
+ vid_parts = video_id.split(',')
+ if len(vid_parts) == 3:
+ video_id = '%s0%s%s-X-h' % (vid_parts[0][:4], vid_parts[1], vid_parts[2].rjust(4, '0'))
json_url = 'http://video.nhl.com/videocenter/servlets/playlist?ids=%s&format=json' % video_id
data = self._download_json(
json_url, video_id, transform_source=self._fix_json)
@@ -47,7 +50,7 @@ class NHLBaseInfoExtractor(InfoExtractor):
video_url = initial_video_url
join = compat_urlparse.urljoin
- return {
+ ret = {
'id': video_id,
'title': info['name'],
'url': video_url,
@@ -56,11 +59,20 @@ class NHLBaseInfoExtractor(InfoExtractor):
'thumbnail': join(join(video_url, '/u/'), info['bigImage']),
'upload_date': unified_strdate(info['releaseDate'].split('.')[0]),
}
+ if video_url.startswith('rtmp:'):
+ mobj = re.match(r'(?P<tc_url>rtmp://[^/]+/(?P<app>[a-z0-9/]+))/(?P<play_path>mp4:.*)', video_url)
+ ret.update({
+ 'tc_url': mobj.group('tc_url'),
+ 'play_path': mobj.group('play_path'),
+ 'app': mobj.group('app'),
+ 'no_resume': True,
+ })
+ return ret
class NHLIE(NHLBaseInfoExtractor):
IE_NAME = 'nhl.com'
- _VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/(?:console)?(?:\?(?:.*?[?&])?)id=(?P<id>[-0-9a-zA-Z]+)'
+ _VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/(?:console|embed)?(?:\?(?:.*?[?&])?)(?:id|hlg|playlist)=(?P<id>[-0-9a-zA-Z,]+)'
_TESTS = [{
'url': 'http://video.canucks.nhl.com/videocenter/console?catid=6?id=453614',
@@ -101,6 +113,32 @@ class NHLIE(NHLBaseInfoExtractor):
}, {
'url': 'http://video.nhl.com/videocenter/?id=736722',
'only_matching': True,
+ }, {
+ 'url': 'http://video.nhl.com/videocenter/console?hlg=20142015,2,299&lang=en',
+ 'md5': '076fcb88c255154aacbf0a7accc3f340',
+ 'info_dict': {
+ 'id': '2014020299-X-h',
+ 'ext': 'mp4',
+ 'title': 'Penguins at Islanders / Game Highlights',
+ 'description': 'Home broadcast - Pittsburgh Penguins at New York Islanders - November 22, 2014',
+ 'duration': 268,
+ 'upload_date': '20141122',
+ }
+ }, {
+ 'url': 'http://video.oilers.nhl.com/videocenter/console?id=691469&catid=4',
+ 'info_dict': {
+ 'id': '691469',
+ 'ext': 'mp4',
+ 'title': 'RAW | Craig MacTavish Full Press Conference',
+ 'description': 'Oilers GM Craig MacTavish addresses the media at Rexall Place on Friday.',
+ 'upload_date': '20141205',
+ },
+ 'params': {
+ 'skip_download': True, # Requires rtmpdump
+ }
+ }, {
+ 'url': 'http://video.nhl.com/videocenter/embed?playlist=836127',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -111,9 +149,9 @@ class NHLIE(NHLBaseInfoExtractor):
class NHLNewsIE(NHLBaseInfoExtractor):
IE_NAME = 'nhl.com:news'
IE_DESC = 'NHL news'
- _VALID_URL = r'https?://(?:www\.)?nhl\.com/ice/news\.html?(?:\?(?:.*?[?&])?)id=(?P<id>[-0-9a-zA-Z]+)'
+ _VALID_URL = r'https?://(?:.+?\.)?nhl\.com/(?:ice|club)/news\.html?(?:\?(?:.*?[?&])?)id=(?P<id>[-0-9a-zA-Z]+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.nhl.com/ice/news.htm?id=750727',
'md5': '4b3d1262e177687a3009937bd9ec0be8',
'info_dict': {
@@ -124,13 +162,26 @@ class NHLNewsIE(NHLBaseInfoExtractor):
'duration': 37,
'upload_date': '20150128',
},
- }
+ }, {
+ # iframe embed
+ 'url': 'http://sabres.nhl.com/club/news.htm?id=780189',
+ 'md5': '9f663d1c006c90ac9fb82777d4294e12',
+ 'info_dict': {
+ 'id': '836127',
+ 'ext': 'mp4',
+ 'title': 'Morning Skate: OTT vs. BUF (9/23/15)',
+ 'description': "Brian Duff chats with Tyler Ennis prior to Buffalo's first preseason home game.",
+ 'duration': 93,
+ 'upload_date': '20150923',
+ },
+ }]
def _real_extract(self, url):
news_id = self._match_id(url)
webpage = self._download_webpage(url, news_id)
video_id = self._search_regex(
- [r'pVid(\d+)', r"nlid\s*:\s*'(\d+)'"],
+ [r'pVid(\d+)', r"nlid\s*:\s*'(\d+)'",
+ r'<iframe[^>]+src=["\']https?://video.*?\.nhl\.com/videocenter/embed\?.*\bplaylist=(\d+)'],
webpage, 'video id')
return self._real_extract_video(video_id)
diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py
index 4c1890416..586e52a4a 100644
--- a/youtube_dl/extractor/niconico.py
+++ b/youtube_dl/extractor/niconico.py
@@ -3,18 +3,22 @@ from __future__ import unicode_literals
import re
import json
+import datetime
from .common import InfoExtractor
from ..compat import (
compat_urllib_parse,
- compat_urllib_request,
compat_urlparse,
)
from ..utils import (
+ encode_dict,
ExtractorError,
int_or_none,
parse_duration,
- unified_strdate,
+ parse_iso8601,
+ sanitized_Request,
+ xpath_text,
+ determine_ext,
)
@@ -22,7 +26,7 @@ class NiconicoIE(InfoExtractor):
IE_NAME = 'niconico'
IE_DESC = 'ニコニコ動画'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.nicovideo.jp/watch/sm22312215',
'md5': 'd1a75c0823e2f629128c43e1212760f9',
'info_dict': {
@@ -32,16 +36,53 @@ class NiconicoIE(InfoExtractor):
'uploader': 'takuya0301',
'uploader_id': '2698420',
'upload_date': '20131123',
+ 'timestamp': 1385182762,
'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org',
'duration': 33,
},
- 'params': {
- 'username': 'ydl.niconico@gmail.com',
- 'password': 'youtube-dl',
+ }, {
+ # File downloaded with and without credentials are different, so omit
+ # the md5 field
+ 'url': 'http://www.nicovideo.jp/watch/nm14296458',
+ 'info_dict': {
+ 'id': 'nm14296458',
+ 'ext': 'swf',
+ 'title': '【鏡音リン】Dance on media【オリジナル】take2!',
+ 'description': 'md5:689f066d74610b3b22e0f1739add0f58',
+ 'uploader': 'りょうた',
+ 'uploader_id': '18822557',
+ 'upload_date': '20110429',
+ 'timestamp': 1304065916,
+ 'duration': 209,
},
- }
+ }, {
+ # 'video exists but is marked as "deleted"
+ # md5 is unstable
+ 'url': 'http://www.nicovideo.jp/watch/sm10000',
+ 'info_dict': {
+ 'id': 'sm10000',
+ 'ext': 'unknown_video',
+ 'description': 'deleted',
+ 'title': 'ドラえもんエターナル第3話「決戦第3新東京市」<前編>',
+ 'upload_date': '20071224',
+ 'timestamp': 1198527840, # timestamp field has different value if logged in
+ 'duration': 304,
+ },
+ }, {
+ 'url': 'http://www.nicovideo.jp/watch/so22543406',
+ 'info_dict': {
+ 'id': '1388129933',
+ 'ext': 'mp4',
+ 'title': '【第1回】RADIOアニメロミックス ラブライブ!~のぞえりRadio Garden~',
+ 'description': 'md5:b27d224bb0ff53d3c8269e9f8b561cf1',
+ 'timestamp': 1388851200,
+ 'upload_date': '20140104',
+ 'uploader': 'アニメロチャンネル',
+ 'uploader_id': '312',
+ }
+ }]
- _VALID_URL = r'https?://(?:www\.|secure\.)?nicovideo\.jp/watch/((?:[a-z]{2})?[0-9]+)'
+ _VALID_URL = r'https?://(?:www\.|secure\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)'
_NETRC_MACHINE = 'niconico'
# Determine whether the downloader used authentication to download video
_AUTHENTICATED = False
@@ -60,11 +101,8 @@ class NiconicoIE(InfoExtractor):
'mail': username,
'password': password,
}
- # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
- # chokes on unicode
- login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
- login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8')
- request = compat_urllib_request.Request(
+ login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('utf-8')
+ request = sanitized_Request(
'https://secure.nicovideo.jp/secure/login', login_data)
login_results = self._download_webpage(
request, None, note='Logging in', errnote='Unable to log in')
@@ -76,12 +114,15 @@ class NiconicoIE(InfoExtractor):
return True
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group(1)
+ video_id = self._match_id(url)
- # Get video webpage. We are not actually interested in it, but need
- # the cookies in order to be able to download the info webpage
- self._download_webpage('http://www.nicovideo.jp/watch/' + video_id, video_id)
+ # Get video webpage. We are not actually interested in it for normal
+ # cases, but need the cookies in order to be able to download the
+ # info webpage
+ webpage, handle = self._download_webpage_handle(
+ 'http://www.nicovideo.jp/watch/' + video_id, video_id)
+ if video_id.startswith('so'):
+ video_id = self._match_id(handle.geturl())
video_info = self._download_xml(
'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id,
@@ -90,7 +131,7 @@ class NiconicoIE(InfoExtractor):
if self._AUTHENTICATED:
# Get flv info
flv_info_webpage = self._download_webpage(
- 'http://flapi.nicovideo.jp/api/getflv?v=' + video_id,
+ 'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1',
video_id, 'Downloading flv info')
else:
# Get external player info
@@ -104,29 +145,84 @@ class NiconicoIE(InfoExtractor):
'k': thumb_play_key,
'v': video_id
})
- flv_info_request = compat_urllib_request.Request(
+ flv_info_request = sanitized_Request(
'http://ext.nicovideo.jp/thumb_watch', flv_info_data,
{'Content-Type': 'application/x-www-form-urlencoded'})
flv_info_webpage = self._download_webpage(
flv_info_request, video_id,
note='Downloading flv info', errnote='Unable to download flv info')
- if 'deleted=' in flv_info_webpage:
- raise ExtractorError('The video has been deleted.',
- expected=True)
- video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0]
+ flv_info = compat_urlparse.parse_qs(flv_info_webpage)
+ if 'url' not in flv_info:
+ if 'deleted' in flv_info:
+ raise ExtractorError('The video has been deleted.',
+ expected=True)
+ else:
+ raise ExtractorError('Unable to find video URL')
+
+ video_real_url = flv_info['url'][0]
# Start extracting information
- title = video_info.find('.//title').text
- extension = video_info.find('.//movie_type').text
- video_format = extension.upper()
- thumbnail = video_info.find('.//thumbnail_url').text
- description = video_info.find('.//description').text
- upload_date = unified_strdate(video_info.find('.//first_retrieve').text.split('+')[0])
- view_count = int_or_none(video_info.find('.//view_counter').text)
- comment_count = int_or_none(video_info.find('.//comment_num').text)
- duration = parse_duration(video_info.find('.//length').text)
- webpage_url = video_info.find('.//watch_url').text
+ title = xpath_text(video_info, './/title')
+ if not title:
+ title = self._og_search_title(webpage, default=None)
+ if not title:
+ title = self._html_search_regex(
+ r'<span[^>]+class="videoHeaderTitle"[^>]*>([^<]+)</span>',
+ webpage, 'video title')
+
+ watch_api_data_string = self._html_search_regex(
+ r'<div[^>]+id="watchAPIDataContainer"[^>]+>([^<]+)</div>',
+ webpage, 'watch api data', default=None)
+ watch_api_data = self._parse_json(watch_api_data_string, video_id) if watch_api_data_string else {}
+ video_detail = watch_api_data.get('videoDetail', {})
+
+ extension = xpath_text(video_info, './/movie_type')
+ if not extension:
+ extension = determine_ext(video_real_url)
+
+ thumbnail = (
+ xpath_text(video_info, './/thumbnail_url') or
+ self._html_search_meta('image', webpage, 'thumbnail', default=None) or
+ video_detail.get('thumbnail'))
+
+ description = xpath_text(video_info, './/description')
+
+ timestamp = parse_iso8601(xpath_text(video_info, './/first_retrieve'))
+ if not timestamp:
+ match = self._html_search_meta('datePublished', webpage, 'date published', default=None)
+ if match:
+ timestamp = parse_iso8601(match.replace('+', ':00+'))
+ if not timestamp and video_detail.get('postedAt'):
+ timestamp = parse_iso8601(
+ video_detail['postedAt'].replace('/', '-'),
+ delimiter=' ', timezone=datetime.timedelta(hours=9))
+
+ view_count = int_or_none(xpath_text(video_info, './/view_counter'))
+ if not view_count:
+ match = self._html_search_regex(
+ r'>Views: <strong[^>]*>([^<]+)</strong>',
+ webpage, 'view count', default=None)
+ if match:
+ view_count = int_or_none(match.replace(',', ''))
+ view_count = view_count or video_detail.get('viewCount')
+
+ comment_count = int_or_none(xpath_text(video_info, './/comment_num'))
+ if not comment_count:
+ match = self._html_search_regex(
+ r'>Comments: <strong[^>]*>([^<]+)</strong>',
+ webpage, 'comment count', default=None)
+ if match:
+ comment_count = int_or_none(match.replace(',', ''))
+ comment_count = comment_count or video_detail.get('commentCount')
+
+ duration = (parse_duration(
+ xpath_text(video_info, './/length') or
+ self._html_search_meta(
+ 'video:duration', webpage, 'video duration', default=None)) or
+ video_detail.get('length'))
+
+ webpage_url = xpath_text(video_info, './/watch_url') or url
if video_info.find('.//ch_id') is not None:
uploader_id = video_info.find('.//ch_id').text
@@ -142,11 +238,11 @@ class NiconicoIE(InfoExtractor):
'url': video_real_url,
'title': title,
'ext': extension,
- 'format': video_format,
+ 'format_id': 'economy' if video_real_url.endswith('low') else 'normal',
'thumbnail': thumbnail,
'description': description,
'uploader': uploader,
- 'upload_date': upload_date,
+ 'timestamp': timestamp,
'uploader_id': uploader_id,
'view_count': view_count,
'comment_count': comment_count,
diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py
index 7f842b5c2..a06d38afd 100644
--- a/youtube_dl/extractor/ninegag.py
+++ b/youtube_dl/extractor/ninegag.py
@@ -1,7 +1,6 @@
from __future__ import unicode_literals
import re
-import json
from .common import InfoExtractor
from ..utils import str_to_int
@@ -9,61 +8,93 @@ from ..utils import str_to_int
class NineGagIE(InfoExtractor):
IE_NAME = '9gag'
- _VALID_URL = r'''(?x)^https?://(?:www\.)?9gag\.tv/
- (?:
- v/(?P<numid>[0-9]+)|
- p/(?P<id>[a-zA-Z0-9]+)/(?P<display_id>[^?#/]+)
- )
- '''
+ _VALID_URL = r'https?://(?:www\.)?9gag(?:\.com/tv|\.tv)/(?:p|embed)/(?P<id>[a-zA-Z0-9]+)(?:/(?P<display_id>[^?#/]+))?'
_TESTS = [{
- "url": "http://9gag.tv/v/1912",
- "info_dict": {
- "id": "1912",
- "ext": "mp4",
- "description": "This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)",
- "title": "\"People Are Awesome 2013\" Is Absolutely Awesome",
+ 'url': 'http://9gag.com/tv/p/Kk2X5/people-are-awesome-2013-is-absolutely-awesome',
+ 'info_dict': {
+ 'id': 'Kk2X5',
+ 'ext': 'mp4',
+ 'description': 'This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)',
+ 'title': '\"People Are Awesome 2013\" Is Absolutely Awesome',
'uploader_id': 'UCdEH6EjDKwtTe-sO2f0_1XA',
'uploader': 'CompilationChannel',
'upload_date': '20131110',
- "view_count": int,
- "thumbnail": "re:^https?://",
+ 'view_count': int,
},
- 'add_ie': ['Youtube']
+ 'add_ie': ['Youtube'],
}, {
- 'url': 'http://9gag.tv/p/KklwM/alternate-banned-opening-scene-of-gravity?ref=fsidebar',
+ 'url': 'http://9gag.com/tv/p/aKolP3',
'info_dict': {
- 'id': 'KklwM',
+ 'id': 'aKolP3',
'ext': 'mp4',
- 'display_id': 'alternate-banned-opening-scene-of-gravity',
- "description": "While Gravity was a pretty awesome movie already, YouTuber Krishna Shenoi came up with a way to improve upon it, introducing a much better solution to Sandra Bullock's seemingly endless tumble in space. The ending is priceless.",
- 'title': "Banned Opening Scene Of \"Gravity\" That Changes The Whole Movie",
- 'uploader': 'Krishna Shenoi',
- 'upload_date': '20140401',
- 'uploader_id': 'krishnashenoi93',
+ 'title': 'This Guy Travelled 11 countries In 44 days Just To Make This Amazing Video',
+ 'description': "I just saw more in 1 minute than I've seen in 1 year. This guy's video is epic!!",
+ 'uploader_id': 'rickmereki',
+ 'uploader': 'Rick Mereki',
+ 'upload_date': '20110803',
+ 'view_count': int,
},
+ 'add_ie': ['Vimeo'],
+ }, {
+ 'url': 'http://9gag.com/tv/p/KklwM',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://9gag.tv/p/Kk2X5',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://9gag.com/tv/embed/a5Dmvl',
+ 'only_matching': True,
}]
+ _EXTERNAL_VIDEO_PROVIDER = {
+ '1': {
+ 'url': '%s',
+ 'ie_key': 'Youtube',
+ },
+ '2': {
+ 'url': 'http://player.vimeo.com/video/%s',
+ 'ie_key': 'Vimeo',
+ },
+ '3': {
+ 'url': 'http://instagram.com/p/%s',
+ 'ie_key': 'Instagram',
+ },
+ '4': {
+ 'url': 'http://vine.co/v/%s',
+ 'ie_key': 'Vine',
+ },
+ }
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('numid') or mobj.group('id')
+ video_id = mobj.group('id')
display_id = mobj.group('display_id') or video_id
webpage = self._download_webpage(url, display_id)
- post_view = json.loads(self._html_search_regex(
- r'var postView = new app\.PostView\({\s*post:\s*({.+?}),\s*posts:\s*prefetchedCurrentPost', webpage, 'post view'))
+ post_view = self._parse_json(
+ self._search_regex(
+ r'var\s+postView\s*=\s*new\s+app\.PostView\({\s*post:\s*({.+?})\s*,\s*posts:\s*prefetchedCurrentPost',
+ webpage, 'post view'),
+ display_id)
- youtube_id = post_view['videoExternalId']
+ ie_key = None
+ source_url = post_view.get('sourceUrl')
+ if not source_url:
+ external_video_id = post_view['videoExternalId']
+ external_video_provider = post_view['videoExternalProvider']
+ source_url = self._EXTERNAL_VIDEO_PROVIDER[external_video_provider]['url'] % external_video_id
+ ie_key = self._EXTERNAL_VIDEO_PROVIDER[external_video_provider]['ie_key']
title = post_view['title']
- description = post_view['description']
- view_count = str_to_int(post_view['externalView'])
+ description = post_view.get('description')
+ view_count = str_to_int(post_view.get('externalView'))
thumbnail = post_view.get('thumbnail_700w') or post_view.get('ogImageUrl') or post_view.get('thumbnail_300w')
return {
'_type': 'url_transparent',
- 'url': youtube_id,
- 'ie_key': 'Youtube',
+ 'url': source_url,
+ 'ie_key': ie_key,
'id': video_id,
'display_id': display_id,
'title': title,
diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py
index 251e6da07..76bd21e6d 100644
--- a/youtube_dl/extractor/noco.py
+++ b/youtube_dl/extractor/noco.py
@@ -9,12 +9,14 @@ from .common import InfoExtractor
from ..compat import (
compat_str,
compat_urllib_parse,
- compat_urllib_request,
)
from ..utils import (
clean_html,
ExtractorError,
- unified_strdate,
+ int_or_none,
+ float_or_none,
+ parse_iso8601,
+ sanitized_Request,
)
@@ -25,21 +27,38 @@ class NocoIE(InfoExtractor):
_SUB_LANG_TEMPLATE = '&sub_lang=%s'
_NETRC_MACHINE = 'noco'
- _TEST = {
- 'url': 'http://noco.tv/emission/11538/nolife/ami-ami-idol-hello-france/',
- 'md5': '0a993f0058ddbcd902630b2047ef710e',
- 'info_dict': {
- 'id': '11538',
- 'ext': 'mp4',
- 'title': 'Ami Ami Idol - Hello! France',
- 'description': 'md5:4eaab46ab68fa4197a317a88a53d3b86',
- 'upload_date': '20140412',
- 'uploader': 'Nolife',
- 'uploader_id': 'NOL',
- 'duration': 2851.2,
+ _TESTS = [
+ {
+ 'url': 'http://noco.tv/emission/11538/nolife/ami-ami-idol-hello-france/',
+ 'md5': '0a993f0058ddbcd902630b2047ef710e',
+ 'info_dict': {
+ 'id': '11538',
+ 'ext': 'mp4',
+ 'title': 'Ami Ami Idol - Hello! France',
+ 'description': 'md5:4eaab46ab68fa4197a317a88a53d3b86',
+ 'upload_date': '20140412',
+ 'uploader': 'Nolife',
+ 'uploader_id': 'NOL',
+ 'duration': 2851.2,
+ },
+ 'skip': 'Requires noco account',
},
- 'skip': 'Requires noco account',
- }
+ {
+ 'url': 'http://noco.tv/emission/12610/lbl42/the-guild/s01e01-wake-up-call',
+ 'md5': 'c190f1f48e313c55838f1f412225934d',
+ 'info_dict': {
+ 'id': '12610',
+ 'ext': 'mp4',
+ 'title': 'The Guild #1 - Wake-Up Call',
+ 'timestamp': 1403863200,
+ 'upload_date': '20140627',
+ 'uploader': 'LBL42',
+ 'uploader_id': 'LBL',
+ 'duration': 233.023,
+ },
+ 'skip': 'Requires noco account',
+ }
+ ]
def _real_initialize(self):
self._login()
@@ -55,7 +74,7 @@ class NocoIE(InfoExtractor):
'username': username,
'password': password,
}
- request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
+ request = sanitized_Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8')
login = self._download_json(request, None, 'Logging in as %s' % username)
@@ -90,51 +109,70 @@ class NocoIE(InfoExtractor):
'shows/%s/medias' % video_id,
video_id, 'Downloading video JSON')
+ show = self._call_api(
+ 'shows/by_id/%s' % video_id,
+ video_id, 'Downloading show JSON')[0]
+
+ options = self._call_api(
+ 'users/init', video_id,
+ 'Downloading user options JSON')['options']
+ audio_lang_pref = options.get('audio_language') or options.get('language', 'fr')
+
+ if audio_lang_pref == 'original':
+ audio_lang_pref = show['original_lang']
+ if len(medias) == 1:
+ audio_lang_pref = list(medias.keys())[0]
+ elif audio_lang_pref not in medias:
+ audio_lang_pref = 'fr'
+
qualities = self._call_api(
'qualities',
video_id, 'Downloading qualities JSON')
formats = []
- for lang, lang_dict in medias['fr']['video_list'].items():
- for format_id, fmt in lang_dict['quality_list'].items():
- format_id_extended = '%s-%s' % (lang, format_id) if lang != 'none' else format_id
-
- video = self._call_api(
- 'shows/%s/video/%s/fr' % (video_id, format_id.lower()),
- video_id, 'Downloading %s video JSON' % format_id_extended,
- lang if lang != 'none' else None)
-
- file_url = video['file']
- if not file_url:
- continue
-
- if file_url in ['forbidden', 'not found']:
- popmessage = video['popmessage']
- self._raise_error(popmessage['title'], popmessage['message'])
-
- formats.append({
- 'url': file_url,
- 'format_id': format_id_extended,
- 'width': fmt['res_width'],
- 'height': fmt['res_lines'],
- 'abr': fmt['audiobitrate'],
- 'vbr': fmt['videobitrate'],
- 'filesize': fmt['filesize'],
- 'format_note': qualities[format_id]['quality_name'],
- 'preference': qualities[format_id]['priority'],
- })
+ for audio_lang, audio_lang_dict in medias.items():
+ preference = 1 if audio_lang == audio_lang_pref else 0
+ for sub_lang, lang_dict in audio_lang_dict['video_list'].items():
+ for format_id, fmt in lang_dict['quality_list'].items():
+ format_id_extended = 'audio-%s_sub-%s_%s' % (audio_lang, sub_lang, format_id)
+
+ video = self._call_api(
+ 'shows/%s/video/%s/%s' % (video_id, format_id.lower(), audio_lang),
+ video_id, 'Downloading %s video JSON' % format_id_extended,
+ sub_lang if sub_lang != 'none' else None)
+
+ file_url = video['file']
+ if not file_url:
+ continue
+
+ if file_url in ['forbidden', 'not found']:
+ popmessage = video['popmessage']
+ self._raise_error(popmessage['title'], popmessage['message'])
+
+ formats.append({
+ 'url': file_url,
+ 'format_id': format_id_extended,
+ 'width': int_or_none(fmt.get('res_width')),
+ 'height': int_or_none(fmt.get('res_lines')),
+ 'abr': int_or_none(fmt.get('audiobitrate')),
+ 'vbr': int_or_none(fmt.get('videobitrate')),
+ 'filesize': int_or_none(fmt.get('filesize')),
+ 'format_note': qualities[format_id].get('quality_name'),
+ 'quality': qualities[format_id].get('priority'),
+ 'preference': preference,
+ })
self._sort_formats(formats)
- show = self._call_api(
- 'shows/by_id/%s' % video_id,
- video_id, 'Downloading show JSON')[0]
+ timestamp = parse_iso8601(show.get('online_date_start_utc'), ' ')
+
+ if timestamp is not None and timestamp < 0:
+ timestamp = None
- upload_date = unified_strdate(show['online_date_start_utc'])
- uploader = show['partner_name']
- uploader_id = show['partner_key']
- duration = show['duration_ms'] / 1000.0
+ uploader = show.get('partner_name')
+ uploader_id = show.get('partner_key')
+ duration = float_or_none(show.get('duration_ms'), 1000)
thumbnails = []
for thumbnail_key, thumbnail_url in show.items():
@@ -157,7 +195,7 @@ class NocoIE(InfoExtractor):
if episode_number:
title += ' #' + compat_str(episode_number)
if episode:
- title += ' - ' + episode
+ title += ' - ' + compat_str(episode)
description = show.get('show_resume') or show.get('family_resume')
@@ -166,7 +204,7 @@ class NocoIE(InfoExtractor):
'title': title,
'description': description,
'thumbnails': thumbnails,
- 'upload_date': upload_date,
+ 'timestamp': timestamp,
'uploader': uploader,
'uploader_id': uploader_id,
'duration': duration,
diff --git a/youtube_dl/extractor/nosvideo.py b/youtube_dl/extractor/nosvideo.py
index f5ef856db..eab816e49 100644
--- a/youtube_dl/extractor/nosvideo.py
+++ b/youtube_dl/extractor/nosvideo.py
@@ -4,11 +4,9 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_request,
-)
from ..utils import (
ExtractorError,
+ sanitized_Request,
urlencode_postdata,
xpath_text,
xpath_with_ns,
@@ -41,7 +39,7 @@ class NosVideoIE(InfoExtractor):
'op': 'download1',
'method_free': 'Continue to Video',
}
- req = compat_urllib_request.Request(url, urlencode_postdata(fields))
+ req = sanitized_Request(url, urlencode_postdata(fields))
req.add_header('Content-type', 'application/x-www-form-urlencoded')
webpage = self._download_webpage(req, video_id,
'Downloading download page')
diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py
new file mode 100644
index 000000000..3f9c776ef
--- /dev/null
+++ b/youtube_dl/extractor/nova.py
@@ -0,0 +1,179 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ unified_strdate,
+)
+
+
+class NovaIE(InfoExtractor):
+ IE_DESC = 'TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz'
+ _VALID_URL = 'http://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)'
+ _TESTS = [{
+ 'url': 'http://tvnoviny.nova.cz/clanek/novinky/co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou.html?utm_source=tvnoviny&utm_medium=cpfooter&utm_campaign=novaplus',
+ 'info_dict': {
+ 'id': '1608920',
+ 'display_id': 'co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou',
+ 'ext': 'flv',
+ 'title': 'Duel: Michal Hrdlička a Petr Suchoň',
+ 'description': 'md5:d0cc509858eee1b1374111c588c6f5d5',
+ 'thumbnail': 're:^https?://.*\.(?:jpg)',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://tn.nova.cz/clanek/tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci.html#player_13260',
+ 'md5': '1dd7b9d5ea27bc361f110cd855a19bd3',
+ 'info_dict': {
+ 'id': '1757139',
+ 'display_id': 'tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci',
+ 'ext': 'mp4',
+ 'title': 'Podzemní nemocnice v pražské Krči',
+ 'description': 'md5:f0a42dd239c26f61c28f19e62d20ef53',
+ 'thumbnail': 're:^https?://.*\.(?:jpg)',
+ }
+ }, {
+ 'url': 'http://novaplus.nova.cz/porad/policie-modrava/video/5591-policie-modrava-15-dil-blondynka-na-hrbitove',
+ 'info_dict': {
+ 'id': '1756825',
+ 'display_id': '5591-policie-modrava-15-dil-blondynka-na-hrbitove',
+ 'ext': 'flv',
+ 'title': 'Policie Modrava - 15. díl - Blondýnka na hřbitově',
+ 'description': 'md5:dc24e50be5908df83348e50d1431295e', # Make sure this description is clean of html tags
+ 'thumbnail': 're:^https?://.*\.(?:jpg)',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://novaplus.nova.cz/porad/televizni-noviny/video/5585-televizni-noviny-30-5-2015/',
+ 'info_dict': {
+ 'id': '1756858',
+ 'ext': 'flv',
+ 'title': 'Televizní noviny - 30. 5. 2015',
+ 'thumbnail': 're:^https?://.*\.(?:jpg)',
+ 'upload_date': '20150530',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html',
+ 'info_dict': {
+ 'id': '1753621',
+ 'ext': 'mp4',
+ 'title': 'Zaklínač 3: Divoký hon',
+ 'description': 're:.*Pokud se stejně jako my nemůžete.*',
+ 'thumbnail': 're:https?://.*\.jpg(\?.*)?',
+ 'upload_date': '20150521',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://sport.tn.nova.cz/clanek/sport/hokej/nhl/zivot-jde-dal-hodnotil-po-vyrazeni-z-playoff-jiri-sekac.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://doma.nova.cz/clanek/zdravi/prijdte-se-zapsat-do-registru-kostni-drene-jiz-ve-stredu-3-cervna.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://prask.nova.cz/clanek/novinky/co-si-na-sobe-nase-hvezdy-nechaly-pojistit.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://tv.nova.cz/clanek/novinky/zivot-je-zivot-bondovsky-trailer.html',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ display_id = mobj.group('id')
+ site = mobj.group('site')
+
+ webpage = self._download_webpage(url, display_id)
+
+ video_id = self._search_regex(
+ [r"(?:media|video_id)\s*:\s*'(\d+)'",
+ r'media=(\d+)',
+ r'id="article_video_(\d+)"',
+ r'id="player_(\d+)"'],
+ webpage, 'video id')
+
+ config_url = self._search_regex(
+ r'src="(http://tn\.nova\.cz/bin/player/videojs/config\.php\?[^"]+)"',
+ webpage, 'config url', default=None)
+
+ if not config_url:
+ DEFAULT_SITE_ID = '23000'
+ SITES = {
+ 'tvnoviny': DEFAULT_SITE_ID,
+ 'novaplus': DEFAULT_SITE_ID,
+ 'vymena': DEFAULT_SITE_ID,
+ 'krasna': DEFAULT_SITE_ID,
+ 'fanda': '30',
+ 'tn': '30',
+ 'doma': '30',
+ }
+
+ site_id = self._search_regex(
+ r'site=(\d+)', webpage, 'site id', default=None) or SITES.get(site, DEFAULT_SITE_ID)
+
+ config_url = ('http://tn.nova.cz/bin/player/videojs/config.php?site=%s&media=%s&jsVar=vjsconfig'
+ % (site_id, video_id))
+
+ config = self._download_json(
+ config_url, display_id,
+ 'Downloading config JSON',
+ transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1])
+
+ mediafile = config['mediafile']
+ video_url = mediafile['src']
+
+ m = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>[^/]+?))/&*(?P<playpath>.+)$', video_url)
+ if m:
+ formats = [{
+ 'url': m.group('url'),
+ 'app': m.group('app'),
+ 'play_path': m.group('playpath'),
+ 'player_path': 'http://tvnoviny.nova.cz/static/shared/app/videojs/video-js.swf',
+ 'ext': 'flv',
+ }]
+ else:
+ formats = [{
+ 'url': video_url,
+ }]
+ self._sort_formats(formats)
+
+ title = mediafile.get('meta', {}).get('title') or self._og_search_title(webpage)
+ description = clean_html(self._og_search_description(webpage, default=None))
+ thumbnail = config.get('poster')
+
+ if site == 'novaplus':
+ upload_date = unified_strdate(self._search_regex(
+ r'(\d{1,2}-\d{1,2}-\d{4})$', display_id, 'upload date', default=None))
+ elif site == 'fanda':
+ upload_date = unified_strdate(self._search_regex(
+ r'<span class="date_time">(\d{1,2}\.\d{1,2}\.\d{4})', webpage, 'upload date', default=None))
+ else:
+ upload_date = None
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'upload_date': upload_date,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/novamov.py b/youtube_dl/extractor/novamov.py
index 04d779890..6163e8855 100644
--- a/youtube_dl/extractor/novamov.py
+++ b/youtube_dl/extractor/novamov.py
@@ -3,11 +3,13 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import (
- compat_urlparse,
-)
+from ..compat import compat_urlparse
from ..utils import (
ExtractorError,
+ NO_DEFAULT,
+ encode_dict,
+ sanitized_Request,
+ urlencode_postdata,
)
@@ -38,19 +40,40 @@ class NovaMovIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
- page = self._download_webpage(
- 'http://%s/video/%s' % (self._HOST, video_id), video_id, 'Downloading video page')
+ url = 'http://%s/video/%s' % (self._HOST, video_id)
- if re.search(self._FILE_DELETED_REGEX, page) is not None:
- raise ExtractorError('Video %s does not exist' % video_id, expected=True)
+ webpage = self._download_webpage(
+ url, video_id, 'Downloading video page')
- filekey = self._search_regex(self._FILEKEY_REGEX, page, 'filekey')
+ if re.search(self._FILE_DELETED_REGEX, webpage) is not None:
+ raise ExtractorError('Video %s does not exist' % video_id, expected=True)
- title = self._html_search_regex(self._TITLE_REGEX, page, 'title', fatal=False)
- description = self._html_search_regex(self._DESCRIPTION_REGEX, page, 'description', default='', fatal=False)
+ def extract_filekey(default=NO_DEFAULT):
+ return self._search_regex(
+ self._FILEKEY_REGEX, webpage, 'filekey', default=default)
+
+ filekey = extract_filekey(default=None)
+
+ if not filekey:
+ fields = self._hidden_inputs(webpage)
+ post_url = self._search_regex(
+ r'<form[^>]+action=(["\'])(?P<url>.+?)\1', webpage,
+ 'post url', default=url, group='url')
+ if not post_url.startswith('http'):
+ post_url = compat_urlparse.urljoin(url, post_url)
+ request = sanitized_Request(
+ post_url, urlencode_postdata(encode_dict(fields)))
+ request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+ request.add_header('Referer', post_url)
+ webpage = self._download_webpage(
+ request, video_id, 'Downloading continue to the video page')
+
+ filekey = extract_filekey()
+
+ title = self._html_search_regex(self._TITLE_REGEX, webpage, 'title', fatal=False)
+ description = self._html_search_regex(self._DESCRIPTION_REGEX, webpage, 'description', default='', fatal=False)
api_response = self._download_webpage(
'http://%s/api/player.api.php?key=%s&file=%s' % (self._HOST, filekey, video_id), video_id,
diff --git a/youtube_dl/extractor/nowness.py b/youtube_dl/extractor/nowness.py
index 6b2f3f55a..d480fb58c 100644
--- a/youtube_dl/extractor/nowness.py
+++ b/youtube_dl/extractor/nowness.py
@@ -1,64 +1,134 @@
# encoding: utf-8
from __future__ import unicode_literals
-import re
-
-from .brightcove import BrightcoveIE
+from .brightcove import BrightcoveLegacyIE
from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..compat import compat_str
+from ..utils import (
+ ExtractorError,
+ sanitized_Request,
+)
+
+
+class NownessBaseIE(InfoExtractor):
+ def _extract_url_result(self, post):
+ if post['type'] == 'video':
+ for media in post['media']:
+ if media['type'] == 'video':
+ video_id = media['content']
+ source = media['source']
+ if source == 'brightcove':
+ player_code = self._download_webpage(
+ 'http://www.nowness.com/iframe?id=%s' % video_id, video_id,
+ note='Downloading player JavaScript',
+ errnote='Unable to download player JavaScript')
+ bc_url = BrightcoveLegacyIE._extract_brightcove_url(player_code)
+ if bc_url is None:
+ raise ExtractorError('Could not find player definition')
+ return self.url_result(bc_url, 'BrightcoveLegacy')
+ elif source == 'vimeo':
+ return self.url_result('http://vimeo.com/%s' % video_id, 'Vimeo')
+ elif source == 'youtube':
+ return self.url_result(video_id, 'Youtube')
+ elif source == 'cinematique':
+ # youtube-dl currently doesn't support cinematique
+ # return self.url_result('http://cinematique.com/embed/%s' % video_id, 'Cinematique')
+ pass
+ def _api_request(self, url, request_path):
+ display_id = self._match_id(url)
+ request = sanitized_Request(
+ 'http://api.nowness.com/api/' + request_path % display_id,
+ headers={
+ 'X-Nowness-Language': 'zh-cn' if 'cn.nowness.com' in url else 'en-us',
+ })
+ return display_id, self._download_json(request, display_id)
-class NownessIE(InfoExtractor):
- _VALID_URL = r'https?://(?:(?:www|cn)\.)?nowness\.com/[^?#]*?/(?P<id>[0-9]+)/(?P<slug>[^/]+?)(?:$|[?#])'
- _TESTS = [
- {
- 'url': 'http://www.nowness.com/day/2013/6/27/3131/candor--the-art-of-gesticulation',
- 'md5': '068bc0202558c2e391924cb8cc470676',
- 'info_dict': {
- 'id': '2520295746001',
- 'ext': 'mp4',
- 'title': 'Candor: The Art of Gesticulation',
- 'description': 'Candor: The Art of Gesticulation',
- 'thumbnail': 're:^https?://.*\.jpg',
- 'uploader': 'Nowness',
- }
+class NownessIE(NownessBaseIE):
+ IE_NAME = 'nowness'
+ _VALID_URL = r'https?://(?:(?:www|cn)\.)?nowness\.com/(?:story|(?:series|category)/[^/]+)/(?P<id>[^/]+?)(?:$|[?#])'
+ _TESTS = [{
+ 'url': 'https://www.nowness.com/story/candor-the-art-of-gesticulation',
+ 'md5': '068bc0202558c2e391924cb8cc470676',
+ 'info_dict': {
+ 'id': '2520295746001',
+ 'ext': 'mp4',
+ 'title': 'Candor: The Art of Gesticulation',
+ 'description': 'Candor: The Art of Gesticulation',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'uploader': 'Nowness',
},
- {
- 'url': 'http://cn.nowness.com/day/2014/8/7/4069/kasper-bj-rke-ft-jaakko-eino-kalevi--tnr',
- 'md5': 'e79cf125e387216f86b2e0a5b5c63aa3',
- 'info_dict': {
- 'id': '3716354522001',
- 'ext': 'mp4',
- 'title': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR',
- 'description': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR',
- 'thumbnail': 're:^https?://.*\.jpg',
- 'uploader': 'Nowness',
- }
+ }, {
+ 'url': 'https://cn.nowness.com/story/kasper-bjorke-ft-jaakko-eino-kalevi-tnr',
+ 'md5': 'e79cf125e387216f86b2e0a5b5c63aa3',
+ 'info_dict': {
+ 'id': '3716354522001',
+ 'ext': 'mp4',
+ 'title': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR',
+ 'description': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'uploader': 'Nowness',
},
- ]
+ }, {
+ # vimeo
+ 'url': 'https://www.nowness.com/series/nowness-picks/jean-luc-godard-supercut',
+ 'md5': '9a5a6a8edf806407e411296ab6bc2a49',
+ 'info_dict': {
+ 'id': '130020913',
+ 'ext': 'mp4',
+ 'title': 'Bleu, Blanc, Rouge - A Godard Supercut',
+ 'description': 'md5:f0ea5f1857dffca02dbd37875d742cec',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'upload_date': '20150607',
+ 'uploader': 'Cinema Sem Lei',
+ 'uploader_id': 'cinemasemlei',
+ },
+ }]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('slug')
+ _, post = self._api_request(url, 'post/getBySlug/%s')
+ return self._extract_url_result(post)
- webpage = self._download_webpage(url, video_id)
- player_url = self._search_regex(
- r'"([^"]+/content/issue-[0-9.]+.js)"', webpage, 'player URL')
- real_id = self._search_regex(
- r'\sdata-videoId="([0-9]+)"', webpage, 'internal video ID')
- player_code = self._download_webpage(
- player_url, video_id,
- note='Downloading player JavaScript',
- errnote='Player download failed')
- player_code = player_code.replace("'+d+'", real_id)
+class NownessPlaylistIE(NownessBaseIE):
+ IE_NAME = 'nowness:playlist'
+ _VALID_URL = r'https?://(?:(?:www|cn)\.)?nowness\.com/playlist/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'https://www.nowness.com/playlist/3286/i-guess-thats-why-they-call-it-the-blues',
+ 'info_dict': {
+ 'id': '3286',
+ },
+ 'playlist_mincount': 8,
+ }
- bc_url = BrightcoveIE._extract_brightcove_url(player_code)
- if bc_url is None:
- raise ExtractorError('Could not find player definition')
- return {
- '_type': 'url',
- 'url': bc_url,
- 'ie_key': 'Brightcove',
- }
+ def _real_extract(self, url):
+ playlist_id, playlist = self._api_request(url, 'post?PlaylistId=%s')
+ entries = [self._extract_url_result(item) for item in playlist['items']]
+ return self.playlist_result(entries, playlist_id)
+
+
+class NownessSeriesIE(NownessBaseIE):
+ IE_NAME = 'nowness:series'
+ _VALID_URL = r'https?://(?:(?:www|cn)\.)?nowness\.com/series/(?P<id>[^/]+?)(?:$|[?#])'
+ _TEST = {
+ 'url': 'https://www.nowness.com/series/60-seconds',
+ 'info_dict': {
+ 'id': '60',
+ 'title': '60 Seconds',
+ 'description': 'One-minute wisdom in a new NOWNESS series',
+ },
+ 'playlist_mincount': 4,
+ }
+
+ def _real_extract(self, url):
+ display_id, series = self._api_request(url, 'series/getBySlug/%s')
+ entries = [self._extract_url_result(post) for post in series['posts']]
+ series_title = None
+ series_description = None
+ translations = series.get('translations', [])
+ if translations:
+ series_title = translations[0].get('title') or translations[0]['seoTitle']
+ series_description = translations[0].get('seoDescription')
+ return self.playlist_result(
+ entries, compat_str(series['id']), series_title, series_description)
diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py
new file mode 100644
index 000000000..67e34b294
--- /dev/null
+++ b/youtube_dl/extractor/nowtv.py
@@ -0,0 +1,257 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ ExtractorError,
+ determine_ext,
+ int_or_none,
+ parse_iso8601,
+ parse_duration,
+ remove_start,
+)
+
+
+class NowTVBaseIE(InfoExtractor):
+ _VIDEO_FIELDS = (
+ 'id', 'title', 'free', 'geoblocked', 'articleLong', 'articleShort',
+ 'broadcastStartDate', 'seoUrl', 'duration', 'files',
+ 'format.defaultImage169Format', 'format.defaultImage169Logo')
+
+ def _extract_video(self, info, display_id=None):
+ video_id = compat_str(info['id'])
+
+ files = info['files']
+ if not files:
+ if info.get('geoblocked', False):
+ raise ExtractorError(
+ 'Video %s is not available from your location due to geo restriction' % video_id,
+ expected=True)
+ if not info.get('free', True):
+ raise ExtractorError(
+ 'Video %s is not available for free' % video_id, expected=True)
+
+ formats = []
+ for item in files['items']:
+ if determine_ext(item['path']) != 'f4v':
+ continue
+ app, play_path = remove_start(item['path'], '/').split('/', 1)
+ formats.append({
+ 'url': 'rtmpe://fms.rtl.de',
+ 'app': app,
+ 'play_path': 'mp4:%s' % play_path,
+ 'ext': 'flv',
+ 'page_url': 'http://rtlnow.rtl.de',
+ 'player_url': 'http://cdn.static-fra.de/now/vodplayer.swf',
+ 'tbr': int_or_none(item.get('bitrate')),
+ })
+ self._sort_formats(formats)
+
+ title = info['title']
+ description = info.get('articleLong') or info.get('articleShort')
+ timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ')
+ duration = parse_duration(info.get('duration'))
+
+ f = info.get('format', {})
+ thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo')
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id or info.get('seoUrl'),
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ 'formats': formats,
+ }
+
+
+class NowTVIE(NowTVBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?nowtv\.(?:de|at|ch)/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<show_id>[^/]+)/(?:list/[^/]+/)?(?P<id>[^/]+)/(?:player|preview)'
+
+ _TESTS = [{
+ # rtl
+ 'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/player',
+ 'info_dict': {
+ 'id': '203519',
+ 'display_id': 'bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit',
+ 'ext': 'flv',
+ 'title': 'Inka Bause stellt die neuen Bauern vor',
+ 'description': 'md5:e234e1ed6d63cf06be5c070442612e7e',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'timestamp': 1432580700,
+ 'upload_date': '20150525',
+ 'duration': 2786,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }, {
+ # rtl2
+ 'url': 'http://www.nowtv.de/rtl2/berlin-tag-nacht/berlin-tag-nacht-folge-934/player',
+ 'info_dict': {
+ 'id': '203481',
+ 'display_id': 'berlin-tag-nacht/berlin-tag-nacht-folge-934',
+ 'ext': 'flv',
+ 'title': 'Berlin - Tag & Nacht (Folge 934)',
+ 'description': 'md5:c85e88c2e36c552dfe63433bc9506dd0',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'timestamp': 1432666800,
+ 'upload_date': '20150526',
+ 'duration': 2641,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }, {
+ # rtlnitro
+ 'url': 'http://www.nowtv.de/rtlnitro/alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00/player',
+ 'info_dict': {
+ 'id': '165780',
+ 'display_id': 'alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00',
+ 'ext': 'flv',
+ 'title': 'Hals- und Beinbruch',
+ 'description': 'md5:b50d248efffe244e6f56737f0911ca57',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'timestamp': 1432415400,
+ 'upload_date': '20150523',
+ 'duration': 2742,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }, {
+ # superrtl
+ 'url': 'http://www.nowtv.de/superrtl/medicopter-117/angst/player',
+ 'info_dict': {
+ 'id': '99205',
+ 'display_id': 'medicopter-117/angst',
+ 'ext': 'flv',
+ 'title': 'Angst!',
+ 'description': 'md5:30cbc4c0b73ec98bcd73c9f2a8c17c4e',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'timestamp': 1222632900,
+ 'upload_date': '20080928',
+ 'duration': 3025,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }, {
+ # ntv
+ 'url': 'http://www.nowtv.de/ntv/ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch/player',
+ 'info_dict': {
+ 'id': '203521',
+ 'display_id': 'ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch',
+ 'ext': 'flv',
+ 'title': 'Thema u.a.: Der erste Blick: Die Apple Watch',
+ 'description': 'md5:4312b6c9d839ffe7d8caf03865a531af',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'timestamp': 1432751700,
+ 'upload_date': '20150527',
+ 'duration': 1083,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }, {
+ # vox
+ 'url': 'http://www.nowtv.de/vox/der-hundeprofi/buero-fall-chihuahua-joel/player',
+ 'info_dict': {
+ 'id': '128953',
+ 'display_id': 'der-hundeprofi/buero-fall-chihuahua-joel',
+ 'ext': 'flv',
+ 'title': "Büro-Fall / Chihuahua 'Joel'",
+ 'description': 'md5:e62cb6bf7c3cc669179d4f1eb279ad8d',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'timestamp': 1432408200,
+ 'upload_date': '20150523',
+ 'duration': 3092,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/preview',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.nowtv.at/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/preview?return=/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.nowtv.de/rtl2/echtzeit/list/aktuell/schnelles-geld-am-ende-der-welt/player',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ display_id = '%s/%s' % (mobj.group('show_id'), mobj.group('id'))
+
+ info = self._download_json(
+ 'https://api.nowtv.de/v3/movies/%s?fields=%s'
+ % (display_id, ','.join(self._VIDEO_FIELDS)), display_id)
+
+ return self._extract_video(info, display_id)
+
+
+class NowTVListIE(NowTVBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?nowtv\.(?:de|at|ch)/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<show_id>[^/]+)/list/(?P<id>[^?/#&]+)$'
+
+ _SHOW_FIELDS = ('title', )
+ _SEASON_FIELDS = ('id', 'headline', 'seoheadline', )
+
+ _TESTS = [{
+ 'url': 'http://www.nowtv.at/rtl/stern-tv/list/aktuell',
+ 'info_dict': {
+ 'id': '17006',
+ 'title': 'stern TV - Aktuell',
+ },
+ 'playlist_count': 1,
+ }, {
+ 'url': 'http://www.nowtv.at/rtl/das-supertalent/list/free-staffel-8',
+ 'info_dict': {
+ 'id': '20716',
+ 'title': 'Das Supertalent - FREE Staffel 8',
+ },
+ 'playlist_count': 14,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ show_id = mobj.group('show_id')
+ season_id = mobj.group('id')
+
+ fields = []
+ fields.extend(self._SHOW_FIELDS)
+ fields.extend('formatTabs.%s' % field for field in self._SEASON_FIELDS)
+ fields.extend(
+ 'formatTabs.formatTabPages.container.movies.%s' % field
+ for field in self._VIDEO_FIELDS)
+
+ list_info = self._download_json(
+ 'https://api.nowtv.de/v3/formats/seo?fields=%s&name=%s.php'
+ % (','.join(fields), show_id),
+ season_id)
+
+ season = next(
+ season for season in list_info['formatTabs']['items']
+ if season.get('seoheadline') == season_id)
+
+ title = '%s - %s' % (list_info['title'], season['headline'])
+
+ entries = []
+ for container in season['formatTabPages']['items']:
+ for info in ((container.get('container') or {}).get('movies') or {}).get('items') or []:
+ entries.append(self._extract_video(info))
+
+ return self.playlist_result(
+ entries, compat_str(season.get('id') or season_id), title)
diff --git a/youtube_dl/extractor/nowvideo.py b/youtube_dl/extractor/nowvideo.py
index dec09cdfe..57ee3d366 100644
--- a/youtube_dl/extractor/nowvideo.py
+++ b/youtube_dl/extractor/nowvideo.py
@@ -7,9 +7,9 @@ class NowVideoIE(NovaMovIE):
IE_NAME = 'nowvideo'
IE_DESC = 'NowVideo'
- _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'nowvideo\.(?:ch|sx|eu|at|ag|co|li)'}
+ _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'nowvideo\.(?:to|ch|ec|sx|eu|at|ag|co|li)'}
- _HOST = 'www.nowvideo.ch'
+ _HOST = 'www.nowvideo.to'
_FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<'
_FILEKEY_REGEX = r'var fkzd="([^"]+)";'
diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py
index c075618e8..eb12fb810 100644
--- a/youtube_dl/extractor/npo.py
+++ b/youtube_dl/extractor/npo.py
@@ -1,6 +1,7 @@
from __future__ import unicode_literals
-from .subtitles import SubtitlesInfoExtractor
+import re
+
from .common import InfoExtractor
from ..utils import (
fix_xml_ampersands,
@@ -8,22 +9,50 @@ from ..utils import (
qualities,
strip_jsonp,
unified_strdate,
- url_basename,
)
-class NPOBaseIE(SubtitlesInfoExtractor):
+class NPOBaseIE(InfoExtractor):
def _get_token(self, video_id):
token_page = self._download_webpage(
'http://ida.omroep.nl/npoplayer/i.js',
video_id, note='Downloading token')
- return self._search_regex(
+ token = self._search_regex(
r'npoplayer\.token = "(.+?)"', token_page, 'token')
+ # Decryption algorithm extracted from http://npoplayer.omroep.nl/csjs/npoplayer-min.js
+ token_l = list(token)
+ first = second = None
+ for i in range(5, len(token_l) - 4):
+ if token_l[i].isdigit():
+ if first is None:
+ first = i
+ elif second is None:
+ second = i
+ if first is None or second is None:
+ first = 12
+ second = 13
+
+ token_l[first], token_l[second] = token_l[second], token_l[first]
+
+ return ''.join(token_l)
class NPOIE(NPOBaseIE):
- IE_NAME = 'npo.nl'
- _VALID_URL = r'https?://(?:www\.)?npo\.nl/(?!live|radio)[^/]+/[^/]+/(?P<id>[^/?]+)'
+ IE_NAME = 'npo'
+ IE_DESC = 'npo.nl and ntr.nl'
+ _VALID_URL = r'''(?x)
+ (?:
+ npo:|
+ https?://
+ (?:www\.)?
+ (?:
+ npo\.nl/(?!live|radio)(?:[^/]+/){2}|
+ ntr\.nl/(?:[^/]+/){2,}|
+ omroepwnl\.nl/video/fragment/[^/]+__
+ )
+ )
+ (?P<id>[^/?#]+)
+ '''
_TESTS = [
{
@@ -43,7 +72,7 @@ class NPOIE(NPOBaseIE):
'info_dict': {
'id': 'VARA_101191800',
'ext': 'm4v',
- 'title': 'De Mega Mike & Mega Thomas show',
+ 'title': 'De Mega Mike & Mega Thomas show: The best of.',
'description': 'md5:3b74c97fc9d6901d5a665aac0e5400f4',
'upload_date': '20090227',
'duration': 2400,
@@ -55,8 +84,8 @@ class NPOIE(NPOBaseIE):
'info_dict': {
'id': 'VPWON_1169289',
'ext': 'm4v',
- 'title': 'Tegenlicht',
- 'description': 'md5:d6476bceb17a8c103c76c3b708f05dd1',
+ 'title': 'Tegenlicht: De toekomst komt uit Afrika',
+ 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea',
'upload_date': '20130225',
'duration': 3000,
},
@@ -85,6 +114,30 @@ class NPOIE(NPOBaseIE):
'title': 'Hoe gaat Europa verder na Parijs?',
},
},
+ {
+ 'url': 'http://www.ntr.nl/Aap-Poot-Pies/27/detail/Aap-poot-pies/VPWON_1233944#content',
+ 'md5': '01c6a2841675995da1f0cf776f03a9c3',
+ 'info_dict': {
+ 'id': 'VPWON_1233944',
+ 'ext': 'm4v',
+ 'title': 'Aap, poot, pies',
+ 'description': 'md5:c9c8005d1869ae65b858e82c01a91fde',
+ 'upload_date': '20150508',
+ 'duration': 599,
+ },
+ },
+ {
+ 'url': 'http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698',
+ 'md5': 'd30cd8417b8b9bca1fdff27428860d08',
+ 'info_dict': {
+ 'id': 'POW_00996502',
+ 'ext': 'm4v',
+ 'title': '''"Dit is wel een 'landslide'..."''',
+ 'description': 'md5:f8d66d537dfb641380226e31ca57b8e8',
+ 'upload_date': '20150508',
+ 'duration': 462,
+ },
+ }
]
def _real_extract(self, url):
@@ -93,12 +146,24 @@ class NPOIE(NPOBaseIE):
def _get_info(self, video_id):
metadata = self._download_json(
- 'http://e.omroep.nl/metadata/aflevering/%s' % video_id,
+ 'http://e.omroep.nl/metadata/%s' % video_id,
video_id,
# We have to remove the javascript callback
transform_source=strip_jsonp,
)
+ # For some videos actual video id (prid) is different (e.g. for
+ # http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698
+ # video id is POMS_WNL_853698 but prid is POW_00996502)
+ video_id = metadata.get('prid') or video_id
+
+ # titel is too generic in some cases so utilize aflevering_titel as well
+ # when available (e.g. http://tegenlicht.vpro.nl/afleveringen/2014-2015/access-to-africa.html)
+ title = metadata['titel']
+ sub_title = metadata.get('aflevering_titel')
+ if sub_title and sub_title != title:
+ title += ': %s' % sub_title
+
token = self._get_token(video_id)
formats = []
@@ -164,18 +229,15 @@ class NPOIE(NPOBaseIE):
subtitles = {}
if metadata.get('tt888') == 'ja':
- subtitles['nl'] = 'http://e.omroep.nl/tt888/%s' % video_id
-
- if self._downloader.params.get('listsubtitles', False):
- self._list_available_subtitles(video_id, subtitles)
- return
-
- subtitles = self.extract_subtitles(video_id, subtitles)
+ subtitles['nl'] = [{
+ 'ext': 'vtt',
+ 'url': 'http://e.omroep.nl/tt888/%s' % video_id,
+ }]
return {
'id': video_id,
- 'title': metadata['titel'],
- 'description': metadata['info'],
+ 'title': title,
+ 'description': metadata.get('info'),
'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'],
'upload_date': unified_strdate(metadata.get('gidsdatum')),
'duration': parse_duration(metadata.get('tijdsduur')),
@@ -223,7 +285,8 @@ class NPOLiveIE(NPOBaseIE):
if streams:
for stream in streams:
stream_type = stream.get('type').lower()
- if stream_type == 'ss':
+ # smooth streaming is not supported
+ if stream_type in ['ss', 'ms']:
continue
stream_info = self._download_json(
'http://ida.omroep.nl/aapi/?stream=%s&token=%s&type=jsonp'
@@ -234,7 +297,10 @@ class NPOLiveIE(NPOBaseIE):
stream_url = self._download_json(
stream_info['stream'], display_id,
'Downloading %s URL' % stream_type,
- transform_source=strip_jsonp)
+ 'Unable to download %s URL' % stream_type,
+ transform_source=strip_jsonp, fatal=False)
+ if not stream_url:
+ continue
if stream_type == 'hds':
f4m_formats = self._extract_f4m_formats(stream_url, display_id)
# f4m downloader downloads only piece of live stream
@@ -246,6 +312,7 @@ class NPOLiveIE(NPOBaseIE):
else:
formats.append({
'url': stream_url,
+ 'preference': -10,
})
self._sort_formats(formats)
@@ -339,9 +406,9 @@ class NPORadioFragmentIE(InfoExtractor):
}
-class TegenlichtVproIE(NPOIE):
- IE_NAME = 'tegenlicht.vpro.nl'
- _VALID_URL = r'https?://tegenlicht\.vpro\.nl/afleveringen/.*?'
+class VPROIE(NPOIE):
+ IE_NAME = 'vpro'
+ _VALID_URL = r'https?://(?:www\.)?(?:tegenlicht\.)?vpro\.nl/(?:[^/]+/){2,}(?P<id>[^/]+)\.html'
_TESTS = [
{
@@ -350,17 +417,72 @@ class TegenlichtVproIE(NPOIE):
'info_dict': {
'id': 'VPWON_1169289',
'ext': 'm4v',
- 'title': 'Tegenlicht',
- 'description': 'md5:d6476bceb17a8c103c76c3b708f05dd1',
+ 'title': 'De toekomst komt uit Afrika',
+ 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea',
'upload_date': '20130225',
},
},
+ {
+ 'url': 'http://www.vpro.nl/programmas/2doc/2015/sergio-herman.html',
+ 'info_dict': {
+ 'id': 'sergio-herman',
+ 'title': 'Sergio Herman: Fucking perfect',
+ },
+ 'playlist_count': 2,
+ },
+ {
+ # playlist with youtube embed
+ 'url': 'http://www.vpro.nl/programmas/2doc/2015/education-education.html',
+ 'info_dict': {
+ 'id': 'education-education',
+ 'title': '2Doc',
+ },
+ 'playlist_count': 2,
+ }
]
def _real_extract(self, url):
- name = url_basename(url)
- webpage = self._download_webpage(url, name)
- urn = self._html_search_meta('mediaurn', webpage)
- info_page = self._download_json(
- 'http://rs.vpro.nl/v2/api/media/%s.json' % urn, name)
- return self._get_info(info_page['mid'])
+ playlist_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, playlist_id)
+
+ entries = [
+ self.url_result('npo:%s' % video_id if not video_id.startswith('http') else video_id)
+ for video_id in re.findall(r'data-media-id="([^"]+)"', webpage)
+ ]
+
+ playlist_title = self._search_regex(
+ r'<title>\s*([^>]+?)\s*-\s*Teledoc\s*-\s*VPRO\s*</title>',
+ webpage, 'playlist title', default=None) or self._og_search_title(webpage)
+
+ return self.playlist_result(entries, playlist_id, playlist_title)
+
+
+class WNLIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?omroepwnl\.nl/video/detail/(?P<id>[^/]+)__\d+'
+
+ _TEST = {
+ 'url': 'http://www.omroepwnl.nl/video/detail/vandaag-de-dag-6-mei__060515',
+ 'info_dict': {
+ 'id': 'vandaag-de-dag-6-mei',
+ 'title': 'Vandaag de Dag 6 mei',
+ },
+ 'playlist_count': 4,
+ }
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, playlist_id)
+
+ entries = [
+ self.url_result('npo:%s' % video_id, 'NPO')
+ for video_id, part in re.findall(
+ r'<a[^>]+href="([^"]+)"[^>]+class="js-mid"[^>]*>(Deel \d+)', webpage)
+ ]
+
+ playlist_title = self._html_search_regex(
+ r'(?s)<h1[^>]+class="subject"[^>]*>(.+?)</h1>',
+ webpage, 'playlist title')
+
+ return self.playlist_result(entries, playlist_id, playlist_title)
diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py
index f6de26022..8ac38a174 100644
--- a/youtube_dl/extractor/nrk.py
+++ b/youtube_dl/extractor/nrk.py
@@ -4,56 +4,58 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_urlparse
from ..utils import (
ExtractorError,
float_or_none,
parse_duration,
unified_strdate,
)
-from .subtitles import SubtitlesInfoExtractor
class NRKIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?nrk\.no/(?:video|lyd)/[^/]+/(?P<id>[\dA-F]{16})'
+ _VALID_URL = r'(?:nrk:|https?://(?:www\.)?nrk\.no/video/PS\*)(?P<id>\d+)'
_TESTS = [
{
- 'url': 'http://www.nrk.no/video/dompap_og_andre_fugler_i_piip_show/D0FA54B5C8B6CE59/emne/piipshow/',
- 'md5': 'a6eac35052f3b242bb6bb7f43aed5886',
+ 'url': 'http://www.nrk.no/video/PS*150533',
+ 'md5': 'bccd850baebefe23b56d708a113229c2',
'info_dict': {
'id': '150533',
'ext': 'flv',
'title': 'Dompap og andre fugler i Piip-Show',
- 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f'
+ 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f',
+ 'duration': 263,
}
},
{
- 'url': 'http://www.nrk.no/lyd/lyd_av_oppleser_for_blinde/AEFDDD5473BA0198/',
- 'md5': '3471f2a51718195164e88f46bf427668',
+ 'url': 'http://www.nrk.no/video/PS*154915',
+ 'md5': '0b1493ba1aae7d9579a5ad5531bc395a',
'info_dict': {
'id': '154915',
'ext': 'flv',
'title': 'Slik høres internett ut når du er blind',
'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568',
+ 'duration': 20,
}
},
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
- page = self._download_webpage(url, video_id)
-
- video_id = self._html_search_regex(r'<div class="nrk-video" data-nrk-id="(\d+)">', page, 'video id')
+ video_id = self._match_id(url)
data = self._download_json(
- 'http://v7.psapi.nrk.no/mediaelement/%s' % video_id, video_id, 'Downloading media JSON')
+ 'http://v8.psapi.nrk.no/mediaelement/%s' % video_id,
+ video_id, 'Downloading media JSON')
if data['usageRights']['isGeoBlocked']:
- raise ExtractorError('NRK har ikke rettig-heter til å vise dette programmet utenfor Norge', expected=True)
+ raise ExtractorError(
+ 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge',
+ expected=True)
+
+ video_url = data['mediaUrl'] + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81'
- video_url = data['mediaUrl'] + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124'
+ duration = parse_duration(data.get('duration'))
images = data.get('images')
if images:
@@ -69,16 +71,58 @@ class NRKIE(InfoExtractor):
'ext': 'flv',
'title': data['title'],
'description': data['description'],
+ 'duration': duration,
'thumbnail': thumbnail,
}
-class NRKTVIE(SubtitlesInfoExtractor):
- _VALID_URL = r'(?P<baseurl>http://tv\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?'
+class NRKPlaylistIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video)(?:[^/]+/)+(?P<id>[^/]+)'
+
+ _TESTS = [{
+ 'url': 'http://www.nrk.no/troms/gjenopplev-den-historiske-solformorkelsen-1.12270763',
+ 'info_dict': {
+ 'id': 'gjenopplev-den-historiske-solformorkelsen-1.12270763',
+ 'title': 'Gjenopplev den historiske solformørkelsen',
+ 'description': 'md5:c2df8ea3bac5654a26fc2834a542feed',
+ },
+ 'playlist_count': 2,
+ }, {
+ 'url': 'http://www.nrk.no/kultur/bok/rivertonprisen-til-karin-fossum-1.12266449',
+ 'info_dict': {
+ 'id': 'rivertonprisen-til-karin-fossum-1.12266449',
+ 'title': 'Rivertonprisen til Karin Fossum',
+ 'description': 'Første kvinne på 15 år til å vinne krimlitteraturprisen.',
+ },
+ 'playlist_count': 5,
+ }]
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, playlist_id)
+
+ entries = [
+ self.url_result('nrk:%s' % video_id, 'NRK')
+ for video_id in re.findall(
+ r'class="[^"]*\brich\b[^"]*"[^>]+data-video-id="([^"]+)"',
+ webpage)
+ ]
+
+ playlist_title = self._og_search_title(webpage)
+ playlist_description = self._og_search_description(webpage)
+
+ return self.playlist_result(
+ entries, playlist_id, playlist_title, playlist_description)
+
+
+class NRKTVIE(InfoExtractor):
+ IE_DESC = 'NRK TV and NRK Radio'
+ _VALID_URL = r'(?P<baseurl>https?://(?:tv|radio)\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?'
_TESTS = [
{
- 'url': 'http://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',
+ 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',
'md5': 'adf2c5454fa2bf032f47a9f8fb351342',
'info_dict': {
'id': 'MUHH48000314',
@@ -90,7 +134,7 @@ class NRKTVIE(SubtitlesInfoExtractor):
},
},
{
- 'url': 'http://tv.nrk.no/program/mdfp15000514',
+ 'url': 'https://tv.nrk.no/program/mdfp15000514',
'md5': '383650ece2b25ecec996ad7b5bb2a384',
'info_dict': {
'id': 'mdfp15000514',
@@ -103,7 +147,7 @@ class NRKTVIE(SubtitlesInfoExtractor):
},
{
# single playlist video
- 'url': 'http://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2',
+ 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2',
'md5': 'adbd1dbd813edaf532b0a253780719c2',
'info_dict': {
'id': 'MSPO40010515-part2',
@@ -115,7 +159,7 @@ class NRKTVIE(SubtitlesInfoExtractor):
'skip': 'Only works from Norway',
},
{
- 'url': 'http://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015',
+ 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015',
'playlist': [
{
'md5': '9480285eff92d64f06e02a5367970a7a',
@@ -146,40 +190,22 @@ class NRKTVIE(SubtitlesInfoExtractor):
'duration': 6947.5199999999995,
},
'skip': 'Only works from Norway',
+ },
+ {
+ 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#',
+ 'only_matching': True,
}
]
- def _seconds2str(self, s):
- return '%02d:%02d:%02d.%03d' % (s / 3600, (s % 3600) / 60, s % 60, (s % 1) * 1000)
-
- def _debug_print(self, txt):
- if self._downloader.params.get('verbose', False):
- self.to_screen('[debug] %s' % txt)
-
- def _extract_captions(self, subtitlesurl, video_id, baseurl):
- url = "%s%s" % (baseurl, subtitlesurl)
- self._debug_print('%s: Subtitle url: %s' % (video_id, url))
- captions = self._download_xml(url, video_id, 'Downloading subtitles')
- lang = captions.get('lang', 'no')
- ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/ns/ttml}'))
- srt = ''
- for pos, p in enumerate(ps):
- begin = parse_duration(p.get('begin'))
- duration = parse_duration(p.get('dur'))
- starttime = self._seconds2str(begin)
- endtime = self._seconds2str(begin + duration)
- text = '\n'.join(p.itertext())
- srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), starttime, endtime, text)
- return {lang: srt}
-
def _extract_f4m(self, manifest_url, video_id):
- return self._extract_f4m_formats(manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id)
+ return self._extract_f4m_formats(
+ manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id, f4m_id='hds')
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
part_id = mobj.group('part_id')
- baseurl = mobj.group('baseurl')
+ base_url = mobj.group('baseurl')
webpage = self._download_webpage(url, video_id)
@@ -235,18 +261,18 @@ class NRKTVIE(SubtitlesInfoExtractor):
m3u8_url = re.search(r'data-hls-media="([^"]+)"', webpage)
if m3u8_url:
- formats.extend(self._extract_m3u8_formats(m3u8_url.group(1), video_id, 'mp4'))
+ formats.extend(self._extract_m3u8_formats(m3u8_url.group(1), video_id, 'mp4', m3u8_id='hls'))
self._sort_formats(formats)
subtitles_url = self._html_search_regex(
- r'data-subtitlesurl[ ]*=[ ]*"([^"]+)"',
- webpage, 'subtitle URL', default=None)
- subtitles = None
+ r'data-subtitlesurl\s*=\s*(["\'])(?P<url>.+?)\1',
+ webpage, 'subtitle URL', default=None, group='url')
+ subtitles = {}
if subtitles_url:
- subtitles = self._extract_captions(subtitles_url, video_id, baseurl)
- if self._downloader.params.get('listsubtitles', False):
- self._list_available_subtitles(video_id, subtitles)
- return
+ subtitles['no'] = [{
+ 'ext': 'ttml',
+ 'url': compat_urlparse.urljoin(base_url, subtitles_url),
+ }]
return {
'id': video_id,
diff --git a/youtube_dl/extractor/nuvid.py b/youtube_dl/extractor/nuvid.py
index 57928f2ae..9fa7cefad 100644
--- a/youtube_dl/extractor/nuvid.py
+++ b/youtube_dl/extractor/nuvid.py
@@ -3,11 +3,9 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_request,
-)
from ..utils import (
parse_duration,
+ sanitized_Request,
unified_strdate,
)
@@ -33,7 +31,7 @@ class NuvidIE(InfoExtractor):
formats = []
for dwnld_speed, format_id in [(0, '3gp'), (5, 'mp4')]:
- request = compat_urllib_request.Request(
+ request = sanitized_Request(
'http://m.nuvid.com/play/%s' % video_id)
request.add_header('Cookie', 'skip_download_page=1; dwnld_speed=%d; adv_show=1' % dwnld_speed)
webpage = self._download_webpage(
diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py
index 56e1cad3b..7f254b867 100644
--- a/youtube_dl/extractor/nytimes.py
+++ b/youtube_dl/extractor/nytimes.py
@@ -1,39 +1,22 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import parse_iso8601
-
+from ..utils import (
+ float_or_none,
+ int_or_none,
+ parse_iso8601,
+)
-class NYTimesIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?nytimes\.com/video/(?:[^/]+/)+(?P<id>\d+)'
-
- _TEST = {
- 'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263',
- 'md5': '18a525a510f942ada2720db5f31644c0',
- 'info_dict': {
- 'id': '100000002847155',
- 'ext': 'mov',
- 'title': 'Verbatim: What Is a Photocopier?',
- 'description': 'md5:93603dada88ddbda9395632fdc5da260',
- 'timestamp': 1398631707,
- 'upload_date': '20140427',
- 'uploader': 'Brett Weiner',
- 'duration': 419,
- }
- }
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+class NYTimesBaseIE(InfoExtractor):
+ def _extract_video_from_id(self, video_id):
video_data = self._download_json(
- 'http://www.nytimes.com/svc/video/api/v2/video/%s' % video_id, video_id, 'Downloading video JSON')
+ 'http://www.nytimes.com/svc/video/api/v2/video/%s' % video_id,
+ video_id, 'Downloading video JSON')
title = video_data['headline']
- description = video_data['summary']
- duration = video_data['duration'] / 1000.0
+ description = video_data.get('summary')
+ duration = float_or_none(video_data.get('duration'), 1000)
uploader = video_data['byline']
timestamp = parse_iso8601(video_data['publication_date'][:-8])
@@ -49,11 +32,11 @@ class NYTimesIE(InfoExtractor):
formats = [
{
'url': video['url'],
- 'format_id': video['type'],
- 'vcodec': video['video_codec'],
- 'width': video['width'],
- 'height': video['height'],
- 'filesize': get_file_size(video['fileSize']),
+ 'format_id': video.get('type'),
+ 'vcodec': video.get('video_codec'),
+ 'width': int_or_none(video.get('width')),
+ 'height': int_or_none(video.get('height')),
+ 'filesize': get_file_size(video.get('fileSize')),
} for video in video_data['renditions']
]
self._sort_formats(formats)
@@ -61,7 +44,8 @@ class NYTimesIE(InfoExtractor):
thumbnails = [
{
'url': 'http://www.nytimes.com/%s' % image['url'],
- 'resolution': '%dx%d' % (image['width'], image['height']),
+ 'width': int_or_none(image.get('width')),
+ 'height': int_or_none(image.get('height')),
} for image in video_data['images']
]
@@ -75,3 +59,59 @@ class NYTimesIE(InfoExtractor):
'formats': formats,
'thumbnails': thumbnails,
}
+
+
+class NYTimesIE(NYTimesBaseIE):
+ _VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263',
+ 'md5': '18a525a510f942ada2720db5f31644c0',
+ 'info_dict': {
+ 'id': '100000002847155',
+ 'ext': 'mov',
+ 'title': 'Verbatim: What Is a Photocopier?',
+ 'description': 'md5:93603dada88ddbda9395632fdc5da260',
+ 'timestamp': 1398631707,
+ 'upload_date': '20140427',
+ 'uploader': 'Brett Weiner',
+ 'duration': 419,
+ }
+ }, {
+ 'url': 'http://www.nytimes.com/video/travel/100000003550828/36-hours-in-dubai.html',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ return self._extract_video_from_id(video_id)
+
+
+class NYTimesArticleIE(NYTimesBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?nytimes\.com/(.(?<!video))*?/(?:[^/]+/)*(?P<id>[^.]+)(?:\.html)?'
+ _TESTS = [{
+ 'url': 'http://www.nytimes.com/2015/04/14/business/owner-of-gravity-payments-a-credit-card-processor-is-setting-a-new-minimum-wage-70000-a-year.html?_r=0',
+ 'md5': 'e2076d58b4da18e6a001d53fd56db3c9',
+ 'info_dict': {
+ 'id': '100000003628438',
+ 'ext': 'mov',
+ 'title': 'New Minimum Wage: $70,000 a Year',
+ 'description': 'Dan Price, C.E.O. of Gravity Payments, surprised his 120-person staff by announcing that he planned over the next three years to raise the salary of every employee to $70,000 a year.',
+ 'timestamp': 1429033037,
+ 'upload_date': '20150414',
+ 'uploader': 'Matthew Williams',
+ }
+ }, {
+ 'url': 'http://www.nytimes.com/news/minute/2014/03/17/times-minute-whats-next-in-crimea/?_php=true&_type=blogs&_php=true&_type=blogs&_r=1',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_id = self._html_search_regex(r'data-videoid="(\d+)"', webpage, 'video id')
+
+ return self._extract_video_from_id(video_id)
diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py
new file mode 100644
index 000000000..184c7a323
--- /dev/null
+++ b/youtube_dl/extractor/odnoklassniki.py
@@ -0,0 +1,152 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
+from ..utils import (
+ ExtractorError,
+ unified_strdate,
+ int_or_none,
+ qualities,
+ unescapeHTML,
+)
+
+
+class OdnoklassnikiIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?(?:odnoklassniki|ok)\.ru/(?:video(?:embed)?|web-api/video/moviePlayer)/(?P<id>[\d-]+)'
+ _TESTS = [{
+ # metadata in JSON
+ 'url': 'http://ok.ru/video/20079905452',
+ 'md5': '6ba728d85d60aa2e6dd37c9e70fdc6bc',
+ 'info_dict': {
+ 'id': '20079905452',
+ 'ext': 'mp4',
+ 'title': 'Культура меняет нас (прекрасный ролик!))',
+ 'duration': 100,
+ 'upload_date': '20141207',
+ 'uploader_id': '330537914540',
+ 'uploader': 'Виталий Добровольский',
+ 'like_count': int,
+ 'age_limit': 0,
+ },
+ 'skip': 'Video has been blocked',
+ }, {
+ # metadataUrl
+ 'url': 'http://ok.ru/video/63567059965189-0',
+ 'md5': '9676cf86eff5391d35dea675d224e131',
+ 'info_dict': {
+ 'id': '63567059965189-0',
+ 'ext': 'mp4',
+ 'title': 'Девушка без комплексов ...',
+ 'duration': 191,
+ 'upload_date': '20150518',
+ 'uploader_id': '534380003155',
+ 'uploader': '☭ Андрей Мещанинов ☭',
+ 'like_count': int,
+ 'age_limit': 0,
+ },
+ }, {
+ # YouTube embed (metadataUrl, provider == USER_YOUTUBE)
+ 'url': 'http://ok.ru/video/64211978996595-1',
+ 'md5': '5d7475d428845cd2e13bae6f1a992278',
+ 'info_dict': {
+ 'id': '64211978996595-1',
+ 'ext': 'mp4',
+ 'title': 'Космическая среда от 26 августа 2015',
+ 'description': 'md5:848eb8b85e5e3471a3a803dae1343ed0',
+ 'duration': 440,
+ 'upload_date': '20150826',
+ 'uploader_id': '750099571',
+ 'uploader': 'Алина П',
+ 'age_limit': 0,
+ },
+ }, {
+ 'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.ok.ru/video/20648036891',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.ok.ru/videoembed/20648036891',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(
+ 'http://ok.ru/video/%s' % video_id, video_id)
+
+ error = self._search_regex(
+ r'[^>]+class="vp_video_stub_txt"[^>]*>([^<]+)<',
+ webpage, 'error', default=None)
+ if error:
+ raise ExtractorError(error, expected=True)
+
+ player = self._parse_json(
+ unescapeHTML(self._search_regex(
+ r'data-options=(?P<quote>["\'])(?P<player>{.+?%s.+?})(?P=quote)' % video_id,
+ webpage, 'player', group='player')),
+ video_id)
+
+ flashvars = player['flashvars']
+
+ metadata = flashvars.get('metadata')
+ if metadata:
+ metadata = self._parse_json(metadata, video_id)
+ else:
+ metadata = self._download_json(
+ compat_urllib_parse_unquote(flashvars['metadataUrl']),
+ video_id, 'Downloading metadata JSON')
+
+ movie = metadata['movie']
+ title = movie['title']
+ thumbnail = movie.get('poster')
+ duration = int_or_none(movie.get('duration'))
+
+ author = metadata.get('author', {})
+ uploader_id = author.get('id')
+ uploader = author.get('name')
+
+ upload_date = unified_strdate(self._html_search_meta(
+ 'ya:ovs:upload_date', webpage, 'upload date', default=None))
+
+ age_limit = None
+ adult = self._html_search_meta(
+ 'ya:ovs:adult', webpage, 'age limit', default=None)
+ if adult:
+ age_limit = 18 if adult == 'true' else 0
+
+ like_count = int_or_none(metadata.get('likeCount'))
+
+ info = {
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'upload_date': upload_date,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'like_count': like_count,
+ 'age_limit': age_limit,
+ }
+
+ if metadata.get('provider') == 'USER_YOUTUBE':
+ info.update({
+ '_type': 'url_transparent',
+ 'url': movie['contentId'],
+ })
+ return info
+
+ quality = qualities(('mobile', 'lowest', 'low', 'sd', 'hd'))
+
+ formats = [{
+ 'url': f['url'],
+ 'ext': 'mp4',
+ 'format_id': f['name'],
+ 'quality': quality(f['name']),
+ } for f in metadata['videos']]
+ self._sort_formats(formats)
+
+ info['formats'] = formats
+ return info
diff --git a/youtube_dl/extractor/onionstudios.py b/youtube_dl/extractor/onionstudios.py
new file mode 100644
index 000000000..0f1f448fe
--- /dev/null
+++ b/youtube_dl/extractor/onionstudios.py
@@ -0,0 +1,76 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import determine_ext
+
+
+class OnionStudiosIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?onionstudios\.com/(?:videos/[^/]+-|embed\?.*\bid=)(?P<id>\d+)(?!-)'
+
+ _TESTS = [{
+ 'url': 'http://www.onionstudios.com/videos/hannibal-charges-forward-stops-for-a-cocktail-2937',
+ 'md5': 'd4851405d31adfadf71cd7a487b765bb',
+ 'info_dict': {
+ 'id': '2937',
+ 'ext': 'mp4',
+ 'title': 'Hannibal charges forward, stops for a cocktail',
+ 'description': 'md5:545299bda6abf87e5ec666548c6a9448',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'uploader': 'The A.V. Club',
+ 'uploader_id': 'TheAVClub',
+ },
+ }, {
+ 'url': 'http://www.onionstudios.com/embed?id=2855&autoplay=true',
+ 'only_matching': True,
+ }]
+
+ @staticmethod
+ def _extract_url(webpage):
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?onionstudios\.com/embed.+?)\1', webpage)
+ if mobj:
+ return mobj.group('url')
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(
+ 'http://www.onionstudios.com/embed?id=%s' % video_id, video_id)
+
+ formats = []
+ for src in re.findall(r'<source[^>]+src="([^"]+)"', webpage):
+ if determine_ext(src) != 'm3u8': # m3u8 always results in 403
+ formats.append({
+ 'url': src,
+ })
+ self._sort_formats(formats)
+
+ title = self._search_regex(
+ r'share_title\s*=\s*(["\'])(?P<title>[^\1]+?)\1',
+ webpage, 'title', group='title')
+ description = self._search_regex(
+ r'share_description\s*=\s*(["\'])(?P<description>[^\1]+?)\1',
+ webpage, 'description', default=None, group='description')
+ thumbnail = self._search_regex(
+ r'poster\s*=\s*(["\'])(?P<thumbnail>[^\1]+?)\1',
+ webpage, 'thumbnail', default=False, group='thumbnail')
+
+ uploader_id = self._search_regex(
+ r'twitter_handle\s*=\s*(["\'])(?P<uploader_id>[^\1]+?)\1',
+ webpage, 'uploader id', fatal=False, group='uploader_id')
+ uploader = self._search_regex(
+ r'window\.channelName\s*=\s*(["\'])Embedded:(?P<uploader>[^\1]+?)\1',
+ webpage, 'uploader', default=False, group='uploader')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py
index d5b05c18f..a262a9f6d 100644
--- a/youtube_dl/extractor/ooyala.py
+++ b/youtube_dl/extractor/ooyala.py
@@ -1,63 +1,41 @@
from __future__ import unicode_literals
import re
import json
+import base64
from .common import InfoExtractor
from ..utils import (
unescapeHTML,
ExtractorError,
+ determine_ext,
+ int_or_none,
)
-class OoyalaIE(InfoExtractor):
- _VALID_URL = r'(?:ooyala:|https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=)(?P<id>.+?)(&|$)'
-
- _TESTS = [
- {
- # From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video
- 'url': 'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
- 'info_dict': {
- 'id': 'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
- 'ext': 'mp4',
- 'title': 'Explaining Data Recovery from Hard Drives and SSDs',
- 'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.',
- },
- }, {
- # Only available for ipad
- 'url': 'http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0',
- 'info_dict': {
- 'id': 'x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0',
- 'ext': 'mp4',
- 'title': 'Simulation Overview - Levels of Simulation',
- 'description': '',
- },
- },
- ]
+class OoyalaBaseIE(InfoExtractor):
- @staticmethod
- def _url_for_embed_code(embed_code):
- return 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code
+ def _extract_result(self, info, more_info):
+ embedCode = info['embedCode']
+ video_url = info.get('ipad_url') or info['url']
- @classmethod
- def _build_url_result(cls, embed_code):
- return cls.url_result(cls._url_for_embed_code(embed_code),
- ie=cls.ie_key())
+ if determine_ext(video_url) == 'm3u8':
+ formats = self._extract_m3u8_formats(video_url, embedCode, ext='mp4')
+ else:
+ formats = [{
+ 'url': video_url,
+ 'ext': 'mp4',
+ }]
- def _extract_result(self, info, more_info):
return {
- 'id': info['embedCode'],
- 'ext': 'mp4',
+ 'id': embedCode,
'title': unescapeHTML(info['title']),
- 'url': info.get('ipad_url') or info['url'],
+ 'formats': formats,
'description': unescapeHTML(more_info['description']),
'thumbnail': more_info['promo'],
}
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- embedCode = mobj.group('id')
- player_url = 'http://player.ooyala.com/player.js?embedCode=%s' % embedCode
- player = self._download_webpage(player_url, embedCode)
+ def _extract(self, player_url, video_id):
+ player = self._download_webpage(player_url, video_id)
mobile_url = self._search_regex(r'mobile_player_url="(.+?)&device="',
player, 'mobile player url')
# Looks like some videos are only available for particular devices
@@ -70,13 +48,43 @@ class OoyalaIE(InfoExtractor):
devices.insert(0, 'unknown')
for device in devices:
mobile_player = self._download_webpage(
- '%s&device=%s' % (mobile_url, device), embedCode,
+ '%s&device=%s' % (mobile_url, device), video_id,
'Downloading mobile player JS for %s device' % device)
videos_info = self._search_regex(
r'var streams=window.oo_testEnv\?\[\]:eval\("\((\[{.*?}\])\)"\);',
mobile_player, 'info', fatal=False, default=None)
if videos_info:
break
+
+ if not videos_info:
+ formats = []
+ auth_data = self._download_json(
+ 'http://player.ooyala.com/sas/player_api/v1/authorization/embed_code/%s/%s?domain=www.example.org&supportedFormats=mp4,webm' % (video_id, video_id),
+ video_id)
+
+ cur_auth_data = auth_data['authorization_data'][video_id]
+
+ for stream in cur_auth_data['streams']:
+ formats.append({
+ 'url': base64.b64decode(stream['url']['data'].encode('ascii')).decode('utf-8'),
+ 'ext': stream.get('delivery_type'),
+ 'format': stream.get('video_codec'),
+ 'format_id': stream.get('profile'),
+ 'width': int_or_none(stream.get('width')),
+ 'height': int_or_none(stream.get('height')),
+ 'abr': int_or_none(stream.get('audio_bitrate')),
+ 'vbr': int_or_none(stream.get('video_bitrate')),
+ })
+ if formats:
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'title': 'Ooyala video',
+ }
+
+ if not cur_auth_data['authorized']:
+ raise ExtractorError(cur_auth_data['message'], expected=True)
+
if not videos_info:
raise ExtractorError('Unable to extract info')
videos_info = videos_info.replace('\\"', '"')
@@ -89,9 +97,100 @@ class OoyalaIE(InfoExtractor):
videos = [self._extract_result(info, more_info) for (info, more_info) in zip(videos_info, videos_more_info['lineup'])]
return {
'_type': 'playlist',
- 'id': embedCode,
+ 'id': video_id,
'title': unescapeHTML(videos_more_info['title']),
'entries': videos,
}
else:
return self._extract_result(videos_info[0], videos_more_info)
+
+
+class OoyalaIE(OoyalaBaseIE):
+ _VALID_URL = r'(?:ooyala:|https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=)(?P<id>.+?)(&|$)'
+
+ _TESTS = [
+ {
+ # From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video
+ 'url': 'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
+ 'info_dict': {
+ 'id': 'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
+ 'ext': 'mp4',
+ 'title': 'Explaining Data Recovery from Hard Drives and SSDs',
+ 'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.',
+ },
+ }, {
+ # Only available for ipad
+ 'url': 'http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0',
+ 'info_dict': {
+ 'id': 'x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0',
+ 'ext': 'mp4',
+ 'title': 'Simulation Overview - Levels of Simulation',
+ 'description': '',
+ },
+ },
+ {
+ # Information available only through SAS api
+ # From http://community.plm.automation.siemens.com/t5/News-NX-Manufacturing/Tool-Path-Divide/ba-p/4187
+ 'url': 'http://player.ooyala.com/player.js?embedCode=FiOG81ZTrvckcchQxmalf4aQj590qTEx',
+ 'md5': 'a84001441b35ea492bc03736e59e7935',
+ 'info_dict': {
+ 'id': 'FiOG81ZTrvckcchQxmalf4aQj590qTEx',
+ 'ext': 'mp4',
+ 'title': 'Ooyala video',
+ }
+ }
+ ]
+
+ @staticmethod
+ def _url_for_embed_code(embed_code):
+ return 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code
+
+ @classmethod
+ def _build_url_result(cls, embed_code):
+ return cls.url_result(cls._url_for_embed_code(embed_code),
+ ie=cls.ie_key())
+
+ def _real_extract(self, url):
+ embed_code = self._match_id(url)
+ player_url = 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code
+ return self._extract(player_url, embed_code)
+
+
+class OoyalaExternalIE(OoyalaBaseIE):
+ _VALID_URL = r'''(?x)
+ (?:
+ ooyalaexternal:|
+ https?://.+?\.ooyala\.com/.*?\bexternalId=
+ )
+ (?P<partner_id>[^:]+)
+ :
+ (?P<id>.+)
+ (?:
+ :|
+ .*?&pcode=
+ )
+ (?P<pcode>.+?)
+ (&|$)
+ '''
+
+ _TEST = {
+ 'url': 'https://player.ooyala.com/player.js?externalId=espn:10365079&pcode=1kNG061cgaoolOncv54OAO1ceO-I&adSetCode=91cDU6NuXTGKz3OdjOxFdAgJVtQcKJnI&callback=handleEvents&hasModuleParams=1&height=968&playerBrandingId=7af3bd04449c444c964f347f11873075&targetReplaceId=videoPlayer&width=1656&wmode=opaque&allowScriptAccess=always',
+ 'info_dict': {
+ 'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG',
+ 'ext': 'mp4',
+ 'title': 'dm_140128_30for30Shorts___JudgingJewellv2',
+ 'description': '',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ partner_id = mobj.group('partner_id')
+ video_id = mobj.group('id')
+ pcode = mobj.group('pcode')
+ player_url = 'http://player.ooyala.com/player.js?externalId=%s:%s&pcode=%s' % (partner_id, video_id, pcode)
+ return self._extract(player_url, video_id)
diff --git a/youtube_dl/extractor/openfilm.py b/youtube_dl/extractor/openfilm.py
deleted file mode 100644
index 2249657eb..000000000
--- a/youtube_dl/extractor/openfilm.py
+++ /dev/null
@@ -1,70 +0,0 @@
-from __future__ import unicode_literals
-
-import json
-
-from .common import InfoExtractor
-from ..utils import (
- parse_iso8601,
- compat_urllib_parse,
- parse_age_limit,
- int_or_none,
-)
-
-
-class OpenFilmIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)openfilm\.com/videos/(?P<id>.+)'
- _TEST = {
- 'url': 'http://www.openfilm.com/videos/human-resources-remastered',
- 'md5': '42bcd88c2f3ec13b65edf0f8ad1cac37',
- 'info_dict': {
- 'id': '32736',
- 'display_id': 'human-resources-remastered',
- 'ext': 'mp4',
- 'title': 'Human Resources (Remastered)',
- 'description': 'Social Engineering in the 20th Century.',
- 'thumbnail': 're:^https?://.*\.jpg$',
- 'duration': 7164,
- 'timestamp': 1334756988,
- 'upload_date': '20120418',
- 'uploader_id': '41117',
- 'view_count': int,
- 'age_limit': 0,
- },
- }
-
- def _real_extract(self, url):
- display_id = self._match_id(url)
-
- webpage = self._download_webpage(url, display_id)
-
- player = compat_urllib_parse.unquote_plus(
- self._og_search_video_url(webpage))
-
- video = json.loads(self._search_regex(
- r'\bp=({.+?})(?:&|$)', player, 'video JSON'))
-
- video_url = '%s1.mp4' % video['location']
- video_id = video.get('video_id')
- display_id = video.get('alias') or display_id
- title = video.get('title')
- description = video.get('description')
- thumbnail = video.get('main_thumb')
- duration = int_or_none(video.get('duration'))
- timestamp = parse_iso8601(video.get('dt_published'), ' ')
- uploader_id = video.get('user_id')
- view_count = int_or_none(video.get('views_count'))
- age_limit = parse_age_limit(video.get('age_limit'))
-
- return {
- 'id': video_id,
- 'display_id': display_id,
- 'url': video_url,
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'duration': duration,
- 'timestamp': timestamp,
- 'uploader_id': uploader_id,
- 'view_count': view_count,
- 'age_limit': age_limit,
- }
diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py
index 4e293392b..2e6c9872b 100644
--- a/youtube_dl/extractor/orf.py
+++ b/youtube_dl/extractor/orf.py
@@ -11,6 +11,11 @@ from ..utils import (
HEADRequest,
unified_strdate,
ExtractorError,
+ strip_jsonp,
+ int_or_none,
+ float_or_none,
+ determine_ext,
+ remove_end,
)
@@ -197,3 +202,92 @@ class ORFFM4IE(InfoExtractor):
'description': data['subtitle'],
'entries': entries
}
+
+
+class ORFIPTVIE(InfoExtractor):
+ IE_NAME = 'orf:iptv'
+ IE_DESC = 'iptv.ORF.at'
+ _VALID_URL = r'http://iptv\.orf\.at/(?:#/)?stories/(?P<id>\d+)'
+
+ _TEST = {
+ 'url': 'http://iptv.orf.at/stories/2275236/',
+ 'md5': 'c8b22af4718a4b4af58342529453e3e5',
+ 'info_dict': {
+ 'id': '350612',
+ 'ext': 'flv',
+ 'title': 'Weitere Evakuierungen um Vulkan Calbuco',
+ 'description': 'md5:d689c959bdbcf04efeddedbf2299d633',
+ 'duration': 68.197,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'upload_date': '20150425',
+ },
+ }
+
+ def _real_extract(self, url):
+ story_id = self._match_id(url)
+
+ webpage = self._download_webpage(
+ 'http://iptv.orf.at/stories/%s' % story_id, story_id)
+
+ video_id = self._search_regex(
+ r'data-video(?:id)?="(\d+)"', webpage, 'video id')
+
+ data = self._download_json(
+ 'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id,
+ video_id)[0]
+
+ duration = float_or_none(data['duration'], 1000)
+
+ video = data['sources']['default']
+ load_balancer_url = video['loadBalancerUrl']
+ abr = int_or_none(video.get('audioBitrate'))
+ vbr = int_or_none(video.get('bitrate'))
+ fps = int_or_none(video.get('videoFps'))
+ width = int_or_none(video.get('videoWidth'))
+ height = int_or_none(video.get('videoHeight'))
+ thumbnail = video.get('preview')
+
+ rendition = self._download_json(
+ load_balancer_url, video_id, transform_source=strip_jsonp)
+
+ f = {
+ 'abr': abr,
+ 'vbr': vbr,
+ 'fps': fps,
+ 'width': width,
+ 'height': height,
+ }
+
+ formats = []
+ for format_id, format_url in rendition['redirect'].items():
+ if format_id == 'rtmp':
+ ff = f.copy()
+ ff.update({
+ 'url': format_url,
+ 'format_id': format_id,
+ })
+ formats.append(ff)
+ elif determine_ext(format_url) == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ format_url, video_id, f4m_id=format_id))
+ elif determine_ext(format_url) == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ format_url, video_id, 'mp4', m3u8_id=format_id))
+ else:
+ continue
+ self._sort_formats(formats)
+
+ title = remove_end(self._og_search_title(webpage), ' - iptv.ORF.at')
+ description = self._og_search_description(webpage)
+ upload_date = unified_strdate(self._html_search_meta(
+ 'dc.date', webpage, 'upload date'))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ 'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py
index 5429592a7..ec8876c28 100644
--- a/youtube_dl/extractor/patreon.py
+++ b/youtube_dl/extractor/patreon.py
@@ -1,17 +1,12 @@
# encoding: utf-8
from __future__ import unicode_literals
-import json
-import re
-
from .common import InfoExtractor
-from ..utils import (
- js_to_json,
-)
+from ..utils import js_to_json
class PatreonIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?patreon\.com/creation\?hid=(.+)'
+ _VALID_URL = r'https?://(?:www\.)?patreon\.com/creation\?hid=(?P<id>[^&#]+)'
_TESTS = [
{
'url': 'http://www.patreon.com/creation?hid=743933',
@@ -35,6 +30,23 @@ class PatreonIE(InfoExtractor):
'thumbnail': 're:^https?://.*$',
},
},
+ {
+ 'url': 'https://www.patreon.com/creation?hid=1682498',
+ 'info_dict': {
+ 'id': 'SU4fj_aEMVw',
+ 'ext': 'mp4',
+ 'title': 'I\'m on Patreon!',
+ 'uploader': 'TraciJHines',
+ 'thumbnail': 're:^https?://.*$',
+ 'upload_date': '20150211',
+ 'description': 'md5:c5a706b1f687817a3de09db1eb93acd4',
+ 'uploader_id': 'TraciJHines',
+ },
+ 'params': {
+ 'noplaylist': True,
+ 'skip_download': True,
+ }
+ }
]
# Currently Patreon exposes download URL via hidden CSS, so login is not
@@ -51,7 +63,7 @@ class PatreonIE(InfoExtractor):
'password': password,
}
- request = compat_urllib_request.Request(
+ request = sanitized_Request(
'https://www.patreon.com/processLogin',
compat_urllib_parse.urlencode(login_form).encode('utf-8')
)
@@ -65,26 +77,29 @@ class PatreonIE(InfoExtractor):
'''
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group(1)
-
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = self._og_search_title(webpage).strip()
attach_fn = self._html_search_regex(
r'<div class="attach"><a target="_blank" href="([^"]+)">',
webpage, 'attachment URL', default=None)
+ embed = self._html_search_regex(
+ r'<div[^>]+id="watchCreation"[^>]*>\s*<iframe[^>]+src="([^"]+)"',
+ webpage, 'embedded URL', default=None)
+
if attach_fn is not None:
video_url = 'http://www.patreon.com' + attach_fn
thumbnail = self._og_search_thumbnail(webpage)
uploader = self._html_search_regex(
r'<strong>(.*?)</strong> is creating', webpage, 'uploader')
+ elif embed is not None:
+ return self.url_result(embed)
else:
- playlist_js = self._search_regex(
+ playlist = self._parse_json(self._search_regex(
r'(?s)new\s+jPlayerPlaylist\(\s*\{\s*[^}]*},\s*(\[.*?,?\s*\])',
- webpage, 'playlist JSON')
- playlist_json = js_to_json(playlist_js)
- playlist = json.loads(playlist_json)
+ webpage, 'playlist JSON'),
+ video_id, transform_source=js_to_json)
data = playlist[0]
video_url = self._proto_relative_url(data['mp3'])
thumbnail = self._proto_relative_url(data.get('cover'))
diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py
index afce732e1..b787e2a73 100644
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@@ -1,3 +1,4 @@
+# coding: utf-8
from __future__ import unicode_literals
import re
@@ -5,6 +6,9 @@ import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
+ determine_ext,
+ int_or_none,
+ strip_jsonp,
unified_strdate,
US_RATINGS,
)
@@ -18,7 +22,7 @@ class PBSIE(InfoExtractor):
# Article with embedded player (or direct video)
(?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |
# Player
- video\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/
+ (?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/
)
'''
@@ -29,10 +33,13 @@ class PBSIE(InfoExtractor):
'info_dict': {
'id': '2365006249',
'ext': 'mp4',
- 'title': 'A More Perfect Union',
+ 'title': 'Constitution USA with Peter Sagal - A More Perfect Union',
'description': 'md5:ba0c207295339c8d6eced00b7c363c6a',
'duration': 3190,
},
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ },
},
{
'url': 'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/',
@@ -40,10 +47,13 @@ class PBSIE(InfoExtractor):
'info_dict': {
'id': '2365297690',
'ext': 'mp4',
- 'title': 'Losing Iraq',
+ 'title': 'FRONTLINE - Losing Iraq',
'description': 'md5:f5bfbefadf421e8bb8647602011caf8e',
'duration': 5050,
},
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ }
},
{
'url': 'http://www.pbs.org/newshour/bb/education-jan-june12-cyberschools_02-23/',
@@ -51,7 +61,7 @@ class PBSIE(InfoExtractor):
'info_dict': {
'id': '2201174722',
'ext': 'mp4',
- 'title': 'Cyber Schools Gain Popularity, but Quality Questions Persist',
+ 'title': 'PBS NewsHour - Cyber Schools Gain Popularity, but Quality Questions Persist',
'description': 'md5:5871c15cba347c1b3d28ac47a73c7c28',
'duration': 801,
},
@@ -63,10 +73,13 @@ class PBSIE(InfoExtractor):
'id': '2365297708',
'ext': 'mp4',
'description': 'md5:68d87ef760660eb564455eb30ca464fe',
- 'title': 'Dudamel Conducts Verdi Requiem at the Hollywood Bowl - Full',
+ 'title': 'Great Performances - Dudamel Conducts Verdi Requiem at the Hollywood Bowl - Full',
'duration': 6559,
'thumbnail': 're:^https?://.*\.jpg$',
- }
+ },
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ },
},
{
'url': 'http://www.pbs.org/wgbh/nova/earth/killer-typhoon.html',
@@ -76,11 +89,15 @@ class PBSIE(InfoExtractor):
'display_id': 'killer-typhoon',
'ext': 'mp4',
'description': 'md5:c741d14e979fc53228c575894094f157',
- 'title': 'Killer Typhoon',
+ 'title': 'NOVA - Killer Typhoon',
'duration': 3172,
'thumbnail': 're:^https?://.*\.jpg$',
'upload_date': '20140122',
- }
+ 'age_limit': 10,
+ },
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ },
},
{
'url': 'http://www.pbs.org/wgbh/pages/frontline/united-states-of-secrets/',
@@ -88,8 +105,83 @@ class PBSIE(InfoExtractor):
'id': 'united-states-of-secrets',
},
'playlist_count': 2,
+ },
+ {
+ 'url': 'http://www.pbs.org/wgbh/americanexperience/films/death/player/',
+ 'info_dict': {
+ 'id': '2276541483',
+ 'display_id': 'player',
+ 'ext': 'mp4',
+ 'title': 'American Experience - Death and the Civil War, Chapter 1',
+ 'description': 'American Experience, TV’s most-watched history series, brings to life the compelling stories from our past that inform our understanding of the world today.',
+ 'duration': 682,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ },
+ },
+ {
+ 'url': 'http://video.pbs.org/video/2365367186/',
+ 'info_dict': {
+ 'id': '2365367186',
+ 'display_id': '2365367186',
+ 'ext': 'mp4',
+ 'title': 'To Catch A Comet - Full Episode',
+ 'description': 'On November 12, 2014, billions of kilometers from Earth, spacecraft orbiter Rosetta and lander Philae did what no other had dared to attempt \u2014 land on the volatile surface of a comet as it zooms around the sun at 67,000 km/hr. The European Space Agency hopes this mission can help peer into our past and unlock secrets of our origins.',
+ 'duration': 3342,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ },
+ 'skip': 'Expired',
+ },
+ {
+ # Video embedded in iframe containing angle brackets as attribute's value (e.g.
+ # "<iframe style='position: absolute;<br />\ntop: 0; left: 0;' ...", see
+ # https://github.com/rg3/youtube-dl/issues/7059)
+ 'url': 'http://www.pbs.org/food/features/a-chefs-life-season-3-episode-5-prickly-business/',
+ 'info_dict': {
+ 'id': '2365546844',
+ 'display_id': 'a-chefs-life-season-3-episode-5-prickly-business',
+ 'ext': 'mp4',
+ 'title': "A Chef's Life - Season 3, Ep. 5: Prickly Business",
+ 'description': 'md5:61db2ddf27c9912f09c241014b118ed1',
+ 'duration': 1480,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ },
+ },
+ {
+ # Frontline video embedded via flp2012.js
+ 'url': 'http://www.pbs.org/wgbh/pages/frontline/the-atomic-artists',
+ 'info_dict': {
+ 'id': '2070868960',
+ 'display_id': 'the-atomic-artists',
+ 'ext': 'mp4',
+ 'title': 'FRONTLINE - The Atomic Artists',
+ 'description': 'md5:f5bfbefadf421e8bb8647602011caf8e',
+ 'duration': 723,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ },
+ },
+ {
+ 'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true',
+ 'only_matching': True,
}
]
+ _ERRORS = {
+ 101: 'We\'re sorry, but this video is not yet available.',
+ 403: 'We\'re sorry, but this video is not available in your region due to right restrictions.',
+ 404: 'We are experiencing technical difficulties that are preventing us from playing the video at this time. Please check back again soon.',
+ 410: 'This video has expired and is no longer available for online streaming.',
+ }
def _extract_webpage(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -120,9 +212,30 @@ class PBSIE(InfoExtractor):
if media_id:
return media_id, presumptive_id, upload_date
- url = self._search_regex(
- r'<iframe\s+(?:class|id)=["\']partnerPlayer["\'].*?\s+src=["\'](.*?)["\']>',
- webpage, 'player URL')
+ # Frontline video embedded via flp
+ video_id = self._search_regex(
+ r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid', default=None)
+ if video_id:
+ # prg_id calculation is reverse engineered from
+ # http://www.pbs.org/wgbh/pages/frontline/js/flp2012.js
+ prg_id = self._search_regex(
+ r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid')[7:]
+ if 'q' in prg_id:
+ prg_id = prg_id.split('q')[1]
+ prg_id = int(prg_id, 16)
+ getdir = self._download_json(
+ 'http://www.pbs.org/wgbh/pages/frontline/.json/getdir/getdir%d.json' % prg_id,
+ presumptive_id, 'Downloading getdir JSON',
+ transform_source=strip_jsonp)
+ return getdir['mid'], presumptive_id, upload_date
+
+ for iframe in re.findall(r'(?s)<iframe(.+?)></iframe>', webpage):
+ url = self._search_regex(
+ r'src=(["\'])(?P<url>.+?partnerplayer.+?)\1', iframe,
+ 'player URL', default=None, group='url')
+ if url:
+ break
+
mobj = re.match(self._VALID_URL, url)
player_id = mobj.group('player_id')
@@ -149,36 +262,72 @@ class PBSIE(InfoExtractor):
for vid_id in video_id]
return self.playlist_result(entries, display_id)
- info_url = 'http://video.pbs.org/videoInfo/%s?format=json' % video_id
- info = self._download_json(info_url, display_id)
-
- redirect_url = info['alternate_encoding']['url']
- redirect_info = self._download_json(
- redirect_url + '?format=json', display_id,
- 'Downloading video url info')
- if redirect_info['status'] == 'error':
- if redirect_info['http_code'] == 403:
- message = (
- 'The video is not available in your region due to '
- 'right restrictions')
+ info = self._download_json(
+ 'http://player.pbs.org/videoInfo/%s?format=json&type=partner' % video_id,
+ display_id)
+
+ formats = []
+ for encoding_name in ('recommended_encoding', 'alternate_encoding'):
+ redirect = info.get(encoding_name)
+ if not redirect:
+ continue
+ redirect_url = redirect.get('url')
+ if not redirect_url:
+ continue
+
+ redirect_info = self._download_json(
+ redirect_url + '?format=json', display_id,
+ 'Downloading %s video url info' % encoding_name)
+
+ if redirect_info['status'] == 'error':
+ raise ExtractorError(
+ '%s said: %s' % (
+ self.IE_NAME,
+ self._ERRORS.get(redirect_info['http_code'], redirect_info['message'])),
+ expected=True)
+
+ format_url = redirect_info.get('url')
+ if not format_url:
+ continue
+
+ if determine_ext(format_url) == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ format_url, display_id, 'mp4', preference=1, m3u8_id='hls'))
else:
- message = redirect_info['message']
- raise ExtractorError(message, expected=True)
+ formats.append({
+ 'url': format_url,
+ 'format_id': redirect.get('eeid'),
+ })
+ self._sort_formats(formats)
rating_str = info.get('rating')
if rating_str is not None:
rating_str = rating_str.rpartition('-')[2]
age_limit = US_RATINGS.get(rating_str)
+ subtitles = {}
+ closed_captions_url = info.get('closed_captions_url')
+ if closed_captions_url:
+ subtitles['en'] = [{
+ 'ext': 'ttml',
+ 'url': closed_captions_url,
+ }]
+
+ # info['title'] is often incomplete (e.g. 'Full Episode', 'Episode 5', etc)
+ # Try turning it into the 'program - title' naming scheme if possible
+ alt_title = info.get('program', {}).get('title')
+ if alt_title:
+ info['title'] = alt_title + ' - ' + re.sub(r'^' + alt_title + '[\s\-:]+', '', info['title'])
+
return {
'id': video_id,
'display_id': display_id,
'title': info['title'],
- 'url': redirect_info['url'],
- 'ext': 'mp4',
'description': info['program'].get('description'),
'thumbnail': info.get('image_url'),
- 'duration': info.get('duration'),
+ 'duration': int_or_none(info.get('duration')),
'age_limit': age_limit,
'upload_date': upload_date,
+ 'formats': formats,
+ 'subtitles': subtitles,
}
diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py
new file mode 100644
index 000000000..63cc764bb
--- /dev/null
+++ b/youtube_dl/extractor/periscope.py
@@ -0,0 +1,82 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import parse_iso8601
+
+
+class PeriscopeIE(InfoExtractor):
+ IE_DESC = 'Periscope'
+ _VALID_URL = r'https?://(?:www\.)?periscope\.tv/[^/]+/(?P<id>[^/?#]+)'
+ # Live example URLs can be found at http://onperiscope.com/
+ _TESTS = [{
+ 'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==',
+ 'md5': '65b57957972e503fcbbaeed8f4fa04ca',
+ 'info_dict': {
+ 'id': '56102209',
+ 'ext': 'mp4',
+ 'title': 'Bec Boop - 🚠✈️🇬🇧 Fly above #London in Emirates Air Line cable car at night 🇬🇧✈️🚠 #BoopScope 🎀💗',
+ 'timestamp': 1438978559,
+ 'upload_date': '20150807',
+ 'uploader': 'Bec Boop',
+ 'uploader_id': '1465763',
+ },
+ 'skip': 'Expires in 24 hours',
+ }, {
+ 'url': 'https://www.periscope.tv/w/1ZkKzPbMVggJv',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.periscope.tv/bastaakanoggano/1OdKrlkZZjOJX',
+ 'only_matching': True,
+ }]
+
+ def _call_api(self, method, value):
+ attribute = 'token' if len(value) > 13 else 'broadcast_id'
+ return self._download_json(
+ 'https://api.periscope.tv/api/v2/%s?%s=%s' % (method, attribute, value), value)
+
+ def _real_extract(self, url):
+ token = self._match_id(url)
+
+ broadcast_data = self._call_api('getBroadcastPublic', token)
+ broadcast = broadcast_data['broadcast']
+ status = broadcast['status']
+
+ uploader = broadcast.get('user_display_name') or broadcast_data.get('user', {}).get('display_name')
+ uploader_id = broadcast.get('user_id') or broadcast_data.get('user', {}).get('id')
+
+ title = '%s - %s' % (uploader, status) if uploader else status
+ state = broadcast.get('state').lower()
+ if state == 'running':
+ title = self._live_title(title)
+ timestamp = parse_iso8601(broadcast.get('created_at'))
+
+ thumbnails = [{
+ 'url': broadcast[image],
+ } for image in ('image_url', 'image_url_small') if broadcast.get(image)]
+
+ stream = self._call_api('getAccessPublic', token)
+
+ formats = []
+ for format_id in ('replay', 'rtmp', 'hls', 'https_hls'):
+ video_url = stream.get(format_id + '_url')
+ if not video_url:
+ continue
+ f = {
+ 'url': video_url,
+ 'ext': 'flv' if format_id == 'rtmp' else 'mp4',
+ }
+ if format_id != 'rtmp':
+ f['protocol'] = 'm3u8_native' if state == 'ended' else 'm3u8'
+ formats.append(f)
+ self._sort_formats(formats)
+
+ return {
+ 'id': broadcast.get('id') or token,
+ 'title': title,
+ 'timestamp': timestamp,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'thumbnails': thumbnails,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/philharmoniedeparis.py b/youtube_dl/extractor/philharmoniedeparis.py
new file mode 100644
index 000000000..6e60e5fe9
--- /dev/null
+++ b/youtube_dl/extractor/philharmoniedeparis.py
@@ -0,0 +1,78 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ float_or_none,
+ int_or_none,
+ parse_iso8601,
+ xpath_text,
+)
+
+
+class PhilharmonieDeParisIE(InfoExtractor):
+ IE_DESC = 'Philharmonie de Paris'
+ _VALID_URL = r'http://live\.philharmoniedeparis\.fr/(?:[Cc]oncert/|misc/Playlist\.ashx\?id=)(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'http://live.philharmoniedeparis.fr/concert/1032066.html',
+ 'info_dict': {
+ 'id': '1032066',
+ 'ext': 'flv',
+ 'title': 'md5:d1f5585d87d041d07ce9434804bc8425',
+ 'timestamp': 1428179400,
+ 'upload_date': '20150404',
+ 'duration': 6592.278,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://live.philharmoniedeparis.fr/Concert/1030324.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://live.philharmoniedeparis.fr/misc/Playlist.ashx?id=1030324&track=&lang=fr',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ concert = self._download_xml(
+ 'http://live.philharmoniedeparis.fr/misc/Playlist.ashx?id=%s' % video_id,
+ video_id).find('./concert')
+
+ formats = []
+ info_dict = {
+ 'id': video_id,
+ 'title': xpath_text(concert, './titre', 'title', fatal=True),
+ 'formats': formats,
+ }
+
+ fichiers = concert.find('./fichiers')
+ stream = fichiers.attrib['serveurstream']
+ for fichier in fichiers.findall('./fichier'):
+ info_dict['duration'] = float_or_none(fichier.get('timecodefin'))
+ for quality, (format_id, suffix) in enumerate([('lq', ''), ('hq', '_hd')]):
+ format_url = fichier.get('url%s' % suffix)
+ if not format_url:
+ continue
+ formats.append({
+ 'url': stream,
+ 'play_path': format_url,
+ 'ext': 'flv',
+ 'format_id': format_id,
+ 'width': int_or_none(concert.get('largeur%s' % suffix)),
+ 'height': int_or_none(concert.get('hauteur%s' % suffix)),
+ 'quality': quality,
+ })
+ self._sort_formats(formats)
+
+ date, hour = concert.get('date'), concert.get('heure')
+ if date and hour:
+ info_dict['timestamp'] = parse_iso8601(
+ '%s-%s-%sT%s:00' % (date[0:4], date[4:6], date[6:8], hour))
+ elif date:
+ info_dict['upload_date'] = date
+
+ return info_dict
diff --git a/youtube_dl/extractor/phoenix.py b/youtube_dl/extractor/phoenix.py
index a20672c0c..46cebc0d7 100644
--- a/youtube_dl/extractor/phoenix.py
+++ b/youtube_dl/extractor/phoenix.py
@@ -5,19 +5,33 @@ from .zdf import extract_from_xml_url
class PhoenixIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?phoenix\.de/content/(?P<id>[0-9]+)'
- _TEST = {
- 'url': 'http://www.phoenix.de/content/884301',
- 'md5': 'ed249f045256150c92e72dbb70eadec6',
- 'info_dict': {
- 'id': '884301',
- 'ext': 'mp4',
- 'title': 'Michael Krons mit Hans-Werner Sinn',
- 'description': 'Im Dialog - Sa. 25.10.14, 00.00 - 00.35 Uhr',
- 'upload_date': '20141025',
- 'uploader': 'Im Dialog',
- }
- }
+ _VALID_URL = r'''(?x)https?://(?:www\.)?phoenix\.de/content/
+ (?:
+ phoenix/die_sendungen/(?:[^/]+/)?
+ )?
+ (?P<id>[0-9]+)'''
+ _TESTS = [
+ {
+ 'url': 'http://www.phoenix.de/content/884301',
+ 'md5': 'ed249f045256150c92e72dbb70eadec6',
+ 'info_dict': {
+ 'id': '884301',
+ 'ext': 'mp4',
+ 'title': 'Michael Krons mit Hans-Werner Sinn',
+ 'description': 'Im Dialog - Sa. 25.10.14, 00.00 - 00.35 Uhr',
+ 'upload_date': '20141025',
+ 'uploader': 'Im Dialog',
+ }
+ },
+ {
+ 'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/869815',
+ 'only_matching': True,
+ },
+ {
+ 'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/diskussionen/928234',
+ 'only_matching': True,
+ },
+ ]
def _real_extract(self, url):
video_id = self._match_id(url)
diff --git a/youtube_dl/extractor/photobucket.py b/youtube_dl/extractor/photobucket.py
index c66db3cdc..788411ccc 100644
--- a/youtube_dl/extractor/photobucket.py
+++ b/youtube_dl/extractor/photobucket.py
@@ -4,7 +4,7 @@ import json
import re
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
+from ..compat import compat_urllib_parse_unquote
class PhotobucketIE(InfoExtractor):
@@ -34,7 +34,7 @@ class PhotobucketIE(InfoExtractor):
info_json = self._search_regex(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (.*?)\);',
webpage, 'info json')
info = json.loads(info_json)
- url = compat_urllib_parse.unquote(self._html_search_regex(r'file=(.+\.mp4)', info['linkcodes']['html'], 'url'))
+ url = compat_urllib_parse_unquote(self._html_search_regex(r'file=(.+\.mp4)', info['linkcodes']['html'], 'url'))
return {
'id': video_id,
'url': url,
diff --git a/youtube_dl/extractor/pinkbike.py b/youtube_dl/extractor/pinkbike.py
new file mode 100644
index 000000000..a52210fab
--- /dev/null
+++ b/youtube_dl/extractor/pinkbike.py
@@ -0,0 +1,96 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ remove_end,
+ remove_start,
+ str_to_int,
+ unified_strdate,
+)
+
+
+class PinkbikeIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:(?:www\.)?pinkbike\.com/video/|es\.pinkbike\.org/i/kvid/kvid-y5\.swf\?id=)(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'http://www.pinkbike.com/video/402811/',
+ 'md5': '4814b8ca7651034cd87e3361d5c2155a',
+ 'info_dict': {
+ 'id': '402811',
+ 'ext': 'mp4',
+ 'title': 'Brandon Semenuk - RAW 100',
+ 'description': 'Official release: www.redbull.ca/rupertwalker',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 100,
+ 'upload_date': '20150406',
+ 'uploader': 'revelco',
+ 'location': 'Victoria, British Columbia, Canada',
+ 'view_count': int,
+ 'comment_count': int,
+ }
+ }, {
+ 'url': 'http://es.pinkbike.org/i/kvid/kvid-y5.swf?id=406629',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(
+ 'http://www.pinkbike.com/video/%s' % video_id, video_id)
+
+ formats = []
+ for _, format_id, src in re.findall(
+ r'data-quality=((?:\\)?["\'])(.+?)\1[^>]+src=\1(.+?)\1', webpage):
+ height = int_or_none(self._search_regex(
+ r'^(\d+)[pP]$', format_id, 'height', default=None))
+ formats.append({
+ 'url': src,
+ 'format_id': format_id,
+ 'height': height,
+ })
+ self._sort_formats(formats)
+
+ title = remove_end(self._og_search_title(webpage), ' Video - Pinkbike')
+ description = self._html_search_regex(
+ r'(?s)id="media-description"[^>]*>(.+?)<',
+ webpage, 'description', default=None) or remove_start(
+ self._og_search_description(webpage), title + '. ')
+ thumbnail = self._og_search_thumbnail(webpage)
+ duration = int_or_none(self._html_search_meta(
+ 'video:duration', webpage, 'duration'))
+
+ uploader = self._search_regex(
+ r'un:\s*"([^"]+)"', webpage, 'uploader', fatal=False)
+ upload_date = unified_strdate(self._search_regex(
+ r'class="fullTime"[^>]+title="([^"]+)"',
+ webpage, 'upload date', fatal=False))
+
+ location = self._html_search_regex(
+ r'(?s)<dt>Location</dt>\s*<dd>(.+?)<',
+ webpage, 'location', fatal=False)
+
+ def extract_count(webpage, label):
+ return str_to_int(self._search_regex(
+ r'<span[^>]+class="stat-num"[^>]*>([\d,.]+)</span>\s*<span[^>]+class="stat-label"[^>]*>%s' % label,
+ webpage, label, fatal=False))
+
+ view_count = extract_count(webpage, 'Views')
+ comment_count = extract_count(webpage, 'Comments')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'upload_date': upload_date,
+ 'uploader': uploader,
+ 'location': location,
+ 'view_count': view_count,
+ 'comment_count': comment_count,
+ 'formats': formats
+ }
diff --git a/youtube_dl/extractor/pladform.py b/youtube_dl/extractor/pladform.py
new file mode 100644
index 000000000..551c8c9f0
--- /dev/null
+++ b/youtube_dl/extractor/pladform.py
@@ -0,0 +1,90 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ xpath_text,
+ qualities,
+)
+
+
+class PladformIE(InfoExtractor):
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:
+ (?:
+ out\.pladform\.ru/player|
+ static\.pladform\.ru/player\.swf
+ )
+ \?.*\bvideoid=|
+ video\.pladform\.ru/catalog/video/videoid/
+ )
+ (?P<id>\d+)
+ '''
+ _TESTS = [{
+ # http://muz-tv.ru/kinozal/view/7400/
+ 'url': 'http://out.pladform.ru/player?pl=24822&videoid=100183293',
+ 'md5': '61f37b575dd27f1bb2e1854777fe31f4',
+ 'info_dict': {
+ 'id': '100183293',
+ 'ext': 'mp4',
+ 'title': 'Тайны перевала Дятлова • 1 серия 2 часть',
+ 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 694,
+ 'age_limit': 0,
+ },
+ }, {
+ 'url': 'http://static.pladform.ru/player.swf?pl=21469&videoid=100183293&vkcid=0',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://video.pladform.ru/catalog/video/videoid/100183293/vkcid/0',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ video = self._download_xml(
+ 'http://out.pladform.ru/getVideo?pl=1&videoid=%s' % video_id,
+ video_id)
+
+ if video.tag == 'error':
+ raise ExtractorError(
+ '%s returned error: %s' % (self.IE_NAME, video.text),
+ expected=True)
+
+ quality = qualities(('ld', 'sd', 'hd'))
+
+ formats = [{
+ 'url': src.text,
+ 'format_id': src.get('quality'),
+ 'quality': quality(src.get('quality')),
+ } for src in video.findall('./src')]
+ self._sort_formats(formats)
+
+ webpage = self._download_webpage(
+ 'http://video.pladform.ru/catalog/video/videoid/%s' % video_id,
+ video_id)
+
+ title = self._og_search_title(webpage, fatal=False) or xpath_text(
+ video, './/title', 'title', fatal=True)
+ description = self._search_regex(
+ r'</h3>\s*<p>([^<]+)</p>', webpage, 'description', fatal=False)
+ thumbnail = self._og_search_thumbnail(webpage) or xpath_text(
+ video, './/cover', 'cover')
+
+ duration = int_or_none(xpath_text(video, './/time', 'duration'))
+ age_limit = int_or_none(xpath_text(video, './/age18', 'age limit'))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'age_limit': age_limit,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/planetaplay.py b/youtube_dl/extractor/planetaplay.py
index 596c621d7..06505e96f 100644
--- a/youtube_dl/extractor/planetaplay.py
+++ b/youtube_dl/extractor/planetaplay.py
@@ -18,7 +18,8 @@ class PlanetaPlayIE(InfoExtractor):
'id': '3586',
'ext': 'flv',
'title': 'md5:e829428ee28b1deed00de90de49d1da1',
- }
+ },
+ 'skip': 'Not accessible from Travis CI server',
}
_SONG_FORMATS = {
diff --git a/youtube_dl/extractor/played.py b/youtube_dl/extractor/played.py
index 45716c75d..2856af96f 100644
--- a/youtube_dl/extractor/played.py
+++ b/youtube_dl/extractor/played.py
@@ -5,12 +5,10 @@ import re
import os.path
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
- compat_urllib_request,
-)
+from ..compat import compat_urllib_parse
from ..utils import (
ExtractorError,
+ sanitized_Request,
)
@@ -38,9 +36,7 @@ class PlayedIE(InfoExtractor):
if m_error:
raise ExtractorError(m_error.group('msg'), expected=True)
- fields = re.findall(
- r'type="hidden" name="([^"]+)"\s+value="([^"]+)">', orig_webpage)
- data = dict(fields)
+ data = self._hidden_inputs(orig_webpage)
self._sleep(2, video_id)
@@ -48,7 +44,7 @@ class PlayedIE(InfoExtractor):
headers = {
b'Content-Type': b'application/x-www-form-urlencoded',
}
- req = compat_urllib_request.Request(url, post, headers)
+ req = sanitized_Request(url, post, headers)
webpage = self._download_webpage(
req, video_id, note='Downloading video page ...')
diff --git a/youtube_dl/extractor/playfm.py b/youtube_dl/extractor/playfm.py
index 9576aed0e..e766ccca3 100644
--- a/youtube_dl/extractor/playfm.py
+++ b/youtube_dl/extractor/playfm.py
@@ -4,85 +4,72 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
- compat_urllib_request,
-)
+from ..compat import compat_str
from ..utils import (
ExtractorError,
- float_or_none,
int_or_none,
- str_to_int,
+ parse_iso8601,
)
class PlayFMIE(InfoExtractor):
IE_NAME = 'play.fm'
- _VALID_URL = r'https?://(?:www\.)?play\.fm/[^?#]*(?P<upload_date>[0-9]{8})(?P<id>[0-9]{6})(?:$|[?#])'
+ _VALID_URL = r'https?://(?:www\.)?play\.fm/(?P<slug>(?:[^/]+/)+(?P<id>[^/]+))/?(?:$|[?#])'
_TEST = {
- 'url': 'http://www.play.fm/recording/leipzigelectronicmusicbatofarparis_fr20140712137220',
+ 'url': 'https://www.play.fm/dan-drastic/sven-tasnadi-leipzig-electronic-music-batofar-paris-fr-2014-07-12',
'md5': 'c505f8307825a245d0c7ad1850001f22',
'info_dict': {
- 'id': '137220',
+ 'id': '71276',
'ext': 'mp3',
- 'title': 'LEIPZIG ELECTRONIC MUSIC @ Batofar (Paris,FR) - 2014-07-12',
- 'uploader': 'Sven Tasnadi',
- 'uploader_id': 'sventasnadi',
- 'duration': 5627.428,
- 'upload_date': '20140712',
+ 'title': 'Sven Tasnadi - LEIPZIG ELECTRONIC MUSIC @ Batofar (Paris,FR) - 2014-07-12',
+ 'description': '',
+ 'duration': 5627,
+ 'timestamp': 1406033781,
+ 'upload_date': '20140722',
+ 'uploader': 'Dan Drastic',
+ 'uploader_id': '71170',
'view_count': int,
'comment_count': int,
- 'thumbnail': 're:^https?://.*\.jpg$',
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
- upload_date = mobj.group('upload_date')
-
- rec_data = compat_urllib_parse.urlencode({'rec_id': video_id})
- req = compat_urllib_request.Request(
- 'http://www.play.fm/flexRead/recording', data=rec_data)
- req.add_header('Content-Type', 'application/x-www-form-urlencoded')
- rec_doc = self._download_xml(req, video_id)
+ slug = mobj.group('slug')
- error_node = rec_doc.find('./error')
- if error_node is not None:
- raise ExtractorError('An error occured: %s (code %s)' % (
- error_node.text, rec_doc.find('./status').text))
+ recordings = self._download_json(
+ 'http://v2api.play.fm/recordings/slug/%s' % slug, video_id)
- recording = rec_doc.find('./recording')
- title = recording.find('./title').text
- view_count = str_to_int(recording.find('./stats/playcount').text)
- comment_count = str_to_int(recording.find('./stats/comments').text)
- duration = float_or_none(recording.find('./duration').text, scale=1000)
- thumbnail = recording.find('./image').text
+ error = recordings.get('error')
+ if isinstance(error, dict):
+ raise ExtractorError(
+ '%s returned error: %s' % (self.IE_NAME, error.get('message')),
+ expected=True)
- artist = recording.find('./artists/artist')
- uploader = artist.find('./name').text
- uploader_id = artist.find('./slug').text
-
- video_url = '%s//%s/%s/%s/offset/0/sh/%s/rec/%s/jingle/%s/loc/%s' % (
- 'http:', recording.find('./url').text,
- recording.find('./_class').text, recording.find('./file_id').text,
- rec_doc.find('./uuid').text, video_id,
- rec_doc.find('./jingle/file_id').text,
- 'http%3A%2F%2Fwww.play.fm%2Fplayer',
- )
+ audio_url = recordings['audio']
+ video_id = compat_str(recordings.get('id') or video_id)
+ title = recordings['title']
+ description = recordings.get('description')
+ duration = int_or_none(recordings.get('recordingDuration'))
+ timestamp = parse_iso8601(recordings.get('created_at'))
+ uploader = recordings.get('page', {}).get('title')
+ uploader_id = compat_str(recordings.get('page', {}).get('id'))
+ view_count = int_or_none(recordings.get('playCount'))
+ comment_count = int_or_none(recordings.get('commentCount'))
+ categories = [tag['name'] for tag in recordings.get('tags', []) if tag.get('name')]
return {
'id': video_id,
- 'url': video_url,
- 'ext': 'mp3',
- 'filesize': int_or_none(recording.find('./size').text),
+ 'url': audio_url,
'title': title,
- 'upload_date': upload_date,
- 'view_count': view_count,
- 'comment_count': comment_count,
+ 'description': description,
'duration': duration,
- 'thumbnail': thumbnail,
+ 'timestamp': timestamp,
'uploader': uploader,
'uploader_id': uploader_id,
+ 'view_count': view_count,
+ 'comment_count': comment_count,
+ 'categories': categories,
}
diff --git a/youtube_dl/extractor/playtvak.py b/youtube_dl/extractor/playtvak.py
new file mode 100644
index 000000000..e360404f7
--- /dev/null
+++ b/youtube_dl/extractor/playtvak.py
@@ -0,0 +1,181 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_urlparse,
+ compat_urllib_parse,
+)
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ parse_iso8601,
+ qualities,
+)
+
+
+class PlaytvakIE(InfoExtractor):
+ IE_DESC = 'Playtvak.cz, iDNES.cz and Lidovky.cz'
+ _VALID_URL = r'https?://(?:.+?\.)?(?:playtvak|idnes|lidovky|metro)\.cz/.*\?(?:c|idvideo)=(?P<id>[^&]+)'
+ _TESTS = [{
+ 'url': 'http://www.playtvak.cz/vyzente-vosy-a-srsne-ze-zahrady-dn5-/hodinovy-manzel.aspx?c=A150730_150323_hodinovy-manzel_kuko',
+ 'md5': '4525ae312c324b4be2f4603cc78ceb4a',
+ 'info_dict': {
+ 'id': 'A150730_150323_hodinovy-manzel_kuko',
+ 'ext': 'mp4',
+ 'title': 'Vyžeňte vosy a sršně ze zahrady',
+ 'description': 'md5:f93d398691044d303bc4a3de62f3e976',
+ 'thumbnail': 're:(?i)^https?://.*\.(?:jpg|png)$',
+ 'duration': 279,
+ 'timestamp': 1438732860,
+ 'upload_date': '20150805',
+ 'is_live': False,
+ }
+ }, { # live video test
+ 'url': 'http://slowtv.playtvak.cz/planespotting-0pr-/planespotting.aspx?c=A150624_164934_planespotting_cat',
+ 'info_dict': {
+ 'id': 'A150624_164934_planespotting_cat',
+ 'ext': 'flv',
+ 'title': 're:^Přímý přenos iDNES.cz [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'description': 'Sledujte provoz na ranveji Letiště Václava Havla v Praze',
+ 'thumbnail': 're:(?i)^https?://.*\.(?:jpg|png)$',
+ 'is_live': True,
+ },
+ 'params': {
+ 'skip_download': True, # requires rtmpdump
+ },
+ }, { # idnes.cz
+ 'url': 'http://zpravy.idnes.cz/pes-zavreny-v-aute-rozbijeni-okynek-v-aute-fj5-/domaci.aspx?c=A150809_104116_domaci_pku',
+ 'md5': '819832ba33cd7016e58a6658577fe289',
+ 'info_dict': {
+ 'id': 'A150809_104116_domaci_pku',
+ 'ext': 'mp4',
+ 'title': 'Zavřeli jsme mraženou pizzu do auta. Upekla se',
+ 'description': 'md5:01e73f02329e2e5760bd5eed4d42e3c2',
+ 'thumbnail': 're:(?i)^https?://.*\.(?:jpg|png)$',
+ 'duration': 39,
+ 'timestamp': 1438969140,
+ 'upload_date': '20150807',
+ 'is_live': False,
+ }
+ }, { # lidovky.cz
+ 'url': 'http://www.lidovky.cz/dalsi-demonstrace-v-praze-o-migraci-duq-/video.aspx?c=A150808_214044_ln-video_ELE',
+ 'md5': 'c7209ac4ba9d234d4ad5bab7485bcee8',
+ 'info_dict': {
+ 'id': 'A150808_214044_ln-video_ELE',
+ 'ext': 'mp4',
+ 'title': 'Táhni! Demonstrace proti imigrantům budila emoce',
+ 'description': 'md5:97c81d589a9491fbfa323c9fa3cca72c',
+ 'thumbnail': 're:(?i)^https?://.*\.(?:jpg|png)$',
+ 'timestamp': 1439052180,
+ 'upload_date': '20150808',
+ 'is_live': False,
+ }
+ }, { # metro.cz
+ 'url': 'http://www.metro.cz/video-pod-billboardem-se-na-vltavske-roztocil-kolotoc-deti-vozil-jen-par-hodin-1hx-/metro-extra.aspx?c=A141111_173251_metro-extra_row',
+ 'md5': '84fc1deedcac37b7d4a6ccae7c716668',
+ 'info_dict': {
+ 'id': 'A141111_173251_metro-extra_row',
+ 'ext': 'mp4',
+ 'title': 'Recesisté udělali z billboardu kolotoč',
+ 'description': 'md5:7369926049588c3989a66c9c1a043c4c',
+ 'thumbnail': 're:(?i)^https?://.*\.(?:jpg|png)$',
+ 'timestamp': 1415725500,
+ 'upload_date': '20141111',
+ 'is_live': False,
+ }
+ }, {
+ 'url': 'http://www.playtvak.cz/embed.aspx?idvideo=V150729_141549_play-porad_kuko',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ info_url = self._html_search_regex(
+ r'Misc\.videoFLV\(\s*{\s*data\s*:\s*"([^"]+)"', webpage, 'info url')
+
+ parsed_url = compat_urlparse.urlparse(info_url)
+
+ qs = compat_urlparse.parse_qs(parsed_url.query)
+ qs.update({
+ 'reklama': ['0'],
+ 'type': ['js'],
+ })
+
+ info_url = compat_urlparse.urlunparse(
+ parsed_url._replace(query=compat_urllib_parse.urlencode(qs, True)))
+
+ json_info = self._download_json(
+ info_url, video_id,
+ transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1])
+
+ item = None
+ for i in json_info['items']:
+ if i.get('type') == 'video' or i.get('type') == 'stream':
+ item = i
+ break
+ if not item:
+ raise ExtractorError('No suitable stream found')
+
+ quality = qualities(('low', 'middle', 'high'))
+
+ formats = []
+ for fmt in item['video']:
+ video_url = fmt.get('file')
+ if not video_url:
+ continue
+
+ format_ = fmt['format']
+ format_id = '%s_%s' % (format_, fmt['quality'])
+ preference = None
+
+ if format_ in ('mp4', 'webm'):
+ ext = format_
+ elif format_ == 'rtmp':
+ ext = 'flv'
+ elif format_ == 'apple':
+ ext = 'mp4'
+ # Some streams have mp3 audio which does not play
+ # well with ffmpeg filter aac_adtstoasc
+ preference = -1
+ elif format_ == 'adobe': # f4m manifest fails with 404 in 80% of requests
+ continue
+ else: # Other formats not supported yet
+ continue
+
+ formats.append({
+ 'url': video_url,
+ 'ext': ext,
+ 'format_id': format_id,
+ 'quality': quality(fmt.get('quality')),
+ 'preference': preference,
+ })
+ self._sort_formats(formats)
+
+ title = item['title']
+ is_live = item['type'] == 'stream'
+ if is_live:
+ title = self._live_title(title)
+ description = self._og_search_description(webpage, default=None) or self._html_search_meta(
+ 'description', webpage, 'description')
+ timestamp = None
+ duration = None
+ if not is_live:
+ duration = int_or_none(item.get('length'))
+ timestamp = item.get('published')
+ if timestamp:
+ timestamp = parse_iso8601(timestamp[:-5])
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': item.get('image'),
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'is_live': is_live,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/playvid.py b/youtube_dl/extractor/playvid.py
index c3e667e9e..2eb4fd96d 100644
--- a/youtube_dl/extractor/playvid.py
+++ b/youtube_dl/extractor/playvid.py
@@ -4,7 +4,8 @@ import re
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse,
+ compat_urllib_parse_unquote,
+ compat_urllib_parse_unquote_plus,
)
from ..utils import (
clean_html,
@@ -44,7 +45,7 @@ class PlayvidIE(InfoExtractor):
flashvars = self._html_search_regex(
r'flashvars="(.+?)"', webpage, 'flashvars')
- infos = compat_urllib_parse.unquote(flashvars).split(r'&')
+ infos = compat_urllib_parse_unquote(flashvars).split(r'&')
for info in infos:
videovars_match = re.match(r'^video_vars\[(.+?)\]=(.+?)$', info)
if videovars_match:
@@ -52,7 +53,7 @@ class PlayvidIE(InfoExtractor):
val = videovars_match.group(2)
if key == 'title':
- video_title = compat_urllib_parse.unquote_plus(val)
+ video_title = compat_urllib_parse_unquote_plus(val)
if key == 'duration':
try:
duration = int(val)
diff --git a/youtube_dl/extractor/playwire.py b/youtube_dl/extractor/playwire.py
new file mode 100644
index 000000000..6d138ef25
--- /dev/null
+++ b/youtube_dl/extractor/playwire.py
@@ -0,0 +1,78 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ xpath_text,
+ float_or_none,
+ int_or_none,
+)
+
+
+class PlaywireIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:config|cdn)\.playwire\.com(?:/v2)?/(?P<publisher_id>\d+)/(?:videos/v2|embed|config)/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'http://config.playwire.com/14907/videos/v2/3353705/player.json',
+ 'md5': 'e6398701e3595888125729eaa2329ed9',
+ 'info_dict': {
+ 'id': '3353705',
+ 'ext': 'mp4',
+ 'title': 'S04_RM_UCL_Rus',
+ 'thumbnail': 're:^https?://.*\.png$',
+ 'duration': 145.94,
+ },
+ }, {
+ 'url': 'http://cdn.playwire.com/11625/embed/85228.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://config.playwire.com/12421/videos/v2/3389892/zeus.json',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://cdn.playwire.com/v2/12342/config/1532636.json',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ publisher_id, video_id = mobj.group('publisher_id'), mobj.group('id')
+
+ player = self._download_json(
+ 'http://config.playwire.com/%s/videos/v2/%s/zeus.json' % (publisher_id, video_id),
+ video_id)
+
+ title = player['settings']['title']
+ duration = float_or_none(player.get('duration'), 1000)
+
+ content = player['content']
+ thumbnail = content.get('poster')
+ src = content['media']['f4m']
+
+ f4m = self._download_xml(src, video_id)
+ base_url = xpath_text(f4m, './{http://ns.adobe.com/f4m/1.0}baseURL', 'base url', fatal=True)
+ formats = []
+ for media in f4m.findall('./{http://ns.adobe.com/f4m/1.0}media'):
+ media_url = media.get('url')
+ if not media_url:
+ continue
+ tbr = int_or_none(media.get('bitrate'))
+ width = int_or_none(media.get('width'))
+ height = int_or_none(media.get('height'))
+ f = {
+ 'url': '%s/%s' % (base_url, media.attrib['url']),
+ 'tbr': tbr,
+ 'width': width,
+ 'height': height,
+ }
+ if not (tbr or width or height):
+ f['quality'] = 1 if '-hd.' in media_url else 0
+ formats.append(f)
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py
new file mode 100644
index 000000000..aa7dbcb63
--- /dev/null
+++ b/youtube_dl/extractor/pluralsight.py
@@ -0,0 +1,275 @@
+from __future__ import unicode_literals
+
+import json
+import random
+import collections
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_str,
+ compat_urllib_parse,
+ compat_urlparse,
+)
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ parse_duration,
+ sanitized_Request,
+)
+
+
+class PluralsightBaseIE(InfoExtractor):
+ _API_BASE = 'http://app.pluralsight.com'
+
+
+class PluralsightIE(PluralsightBaseIE):
+ IE_NAME = 'pluralsight'
+ _VALID_URL = r'https?://(?:(?:www|app)\.)?pluralsight\.com/training/player\?'
+ _LOGIN_URL = 'https://app.pluralsight.com/id/'
+
+ _NETRC_MACHINE = 'pluralsight'
+
+ _TESTS = [{
+ 'url': 'http://www.pluralsight.com/training/player?author=mike-mckeown&name=hosting-sql-server-windows-azure-iaas-m7-mgmt&mode=live&clip=3&course=hosting-sql-server-windows-azure-iaas',
+ 'md5': '4d458cf5cf4c593788672419a8dd4cf8',
+ 'info_dict': {
+ 'id': 'hosting-sql-server-windows-azure-iaas-m7-mgmt-04',
+ 'ext': 'mp4',
+ 'title': 'Management of SQL Server - Demo Monitoring',
+ 'duration': 338,
+ },
+ 'skip': 'Requires pluralsight account credentials',
+ }, {
+ 'url': 'https://app.pluralsight.com/training/player?course=angularjs-get-started&author=scott-allen&name=angularjs-get-started-m1-introduction&clip=0&mode=live',
+ 'only_matching': True,
+ }, {
+ # available without pluralsight account
+ 'url': 'http://app.pluralsight.com/training/player?author=scott-allen&name=angularjs-get-started-m1-introduction&mode=live&clip=0&course=angularjs-get-started',
+ 'only_matching': True,
+ }]
+
+ def _real_initialize(self):
+ self._login()
+
+ def _login(self):
+ (username, password) = self._get_login_info()
+ if username is None:
+ return
+
+ login_page = self._download_webpage(
+ self._LOGIN_URL, None, 'Downloading login page')
+
+ login_form = self._hidden_inputs(login_page)
+
+ login_form.update({
+ 'Username': username.encode('utf-8'),
+ 'Password': password.encode('utf-8'),
+ })
+
+ post_url = self._search_regex(
+ r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
+ 'post url', default=self._LOGIN_URL, group='url')
+
+ if not post_url.startswith('http'):
+ post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url)
+
+ request = sanitized_Request(
+ post_url, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
+ request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+
+ response = self._download_webpage(
+ request, None, 'Logging in as %s' % username)
+
+ error = self._search_regex(
+ r'<span[^>]+class="field-validation-error"[^>]*>([^<]+)</span>',
+ response, 'error message', default=None)
+ if error:
+ raise ExtractorError('Unable to login: %s' % error, expected=True)
+
+ if all(p not in response for p in ('__INITIAL_STATE__', '"currentUser"')):
+ raise ExtractorError('Unable to log in')
+
+ def _real_extract(self, url):
+ qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+
+ author = qs.get('author', [None])[0]
+ name = qs.get('name', [None])[0]
+ clip_id = qs.get('clip', [None])[0]
+ course = qs.get('course', [None])[0]
+
+ if any(not f for f in (author, name, clip_id, course,)):
+ raise ExtractorError('Invalid URL', expected=True)
+
+ display_id = '%s-%s' % (name, clip_id)
+
+ webpage = self._download_webpage(url, display_id)
+
+ modules = self._search_regex(
+ r'moduleCollection\s*:\s*new\s+ModuleCollection\((\[.+?\])\s*,\s*\$rootScope\)',
+ webpage, 'modules', default=None)
+
+ if modules:
+ collection = self._parse_json(modules, display_id)
+ else:
+ # Webpage may be served in different layout (see
+ # https://github.com/rg3/youtube-dl/issues/7607)
+ collection = self._parse_json(
+ self._search_regex(
+ r'var\s+initialState\s*=\s*({.+?});\n', webpage, 'initial state'),
+ display_id)['course']['modules']
+
+ module, clip = None, None
+
+ for module_ in collection:
+ if name in (module_.get('moduleName'), module_.get('name')):
+ module = module_
+ for clip_ in module_.get('clips', []):
+ clip_index = clip_.get('clipIndex')
+ if clip_index is None:
+ clip_index = clip_.get('index')
+ if clip_index is None:
+ continue
+ if compat_str(clip_index) == clip_id:
+ clip = clip_
+ break
+
+ if not clip:
+ raise ExtractorError('Unable to resolve clip')
+
+ QUALITIES = {
+ 'low': {'width': 640, 'height': 480},
+ 'medium': {'width': 848, 'height': 640},
+ 'high': {'width': 1024, 'height': 768},
+ }
+
+ AllowedQuality = collections.namedtuple('AllowedQuality', ['ext', 'qualities'])
+
+ ALLOWED_QUALITIES = (
+ AllowedQuality('webm', ('high',)),
+ AllowedQuality('mp4', ('low', 'medium', 'high',)),
+ )
+
+ # In order to minimize the number of calls to ViewClip API and reduce
+ # the probability of being throttled or banned by Pluralsight we will request
+ # only single format until formats listing was explicitly requested.
+ if self._downloader.params.get('listformats', False):
+ allowed_qualities = ALLOWED_QUALITIES
+ else:
+ def guess_allowed_qualities():
+ req_format = self._downloader.params.get('format') or 'best'
+ req_format_split = req_format.split('-')
+ if len(req_format_split) > 1:
+ req_ext, req_quality = req_format_split
+ for allowed_quality in ALLOWED_QUALITIES:
+ if req_ext == allowed_quality.ext and req_quality in allowed_quality.qualities:
+ return (AllowedQuality(req_ext, (req_quality, )), )
+ req_ext = 'webm' if self._downloader.params.get('prefer_free_formats') else 'mp4'
+ return (AllowedQuality(req_ext, ('high', )), )
+ allowed_qualities = guess_allowed_qualities()
+
+ formats = []
+ for ext, qualities in allowed_qualities:
+ for quality in qualities:
+ f = QUALITIES[quality].copy()
+ clip_post = {
+ 'a': author,
+ 'cap': 'false',
+ 'cn': clip_id,
+ 'course': course,
+ 'lc': 'en',
+ 'm': name,
+ 'mt': ext,
+ 'q': '%dx%d' % (f['width'], f['height']),
+ }
+ request = sanitized_Request(
+ '%s/training/Player/ViewClip' % self._API_BASE,
+ json.dumps(clip_post).encode('utf-8'))
+ request.add_header('Content-Type', 'application/json;charset=utf-8')
+ format_id = '%s-%s' % (ext, quality)
+ clip_url = self._download_webpage(
+ request, display_id, 'Downloading %s URL' % format_id, fatal=False)
+
+ # Pluralsight tracks multiple sequential calls to ViewClip API and start
+ # to return 429 HTTP errors after some time (see
+ # https://github.com/rg3/youtube-dl/pull/6989). Moreover it may even lead
+ # to account ban (see https://github.com/rg3/youtube-dl/issues/6842).
+ # To somewhat reduce the probability of these consequences
+ # we will sleep random amount of time before each call to ViewClip.
+ self._sleep(
+ random.randint(2, 5), display_id,
+ '%(video_id)s: Waiting for %(timeout)s seconds to avoid throttling')
+
+ if not clip_url:
+ continue
+ f.update({
+ 'url': clip_url,
+ 'ext': ext,
+ 'format_id': format_id,
+ })
+ formats.append(f)
+ self._sort_formats(formats)
+
+ # TODO: captions
+ # http://www.pluralsight.com/training/Player/ViewClip + cap = true
+ # or
+ # http://www.pluralsight.com/training/Player/Captions
+ # { a = author, cn = clip_id, lc = end, m = name }
+
+ return {
+ 'id': clip['clipName'],
+ 'title': '%s - %s' % (module['title'], clip['title']),
+ 'duration': int_or_none(clip.get('duration')) or parse_duration(clip.get('formattedDuration')),
+ 'creator': author,
+ 'formats': formats
+ }
+
+
+class PluralsightCourseIE(PluralsightBaseIE):
+ IE_NAME = 'pluralsight:course'
+ _VALID_URL = r'https?://(?:(?:www|app)\.)?pluralsight\.com/(?:library/)?courses/(?P<id>[^/]+)'
+ _TESTS = [{
+ # Free course from Pluralsight Starter Subscription for Microsoft TechNet
+ # https://offers.pluralsight.com/technet?loc=zTS3z&prod=zOTprodz&tech=zOttechz&prog=zOTprogz&type=zSOz&media=zOTmediaz&country=zUSz
+ 'url': 'http://www.pluralsight.com/courses/hosting-sql-server-windows-azure-iaas',
+ 'info_dict': {
+ 'id': 'hosting-sql-server-windows-azure-iaas',
+ 'title': 'Hosting SQL Server in Microsoft Azure IaaS Fundamentals',
+ 'description': 'md5:61b37e60f21c4b2f91dc621a977d0986',
+ },
+ 'playlist_count': 31,
+ }, {
+ # available without pluralsight account
+ 'url': 'https://www.pluralsight.com/courses/angularjs-get-started',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://app.pluralsight.com/library/courses/understanding-microsoft-azure-amazon-aws/table-of-contents',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ course_id = self._match_id(url)
+
+ # TODO: PSM cookie
+
+ course = self._download_json(
+ '%s/data/course/%s' % (self._API_BASE, course_id),
+ course_id, 'Downloading course JSON')
+
+ title = course['title']
+ description = course.get('description') or course.get('shortDescription')
+
+ course_data = self._download_json(
+ '%s/data/course/content/%s' % (self._API_BASE, course_id),
+ course_id, 'Downloading course data JSON')
+
+ entries = []
+ for module in course_data:
+ for clip in module.get('clips', []):
+ player_parameters = clip.get('playerParameters')
+ if not player_parameters:
+ continue
+ entries.append(self.url_result(
+ '%s/training/player?%s' % (self._API_BASE, player_parameters),
+ 'Pluralsight'))
+
+ return self.playlist_result(entries, course_id, title, description)
diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py
new file mode 100644
index 000000000..3e15533e9
--- /dev/null
+++ b/youtube_dl/extractor/porn91.py
@@ -0,0 +1,73 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+from ..compat import compat_urllib_parse
+from .common import InfoExtractor
+from ..utils import (
+ parse_duration,
+ int_or_none,
+ ExtractorError,
+)
+
+
+class Porn91IE(InfoExtractor):
+ IE_NAME = '91porn'
+ _VALID_URL = r'(?:https?://)(?:www\.|)91porn\.com/.+?\?viewkey=(?P<id>[\w\d]+)'
+
+ _TEST = {
+ 'url': 'http://91porn.com/view_video.php?viewkey=7e42283b4f5ab36da134',
+ 'md5': '6df8f6d028bc8b14f5dbd73af742fb20',
+ 'info_dict': {
+ 'id': '7e42283b4f5ab36da134',
+ 'title': '18岁大一漂亮学妹,水嫩性感,再爽一次!',
+ 'ext': 'mp4',
+ 'duration': 431,
+ 'age_limit': 18,
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ url = 'http://91porn.com/view_video.php?viewkey=%s' % video_id
+ self._set_cookie('91porn.com', 'language', 'cn_CN')
+ webpage = self._download_webpage(url, video_id, 'get HTML content')
+
+ if '作为游客,你每天只可观看10个视频' in webpage:
+ raise ExtractorError('91 Porn says: Daily limit 10 videos exceeded', expected=True)
+
+ title = self._search_regex(
+ r'<div id="viewvideo-title">([^<]+)</div>', webpage, 'title')
+ title = title.replace('\n', '')
+
+ # get real url
+ file_id = self._search_regex(
+ r'so.addVariable\(\'file\',\'(\d+)\'', webpage, 'file id')
+ sec_code = self._search_regex(
+ r'so.addVariable\(\'seccode\',\'([^\']+)\'', webpage, 'sec code')
+ max_vid = self._search_regex(
+ r'so.addVariable\(\'max_vid\',\'(\d+)\'', webpage, 'max vid')
+ url_params = compat_urllib_parse.urlencode({
+ 'VID': file_id,
+ 'mp4': '1',
+ 'seccode': sec_code,
+ 'max_vid': max_vid,
+ })
+ info_cn = self._download_webpage(
+ 'http://91porn.com/getfile.php?' + url_params, video_id,
+ 'get real video url')
+ video_url = self._search_regex(r'file=([^&]+)&', info_cn, 'url')
+
+ duration = parse_duration(self._search_regex(
+ r'时长:\s*</span>\s*(\d+:\d+)', webpage, 'duration', fatal=False))
+
+ comment_count = int_or_none(self._search_regex(
+ r'留言:\s*</span>\s*(\d+)', webpage, 'comment count', fatal=False))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'url': video_url,
+ 'duration': duration,
+ 'comment_count': comment_count,
+ 'age_limit': self._rta_search(webpage),
+ }
diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py
index dbb2c3bd9..57c78ba52 100644
--- a/youtube_dl/extractor/pornhd.py
+++ b/youtube_dl/extractor/pornhd.py
@@ -36,7 +36,8 @@ class PornHdIE(InfoExtractor):
webpage = self._download_webpage(url, display_id or video_id)
title = self._html_search_regex(
- r'<title>(.+) porn HD.+?</title>', webpage, 'title')
+ [r'<span[^>]+class=["\']video-name["\'][^>]*>([^<]+)',
+ r'<title>(.+?) - .*?[Pp]ornHD.*?</title>'], webpage, 'title')
description = self._html_search_regex(
r'<div class="description">([^<]+)</div>', webpage, 'description', fatal=False)
view_count = int_or_none(self._html_search_regex(
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index fb2032832..965940a4b 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -5,12 +5,13 @@ import re
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse,
+ compat_urllib_parse_unquote,
+ compat_urllib_parse_unquote_plus,
compat_urllib_parse_urlparse,
- compat_urllib_request,
)
from ..utils import (
ExtractorError,
+ sanitized_Request,
str_to_int,
)
from ..aes import (
@@ -19,8 +20,8 @@ from ..aes import (
class PornHubIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?pornhub\.com/view_video\.php\?viewkey=(?P<id>[0-9a-f]+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P<id>[0-9a-z]+)'
+ _TESTS = [{
'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
'md5': '882f488fa1f0026f023f33576004a2ed',
'info_dict': {
@@ -30,18 +31,30 @@ class PornHubIE(InfoExtractor):
"title": "Seductive Indian beauty strips down and fingers her pink pussy",
"age_limit": 18
}
- }
+ }, {
+ 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862',
+ 'only_matching': True,
+ }]
+
+ @classmethod
+ def _extract_url(cls, webpage):
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/\d+)\1', webpage)
+ if mobj:
+ return mobj.group('url')
def _extract_count(self, pattern, webpage, name):
- count = self._html_search_regex(pattern, webpage, '%s count' % name, fatal=False)
- if count:
- count = str_to_int(count)
- return count
+ return str_to_int(self._search_regex(
+ pattern, webpage, '%s count' % name, fatal=False))
def _real_extract(self, url):
video_id = self._match_id(url)
- req = compat_urllib_request.Request(url)
+ req = sanitized_Request(
+ 'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id)
req.add_header('Cookie', 'age_verified=1')
webpage = self._download_webpage(req, video_id)
@@ -56,21 +69,25 @@ class PornHubIE(InfoExtractor):
video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title')
video_uploader = self._html_search_regex(
- r'(?s)From:&nbsp;.+?<(?:a href="/users/|a href="/channels/|<span class="username)[^>]+>(.+?)<',
+ r'(?s)From:&nbsp;.+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<',
webpage, 'uploader', fatal=False)
thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, 'thumbnail', fatal=False)
if thumbnail:
- thumbnail = compat_urllib_parse.unquote(thumbnail)
+ thumbnail = compat_urllib_parse_unquote(thumbnail)
- view_count = self._extract_count(r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
- like_count = self._extract_count(r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
- dislike_count = self._extract_count(r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
+ view_count = self._extract_count(
+ r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
+ like_count = self._extract_count(
+ r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
+ dislike_count = self._extract_count(
+ r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
comment_count = self._extract_count(
- r'All comments \(<var class="videoCommentCount">([\d,\.]+)</var>', webpage, 'comment')
+ r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
- video_urls = list(map(compat_urllib_parse.unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)))
+ video_urls = list(map(compat_urllib_parse_unquote, re.findall(r"player_quality_[0-9]{3}p\s*=\s*'([^']+)'", webpage)))
if webpage.find('"encrypted":true') != -1:
- password = compat_urllib_parse.unquote_plus(self._html_search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
+ password = compat_urllib_parse_unquote_plus(
+ self._search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls))
formats = []
@@ -80,7 +97,7 @@ class PornHubIE(InfoExtractor):
format = path.split('/')[5].split('_')[:2]
format = "-".join(format)
- m = re.match(r'^(?P<height>[0-9]+)P-(?P<tbr>[0-9]+)K$', format)
+ m = re.match(r'^(?P<height>[0-9]+)[pP]-(?P<tbr>[0-9]+)[kK]$', format)
if m is None:
height = None
tbr = None
@@ -110,3 +127,33 @@ class PornHubIE(InfoExtractor):
'formats': formats,
'age_limit': 18,
}
+
+
+class PornHubPlaylistIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'http://www.pornhub.com/playlist/6201671',
+ 'info_dict': {
+ 'id': '6201671',
+ 'title': 'P0p4',
+ },
+ 'playlist_mincount': 35,
+ }]
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, playlist_id)
+
+ entries = [
+ self.url_result('http://www.pornhub.com/%s' % video_url, 'PornHub')
+ for video_url in set(re.findall('href="/?(view_video\.php\?viewkey=\d+[^"]*)"', webpage))
+ ]
+
+ playlist = self._parse_json(
+ self._search_regex(
+ r'playlistObject\s*=\s*({.+?});', webpage, 'playlist'),
+ playlist_id)
+
+ return self.playlist_result(
+ entries, playlist_id, playlist.get('title'), playlist.get('description'))
diff --git a/youtube_dl/extractor/pornotube.py b/youtube_dl/extractor/pornotube.py
index 34735c51e..5398e708b 100644
--- a/youtube_dl/extractor/pornotube.py
+++ b/youtube_dl/extractor/pornotube.py
@@ -3,11 +3,9 @@ from __future__ import unicode_literals
import json
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_request,
-)
from ..utils import (
int_or_none,
+ sanitized_Request,
)
@@ -46,7 +44,7 @@ class PornotubeIE(InfoExtractor):
'authenticationSpaceKey': originAuthenticationSpaceKey,
'credentials': 'Clip Application',
}
- token_req = compat_urllib_request.Request(
+ token_req = sanitized_Request(
'https://api.aebn.net/auth/v1/token/primal',
data=json.dumps(token_req_data).encode('utf-8'))
token_req.add_header('Content-Type', 'application/json')
@@ -56,7 +54,7 @@ class PornotubeIE(InfoExtractor):
token = token_answer['tokenKey']
# Get video URL
- delivery_req = compat_urllib_request.Request(
+ delivery_req = sanitized_Request(
'https://api.aebn.net/delivery/v1/clips/%s/MP4' % video_id)
delivery_req.add_header('Authorization', token)
delivery_info = self._download_json(
@@ -64,7 +62,7 @@ class PornotubeIE(InfoExtractor):
video_url = delivery_info['mediaUrl']
# Get additional info (title etc.)
- info_req = compat_urllib_request.Request(
+ info_req = sanitized_Request(
'https://api.aebn.net/content/v1/clips/%s?expand='
'title,description,primaryImageNumber,startSecond,endSecond,'
'movie.title,movie.MovieId,movie.boxCoverFront,movie.stars,'
diff --git a/youtube_dl/extractor/pornovoisines.py b/youtube_dl/extractor/pornovoisines.py
new file mode 100644
index 000000000..eba4dfbb3
--- /dev/null
+++ b/youtube_dl/extractor/pornovoisines.py
@@ -0,0 +1,96 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import random
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ float_or_none,
+ unified_strdate,
+)
+
+
+class PornoVoisinesIE(InfoExtractor):
+    # Extractor for pornovoisines.com "showvideo" pages.
+    _VALID_URL = r'http://(?:www\.)?pornovoisines\.com/showvideo/(?P<id>\d+)/(?P<display_id>[^/]+)'
+
+    # Filled with (server number, video id) by build_video_url().
+    _VIDEO_URL_TEMPLATE = 'http://stream%d.pornovoisines.com' \
+        '/static/media/video/transcoded/%s-640x360-1000-trscded.mp4'
+
+    _SERVER_NUMBERS = (1, 2)
+
+    _TEST = {
+        'url': 'http://www.pornovoisines.com/showvideo/1285/recherche-appartement/',
+        'md5': '5ac670803bc12e9e7f9f662ce64cf1d1',
+        'info_dict': {
+            'id': '1285',
+            'display_id': 'recherche-appartement',
+            'ext': 'mp4',
+            'title': 'Recherche appartement',
+            'description': 'md5:819ea0b785e2a04667a1a01cdc89594e',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'upload_date': '20140925',
+            'duration': 120,
+            'view_count': int,
+            'average_rating': float,
+            'categories': ['Débutantes', 'Scénario', 'Sodomie'],
+            'age_limit': 18,
+        }
+    }
+
+    @classmethod
+    def build_video_url(cls, num):
+        """Return the media URL for video ``num`` on a randomly chosen stream server."""
+        return cls._VIDEO_URL_TEMPLATE % (random.choice(cls._SERVER_NUMBERS), num)
+
+    def _real_extract(self, url):
+        """Build the info dict for the video page at ``url``.
+
+        The media URL is derived from the numeric id alone; the page is only
+        scraped for metadata. Every field except the title is optional.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_url = self.build_video_url(video_id)
+
+        title = self._html_search_regex(
+            r'<h1>(.+?)</h1>', webpage, 'title', flags=re.DOTALL)
+        description = self._html_search_regex(
+            r'<article id="descriptif">(.+?)</article>',
+            webpage, "description", fatal=False, flags=re.DOTALL)
+
+        thumbnail = self._search_regex(
+            r'<div id="mediaspace%s">\s*<img src="/?([^"]+)"' % video_id,
+            webpage, 'thumbnail', fatal=False)
+        if thumbnail:
+            # The page gives a site-relative path; make it absolute.
+            thumbnail = 'http://www.pornovoisines.com/%s' % thumbnail
+
+        upload_date = unified_strdate(self._search_regex(
+            r'Publié le ([\d-]+)', webpage, 'upload date', fatal=False))
+        # Raw literal, like the sibling patterns: '\d' in a plain string is an
+        # invalid escape sequence.
+        duration = int_or_none(self._search_regex(
+            r'Durée (\d+)', webpage, 'duration', fatal=False))
+        view_count = int_or_none(self._search_regex(
+            r'(\d+) vues', webpage, 'view count', fatal=False))
+        average_rating = self._search_regex(
+            r'Note\s*:\s*(\d+(?:,\d+)?)', webpage, 'average rating', fatal=False)
+        if average_rating:
+            # The site uses a decimal comma.
+            average_rating = float_or_none(average_rating.replace(',', '.'))
+
+        categories = self._html_search_meta(
+            'keywords', webpage, 'categories', fatal=False)
+        if categories:
+            categories = [category.strip() for category in categories.split(',')]
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'url': video_url,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+            'duration': duration,
+            'view_count': view_count,
+            'average_rating': average_rating,
+            'categories': categories,
+            'age_limit': 18,
+        }
diff --git a/youtube_dl/extractor/primesharetv.py b/youtube_dl/extractor/primesharetv.py
new file mode 100644
index 000000000..85aae9576
--- /dev/null
+++ b/youtube_dl/extractor/primesharetv.py
@@ -0,0 +1,62 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse
+from ..utils import (
+ ExtractorError,
+ sanitized_Request,
+)
+
+
+class PrimeShareTVIE(InfoExtractor):
+    # Extractor for primeshare.tv download pages.
+    _VALID_URL = r'https?://(?:www\.)?primeshare\.tv/download/(?P<id>[\da-zA-Z]+)'
+
+    _TEST = {
+        'url': 'http://primeshare.tv/download/238790B611',
+        'md5': 'b92d9bf5461137c36228009f31533fbc',
+        'info_dict': {
+            'id': '238790B611',
+            'ext': 'mp4',
+            'title': 'Public Domain - 1960s Commercial - Crest Toothpaste-YKsuFona',
+        },
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict for the download page at ``url``.
+
+        The landing page carries a hidden form and a countdown. After waiting
+        out the countdown, the form is POSTed back to the same URL and the
+        response is scraped for the media URL and title.
+
+        Raises ExtractorError (expected) when the page reports the file as
+        missing.
+        """
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        if '>File not exist<' in webpage:
+            raise ExtractorError('Video %s does not exist' % video_id, expected=True)
+
+        fields = self._hidden_inputs(webpage)
+
+        headers = {
+            'Referer': url,
+            'Content-Type': 'application/x-www-form-urlencoded',
+        }
+
+        # Wait one second longer than the page's countdown (7 when not found).
+        wait_time = int(self._search_regex(
+            r'var\s+cWaitTime\s*=\s*(\d+)',
+            webpage, 'wait time', default=7)) + 1
+        self._sleep(wait_time, video_id)
+
+        # Request bodies must be bytes on Python 3. The urlencoded form is
+        # pure ASCII, so encoding it is lossless on both Python 2 and 3.
+        post_data = compat_urllib_parse.urlencode(fields).encode('ascii')
+        req = sanitized_Request(url, post_data, headers)
+        video_page = self._download_webpage(
+            req, video_id, 'Downloading video page')
+
+        video_url = self._search_regex(
+            r"url\s*:\s*'([^']+\.primeshare\.tv(?::443)?/file/[^']+)'",
+            video_page, 'video url')
+
+        title = self._html_search_regex(
+            r'<h1>Watch\s*(?:&nbsp;)?\s*\((.+?)(?:\s*\[\.\.\.\])?\)\s*(?:&nbsp;)?\s*<strong>',
+            video_page, 'title')
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'ext': 'mp4',
+        }
diff --git a/youtube_dl/extractor/promptfile.py b/youtube_dl/extractor/promptfile.py
index f536e6e6c..d5357283a 100644
--- a/youtube_dl/extractor/promptfile.py
+++ b/youtube_dl/extractor/promptfile.py
@@ -4,13 +4,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
- compat_urllib_request,
-)
+from ..compat import compat_urllib_parse
from ..utils import (
determine_ext,
ExtractorError,
+ sanitized_Request,
)
@@ -35,12 +33,9 @@ class PromptFileIE(InfoExtractor):
raise ExtractorError('Video %s does not exist' % video_id,
expected=True)
- fields = dict(re.findall(r'''(?x)type="hidden"\s+
- name="(.+?)"\s+
- value="(.*?)"
- ''', webpage))
+ fields = self._hidden_inputs(webpage)
post = compat_urllib_parse.urlencode(fields)
- req = compat_urllib_request.Request(url, post)
+ req = sanitized_Request(url, post)
req.add_header('Content-type', 'application/x-www-form-urlencoded')
webpage = self._download_webpage(
req, video_id, 'Downloading video page')
diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py
index 385681d06..baa54a3af 100644
--- a/youtube_dl/extractor/prosiebensat1.py
+++ b/youtube_dl/extractor/prosiebensat1.py
@@ -9,6 +9,10 @@ from ..compat import (
compat_urllib_parse,
)
from ..utils import (
+ ExtractorError,
+ determine_ext,
+ float_or_none,
+ int_or_none,
unified_strdate,
)
@@ -16,15 +20,20 @@ from ..utils import (
class ProSiebenSat1IE(InfoExtractor):
IE_NAME = 'prosiebensat1'
IE_DESC = 'ProSiebenSat.1 Digital'
- _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|ran|the-voice-of-germany)\.de|fem\.com)/(?P<id>.+)'
+ _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|the-voice-of-germany)\.(?:de|at|ch)|ran\.de|fem\.com)/(?P<id>.+)'
_TESTS = [
{
+ # Tests changes introduced in https://github.com/rg3/youtube-dl/pull/6242
+ # in response to fixing https://github.com/rg3/youtube-dl/issues/6215:
+ # - malformed f4m manifest support
+ # - proper handling of URLs starting with `https?://` in 2.0 manifests
+ # - recursive child f4m manifests extraction
'url': 'http://www.prosieben.de/tv/circus-halligalli/videos/218-staffel-2-episode-18-jahresrueckblick-ganze-folge',
'info_dict': {
'id': '2104602',
'ext': 'mp4',
- 'title': 'Staffel 2, Episode 18 - Jahresrückblick',
+ 'title': 'Episode 18 - Staffel 2',
'description': 'md5:8733c81b702ea472e069bc48bb658fc1',
'upload_date': '20131231',
'duration': 5845.04,
@@ -176,6 +185,7 @@ class ProSiebenSat1IE(InfoExtractor):
r'<header class="clearfix">\s*<h3>(.+?)</h3>',
r'<!-- start video -->\s*<h1>(.+?)</h1>',
r'<h1 class="att-name">\s*(.+?)</h1>',
+ r'<header class="module_header">\s*<h2>([^<]+)</h2>\s*</header>',
]
_DESCRIPTION_REGEXES = [
r'<p itemprop="description">\s*(.+?)</p>',
@@ -205,8 +215,8 @@ class ProSiebenSat1IE(InfoExtractor):
def _extract_clip(self, url, webpage):
clip_id = self._html_search_regex(self._CLIPID_REGEXES, webpage, 'clip id')
- access_token = 'testclient'
- client_name = 'kolibri-1.2.5'
+ access_token = 'prosieben'
+ client_name = 'kolibri-2.0.19-splec4'
client_location = url
videos_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse.urlencode({
@@ -216,10 +226,13 @@ class ProSiebenSat1IE(InfoExtractor):
'ids': clip_id,
})
- videos = self._download_json(videos_api_url, clip_id, 'Downloading videos JSON')
+ video = self._download_json(videos_api_url, clip_id, 'Downloading videos JSON')[0]
- duration = float(videos[0]['duration'])
- source_ids = [source['id'] for source in videos[0]['sources']]
+ if video.get('is_protected') is True:
+ raise ExtractorError('This video is DRM protected.', expected=True)
+
+ duration = float_or_none(video.get('duration'))
+ source_ids = [source['id'] for source in video['sources']]
source_ids_str = ','.join(map(str, source_ids))
g = '01!8d8F_)r9]4s[qeuXfP%'
@@ -266,27 +279,37 @@ class ProSiebenSat1IE(InfoExtractor):
urls_sources = urls_sources.values()
def fix_bitrate(bitrate):
+ bitrate = int_or_none(bitrate)
+ if not bitrate:
+ return None
return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate
for source in urls_sources:
protocol = source['protocol']
+ source_url = source['url']
if protocol == 'rtmp' or protocol == 'rtmpe':
- mobj = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', source['url'])
+ mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url)
if not mobj:
continue
+ path = mobj.group('path')
+ mp4colon_index = path.rfind('mp4:')
+ app = path[:mp4colon_index]
+ play_path = path[mp4colon_index:]
formats.append({
- 'url': mobj.group('url'),
- 'app': mobj.group('app'),
- 'play_path': mobj.group('playpath'),
+ 'url': '%s/%s' % (mobj.group('url'), app),
+ 'app': app,
+ 'play_path': play_path,
'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf',
'page_url': 'http://www.prosieben.de',
'vbr': fix_bitrate(source['bitrate']),
'ext': 'mp4',
'format_id': '%s_%s' % (source['cdn'], source['bitrate']),
})
+ elif 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m':
+ formats.extend(self._extract_f4m_formats(source_url, clip_id))
else:
formats.append({
- 'url': source['url'],
+ 'url': source_url,
'vbr': fix_bitrate(source['bitrate']),
})
diff --git a/youtube_dl/extractor/puls4.py b/youtube_dl/extractor/puls4.py
new file mode 100644
index 000000000..cce84b9e4
--- /dev/null
+++ b/youtube_dl/extractor/puls4.py
@@ -0,0 +1,88 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ unified_strdate,
+ int_or_none,
+)
+
+
+class Puls4IE(InfoExtractor):
+    # Extractor for puls4.com video pages.
+    _VALID_URL = r'https?://(?:www\.)?puls4\.com/video/[^/]+/play/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://www.puls4.com/video/pro-und-contra/play/2716816',
+        'md5': '49f6a6629747eeec43cef6a46b5df81d',
+        'info_dict': {
+            'id': '2716816',
+            'ext': 'mp4',
+            'title': 'Pro und Contra vom 23.02.2015',
+            'description': 'md5:293e44634d9477a67122489994675db6',
+            'duration': 2989,
+            'upload_date': '20150224',
+            'uploader': 'PULS_4',
+        },
+        'skip': 'Only works from Germany',
+    }, {
+        'url': 'http://www.puls4.com/video/kult-spielfilme/play/1298106',
+        'md5': '6a48316c8903ece8dab9b9a7bf7a59ec',
+        'info_dict': {
+            'id': '1298106',
+            'ext': 'mp4',
+            'title': 'Lucky Fritz',
+        },
+        'skip': 'Only works from Germany',
+    }]
+
+    def _real_extract(self, url):
+        """Return the info dict for the video page at ``url``.
+
+        The arguments of the page's ``p4_video_player(...)`` call are parsed
+        as a JSON array. The first list argument supplies the formats and the
+        first dict argument supplies the metadata.
+
+        Raises ExtractorError when the site shows an error message, or when
+        the player call carries no metadata dict or no format list.
+        """
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        error_message = self._html_search_regex(
+            r'<div class="message-error">(.+?)</div>',
+            webpage, 'error message', default=None)
+        if error_message:
+            raise ExtractorError(
+                '%s returned error: %s' % (self.IE_NAME, error_message), expected=True)
+
+        # When an "fsk-button" link is present, the player is on the page it
+        # points to.
+        real_url = self._html_search_regex(
+            r'\"fsk-button\".+?href=\"([^"]+)',
+            webpage, 'fsk_button', default=None)
+        if real_url:
+            webpage = self._download_webpage(real_url, video_id)
+
+        player = self._search_regex(
+            r'p4_video_player(?:_iframe)?\("video_\d+_container"\s*,(.+?)\);\s*\}',
+            webpage, 'player')
+
+        # Wrap the argument list in brackets to parse it as an array; bare
+        # `undefined` arguments are not valid JSON and are dropped first.
+        player_json = self._parse_json(
+            '[%s]' % player, video_id,
+            transform_source=lambda s: s.replace('undefined,', ''))
+
+        formats = None
+        result = None
+
+        for v in player_json:
+            if isinstance(v, list) and not formats:
+                formats = [{
+                    'url': f['url'],
+                    'format': 'hd' if f.get('hd') else 'sd',
+                    'width': int_or_none(f.get('size_x')),
+                    'height': int_or_none(f.get('size_y')),
+                    'tbr': int_or_none(f.get('bitrate')),
+                } for f in v]
+                self._sort_formats(formats)
+            elif isinstance(v, dict) and not result:
+                result = {
+                    'id': video_id,
+                    'title': v['videopartname'].strip(),
+                    'description': v.get('videotitle'),
+                    'duration': int_or_none(v.get('videoduration') or v.get('episodeduration')),
+                    'upload_date': unified_strdate(v.get('clipreleasetime')),
+                    'uploader': v.get('channel'),
+                }
+
+        # Without this guard a player call lacking a dict argument crashed
+        # with a TypeError on `None['formats']`, and a missing format list
+        # was returned as `'formats': None`.
+        if result is None or not formats:
+            raise ExtractorError('Unable to extract video information from player data')
+
+        result['formats'] = formats
+
+        return result
diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py
new file mode 100644
index 000000000..1ba3bbddf
--- /dev/null
+++ b/youtube_dl/extractor/qqmusic.py
@@ -0,0 +1,344 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import random
+import time
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ sanitized_Request,
+ strip_jsonp,
+ unescapeHTML,
+ clean_html,
+)
+
+
+class QQMusicIE(InfoExtractor):
+    # Extractor for single songs on y.qq.com (QQ Music).
+    IE_NAME = 'qqmusic'
+    IE_DESC = 'QQ音乐'
+    # Dots in the host are escaped so that only y.qq.com itself matches.
+    _VALID_URL = r'http://y\.qq\.com/#type=song&mid=(?P<id>[0-9A-Za-z]+)'
+    _TESTS = [{
+        'url': 'http://y.qq.com/#type=song&mid=004295Et37taLD',
+        'md5': '9ce1c1c8445f561506d2e3cfb0255705',
+        'info_dict': {
+            'id': '004295Et37taLD',
+            'ext': 'mp3',
+            'title': '可惜没如果',
+            'release_date': '20141227',
+            'creator': '林俊杰',
+            'description': 'md5:d327722d0361576fde558f1ac68a7065',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        }
+    }, {
+        'note': 'There is no mp3-320 version of this song.',
+        'url': 'http://y.qq.com/#type=song&mid=004MsGEo3DdNxV',
+        'md5': 'fa3926f0c585cda0af8fa4f796482e3e',
+        'info_dict': {
+            'id': '004MsGEo3DdNxV',
+            'ext': 'mp3',
+            'title': '如果',
+            'release_date': '20050626',
+            'creator': '李季美',
+            'description': 'md5:46857d5ed62bc4ba84607a805dccf437',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        }
+    }, {
+        'note': 'lyrics not in .lrc format',
+        'url': 'http://y.qq.com/#type=song&mid=001JyApY11tIp6',
+        'info_dict': {
+            'id': '001JyApY11tIp6',
+            'ext': 'mp3',
+            'title': 'Shadows Over Transylvania',
+            'release_date': '19970225',
+            'creator': 'Dark Funeral',
+            'description': 'md5:ed14d5bd7ecec19609108052c25b2c11',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    # format id -> stream file-name prefix, extension and sort preference.
+    _FORMATS = {
+        'mp3-320': {'prefix': 'M800', 'ext': 'mp3', 'preference': 40, 'abr': 320},
+        'mp3-128': {'prefix': 'M500', 'ext': 'mp3', 'preference': 30, 'abr': 128},
+        'm4a': {'prefix': 'C200', 'ext': 'm4a', 'preference': 10}
+    }
+
+    # Reference: m_r_GetRUin() in top_player.js
+    # http://imgcache.gtimg.cn/music/portal_v3/y/top_player.js
+    @staticmethod
+    def m_r_get_ruin():
+        """Return a pseudo-random integer used as the ``guid`` request parameter."""
+        curMs = int(time.time() * 1000) % 1000
+        return int(round(random.random() * 2147483647) * curMs % 1E10)
+
+    def _real_extract(self, url):
+        """Return the info dict for the song page at ``url``.
+
+        Metadata is scraped from the song detail page. A ``vkey`` fetched
+        with a random guid is needed to build the stream URLs. Only the song
+        name is mandatory; lyrics, when present, become the description, and
+        their .lrc-formatted lines are also exposed as an ``origin`` subtitle.
+        """
+        mid = self._match_id(url)
+
+        detail_info_page = self._download_webpage(
+            'http://s.plcloud.music.qq.com/fcgi-bin/fcg_yqq_song_detail_info.fcg?songmid=%s&play=0' % mid,
+            mid, note='Download song detail info',
+            errnote='Unable to get song detail info', encoding='gbk')
+
+        song_name = self._html_search_regex(
+            r"songname:\s*'([^']+)'", detail_info_page, 'song name')
+
+        publish_time = self._html_search_regex(
+            r'发行时间:(\d{4}-\d{2}-\d{2})', detail_info_page,
+            'publish time', default=None)
+        if publish_time:
+            # YYYY-MM-DD -> YYYYMMDD
+            publish_time = publish_time.replace('-', '')
+
+        singer = self._html_search_regex(
+            r"singer:\s*'([^']+)", detail_info_page, 'singer', default=None)
+
+        lrc_content = self._html_search_regex(
+            r'<div class="content" id="lrc_content"[^<>]*>([^<>]+)</div>',
+            detail_info_page, 'LRC lyrics', default=None)
+        if lrc_content:
+            # The page stores line breaks as the literal two characters "\n".
+            lrc_content = lrc_content.replace('\\n', '\n')
+
+        thumbnail_url = None
+        albummid = self._search_regex(
+            [r'albummid:\'([0-9a-zA-Z]+)\'', r'"albummid":"([0-9a-zA-Z]+)"'],
+            detail_info_page, 'album mid', default=None)
+        if albummid:
+            # Album art is sharded by the last two characters of the album mid.
+            thumbnail_url = "http://i.gtimg.cn/music/photo/mid_album_500/%s/%s/%s.jpg" \
+                % (albummid[-2:-1], albummid[-1], albummid)
+
+        guid = self.m_r_get_ruin()
+
+        vkey = self._download_json(
+            'http://base.music.qq.com/fcgi-bin/fcg_musicexpress.fcg?json=3&guid=%s' % guid,
+            mid, note='Retrieve vkey', errnote='Unable to get vkey',
+            transform_source=strip_jsonp)['key']
+
+        formats = []
+        for format_id, details in self._FORMATS.items():
+            formats.append({
+                'url': 'http://cc.stream.qqmusic.qq.com/%s%s.%s?vkey=%s&guid=%s&fromtag=0'
+                       % (details['prefix'], mid, details['ext'], vkey, guid),
+                'format': format_id,
+                'format_id': format_id,
+                'preference': details['preference'],
+                'abr': details.get('abr'),
+            })
+        self._check_formats(formats, mid)
+        self._sort_formats(formats)
+
+        # Lyrics are optional (default=None above). Only scan them when
+        # present, because re.findall() raises TypeError on None.
+        actual_lrc_lyrics = ''
+        if lrc_content:
+            actual_lrc_lyrics = ''.join(
+                line + '\n' for line in re.findall(
+                    r'(?m)^(\[[0-9]{2}:[0-9]{2}(?:\.[0-9]{2,})?\][^\n]*|\[[^\]]*\])', lrc_content))
+
+        info_dict = {
+            'id': mid,
+            'formats': formats,
+            'title': song_name,
+            'release_date': publish_time,
+            'creator': singer,
+            'description': lrc_content,
+            'thumbnail': thumbnail_url
+        }
+        if actual_lrc_lyrics:
+            info_dict['subtitles'] = {
+                'origin': [{
+                    'ext': 'lrc',
+                    'data': actual_lrc_lyrics,
+                }]
+            }
+        return info_dict
+
+
+class QQPlaylistBaseIE(InfoExtractor):
+    # Helpers shared by the QQ Music extractors that return playlists.
+    @staticmethod
+    def qq_static_url(category, mid):
+        """Return the static y.qq.com page URL for ``mid`` within ``category``."""
+        # The path is sharded by the last two characters of the mid.
+        path_parts = (category, mid[-2], mid[-1], mid)
+        return 'http://y.qq.com/y/static/%s/%s/%s/%s.html' % path_parts
+
+    @classmethod
+    def get_entries_from_page(cls, page):
+        """Return url_result entries for every song record found in ``page``."""
+        # Each class="data" element holds a '|'-separated record; the song mid
+        # is the fifth field counting from the end.
+        song_mids = [
+            unescapeHTML(record).split('|')[-5]
+            for record in re.findall(r'class="data"[^<>]*>([^<>]+)</', page)
+        ]
+        return [
+            cls.url_result(
+                'http://y.qq.com/#type=song&mid=' + song_mid, 'QQMusic',
+                song_mid)
+            for song_mid in song_mids
+        ]
+
+
+class QQMusicSingerIE(QQPlaylistBaseIE):
+    # Playlist extractor for a singer's page on y.qq.com.
+    IE_NAME = 'qqmusic:singer'
+    IE_DESC = 'QQ音乐 - 歌手'
+    # Dots in the host are escaped so that only y.qq.com itself matches.
+    _VALID_URL = r'http://y\.qq\.com/#type=singer&mid=(?P<id>[0-9A-Za-z]+)'
+    _TEST = {
+        'url': 'http://y.qq.com/#type=singer&mid=001BLpXF2DyJe2',
+        'info_dict': {
+            'id': '001BLpXF2DyJe2',
+            'title': '林俊杰',
+            'description': 'md5:2a222d89ba4455a3af19940c0481bb78',
+        },
+        'playlist_count': 12,
+    }
+
+    def _real_extract(self, url):
+        """Return a playlist of the songs listed on the singer page at ``url``.
+
+        The singer's name and numeric id are scraped from the static singer
+        page. When the id is found, a second request fetches the description
+        XML. Both name and description are optional.
+        """
+        mid = self._match_id(url)
+
+        singer_page = self._download_webpage(
+            self.qq_static_url('singer', mid), mid, 'Download singer page')
+
+        entries = self.get_entries_from_page(singer_page)
+
+        singer_name = self._html_search_regex(
+            r"singername\s*:\s*'([^']+)'", singer_page, 'singer name',
+            default=None)
+
+        singer_id = self._html_search_regex(
+            r"singerid\s*:\s*'([0-9]+)'", singer_page, 'singer id',
+            default=None)
+
+        singer_desc = None
+
+        if singer_id:
+            req = sanitized_Request(
+                'http://s.plcloud.music.qq.com/fcgi-bin/fcg_get_singer_desc.fcg?utf8=1&outCharset=utf-8&format=xml&singerid=%s' % singer_id)
+            req.add_header(
+                'Referer', 'http://s.plcloud.music.qq.com/xhr_proxy_utf8.html')
+            singer_desc_page = self._download_xml(
+                req, mid, 'Download singer description XML')
+
+            # find() returns None when the node is absent. The description is
+            # optional, so leave it unset rather than crash on `.text`.
+            desc_node = singer_desc_page.find('./data/info/desc')
+            if desc_node is not None:
+                singer_desc = desc_node.text
+
+        return self.playlist_result(entries, mid, singer_name, singer_desc)
+
+
+class QQMusicAlbumIE(QQPlaylistBaseIE):
+    # Playlist extractor for an album page on y.qq.com.
+    IE_NAME = 'qqmusic:album'
+    IE_DESC = 'QQ音乐 - 专辑'
+    # Dots in the host are escaped so that only y.qq.com itself matches,
+    # as in the toplist and playlist extractors.
+    _VALID_URL = r'http://y\.qq\.com/#type=album&mid=(?P<id>[0-9A-Za-z]+)'
+
+    _TESTS = [{
+        'url': 'http://y.qq.com/#type=album&mid=000gXCTb2AhRR1',
+        'info_dict': {
+            'id': '000gXCTb2AhRR1',
+            'title': '我们都是这样长大的',
+            'description': 'md5:179c5dce203a5931970d306aa9607ea6',
+        },
+        'playlist_count': 4,
+    }, {
+        'url': 'http://y.qq.com/#type=album&mid=002Y5a3b3AlCu3',
+        'info_dict': {
+            'id': '002Y5a3b3AlCu3',
+            'title': '그리고...',
+            'description': 'md5:a48823755615508a95080e81b51ba729',
+        },
+        'playlist_count': 8,
+    }]
+
+    def _real_extract(self, url):
+        """Return a playlist of the songs in the album at ``url``.
+
+        Entries come from the ``list`` array of the album-info JSON. Name and
+        description are optional, and the description is stripped of
+        surrounding whitespace.
+        """
+        mid = self._match_id(url)
+
+        album = self._download_json(
+            'http://i.y.qq.com/v8/fcg-bin/fcg_v8_album_info_cp.fcg?albummid=%s&format=json' % mid,
+            mid, 'Download album page')['data']
+
+        entries = [
+            self.url_result(
+                'http://y.qq.com/#type=song&mid=' + song['songmid'], 'QQMusic', song['songmid']
+            ) for song in album['list']
+        ]
+        album_name = album.get('name')
+        album_detail = album.get('desc')
+        if album_detail is not None:
+            album_detail = album_detail.strip()
+
+        return self.playlist_result(entries, mid, album_name, album_detail)
+
+
+class QQMusicToplistIE(QQPlaylistBaseIE):
+    # Playlist extractor for chart ("toplist") pages on y.qq.com.
+    IE_NAME = 'qqmusic:toplist'
+    IE_DESC = 'QQ音乐 - 排行榜'
+    _VALID_URL = r'http://y\.qq\.com/#type=toplist&p=(?P<id>(top|global)_[0-9]+)'
+
+    _TESTS = [{
+        'url': 'http://y.qq.com/#type=toplist&p=global_123',
+        'info_dict': {
+            'id': 'global_123',
+            'title': '美国iTunes榜',
+        },
+        'playlist_count': 10,
+    }, {
+        'url': 'http://y.qq.com/#type=toplist&p=top_3',
+        'info_dict': {
+            'id': 'top_3',
+            'title': 'QQ音乐巅峰榜·欧美',
+            'description': 'QQ音乐巅峰榜·欧美根据用户收听行为自动生成,集结当下最流行的欧美新歌!:更新时间:每周四22点|统'
+                           '计周期:一周(上周四至本周三)|统计对象:三个月内发行的欧美歌曲|统计数量:100首|统计算法:根据'
+                           '歌曲在一周内的有效播放次数,由高到低取前100名(同一歌手最多允许5首歌曲同时上榜)|有效播放次数:'
+                           '登录用户完整播放一首歌曲,记为一次有效播放;同一用户收听同一首歌曲,每天记录为1次有效播放'
+        },
+        'playlist_count': 100,
+    }, {
+        'url': 'http://y.qq.com/#type=toplist&p=global_106',
+        'info_dict': {
+            'id': 'global_106',
+            'title': '韩国Mnet榜',
+        },
+        'playlist_count': 50,
+    }]
+
+    def _real_extract(self, url):
+        """Return a playlist of the songs on the chart at ``url``."""
+        list_id = self._match_id(url)
+
+        # The id is "<type>_<number>"; both halves are query parameters.
+        list_type, num_id = list_id.split("_")
+        api_url = (
+            'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg?type=%s&topid=%s&format=json'
+            % (list_type, num_id))
+
+        toplist_json = self._download_json(
+            api_url, list_id, 'Download toplist page')
+
+        entries = []
+        for song in toplist_json['songlist']:
+            entries.append(self.url_result(
+                'http://y.qq.com/#type=song&mid=' + song['data']['songmid'],
+                'QQMusic', song['data']['songmid']))
+
+        # Chart name and description are optional.
+        topinfo = toplist_json.get('topinfo', {})
+        return self.playlist_result(
+            entries, list_id, topinfo.get('ListName'), topinfo.get('info'))
+
+
+class QQMusicPlaylistIE(QQPlaylistBaseIE):
+    # Playlist extractor for user song lists ("taoge") on y.qq.com.
+    IE_NAME = 'qqmusic:playlist'
+    IE_DESC = 'QQ音乐 - 歌单'
+    _VALID_URL = r'http://y\.qq\.com/#type=taoge&id=(?P<id>[0-9]+)'
+
+    _TEST = {
+        'url': 'http://y.qq.com/#type=taoge&id=3462654915',
+        'info_dict': {
+            'id': '3462654915',
+            'title': '韩国5月新歌精选下旬',
+            'description': 'md5:d2c9d758a96b9888cf4fe82f603121d4',
+        },
+        'playlist_count': 40,
+    }
+
+    def _real_extract(self, url):
+        """Return a playlist of the songs in the song list at ``url``."""
+        list_id = self._match_id(url)
+
+        api_url = (
+            'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg?type=1&json=1&utf8=1&onlysong=0&disstid=%s'
+            % list_id)
+        # The response is JSONP; only the first entry of `cdlist` is used.
+        response = self._download_json(
+            api_url, list_id, 'Download list page',
+            transform_source=strip_jsonp)
+        list_json = response['cdlist'][0]
+
+        entries = []
+        for song in list_json['songlist']:
+            entries.append(self.url_result(
+                'http://y.qq.com/#type=song&mid=' + song['songmid'],
+                'QQMusic', song['songmid']))
+
+        # The description arrives as HTML-escaped markup: unescape, then strip tags.
+        list_description = clean_html(unescapeHTML(list_json.get('desc')))
+        return self.playlist_result(
+            entries, list_id, list_json.get('dissname'), list_description)
diff --git a/youtube_dl/extractor/quickvid.py b/youtube_dl/extractor/quickvid.py
index af7d76cf4..f414e2384 100644
--- a/youtube_dl/extractor/quickvid.py
+++ b/youtube_dl/extractor/quickvid.py
@@ -24,6 +24,7 @@ class QuickVidIE(InfoExtractor):
'thumbnail': 're:^https?://.*\.(?:png|jpg|gif)$',
'view_count': int,
},
+ 'skip': 'Not accessible from Travis CI server',
}
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/r7.py b/youtube_dl/extractor/r7.py
new file mode 100644
index 000000000..976c8feec
--- /dev/null
+++ b/youtube_dl/extractor/r7.py
@@ -0,0 +1,88 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ js_to_json,
+ unescapeHTML,
+ int_or_none,
+)
+
+
+class R7IE(InfoExtractor):
+    # Extractor for r7.com videos, addressed by a 24-digit hex media id.
+    _VALID_URL = r'''(?x)https?://
+                        (?:
+                            (?:[a-zA-Z]+)\.r7\.com(?:/[^/]+)+/idmedia/|
+                            noticias\.r7\.com(?:/[^/]+)+/[^/]+-|
+                            player\.r7\.com/video/i/
+                        )
+                        (?P<id>[\da-f]{24})
+                    '''
+    _TESTS = [{
+        'url': 'http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html',
+        'md5': '403c4e393617e8e8ddc748978ee8efde',
+        'info_dict': {
+            'id': '54e7050b0cf2ff57e0279389',
+            'ext': 'mp4',
+            'title': 'Policiais humilham suspeito à beira da morte: "Morre com dignidade"',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 98,
+            'like_count': int,
+            'view_count': int,
+        },
+    }, {
+        'url': 'http://esportes.r7.com/videos/cigano-manda-recado-aos-fas/idmedia/4e176727b51a048ee6646a1b.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://noticias.r7.com/record-news/video/representante-do-instituto-sou-da-paz-fala-sobre-fim-do-estatuto-do-desarmamento-5480fc580cf2285b117f438d/',
+        'only_matching': True,
+    }, {
+        'url': 'http://player.r7.com/video/i/54e7050b0cf2ff57e0279389?play=true&video=http://vsh.r7.com/54e7050b0cf2ff57e0279389/ER7_RE_BG_MORTE_JOVENS_570kbps_2015-02-2009f17818-cc82-4c8f-86dc-89a66934e633-ATOS_copy.mp4&linkCallback=http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html&thumbnail=http://vtb.r7.com/ER7_RE_BG_MORTE_JOVENS_570kbps_2015-02-2009f17818-cc82-4c8f-86dc-89a66934e633-thumb.jpg&idCategory=192&share=true&layout=full&full=true',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        """Return the info dict for the video identified in ``url``.
+
+        Whatever page the URL names, the data is read from the player page
+        for the same id, which embeds it as a JavaScript ``item`` object.
+        """
+        video_id = self._match_id(url)
+
+        player_url = 'http://player.r7.com/video/i/%s' % video_id
+        webpage = self._download_webpage(player_url, video_id)
+
+        item_js = self._search_regex(
+            r'(?s)var\s+item\s*=\s*({.+?});', webpage, 'player')
+        item = self._parse_json(js_to_json(item_js), video_id)
+
+        title = unescapeHTML(item['title'])
+        thumbnail = item.get('init', {}).get('thumbUri')
+
+        stats = item.get('statistics', {})
+        like_count = int_or_none(stats.get('likes'))
+        view_count = int_or_none(stats.get('views'))
+
+        duration = None
+        formats = []
+        for key, fmt in item['playlist'][0].items():
+            src = fmt.get('src')
+            if not src:
+                continue
+            format_id = fmt.get('format') or key
+            # The duration comes from the first usable format entry.
+            if duration is None:
+                duration = fmt.get('duration')
+            if '.f4m' in src:
+                formats.extend(self._extract_f4m_formats(src, video_id, preference=-1))
+                continue
+            if src.endswith('.m3u8'):
+                formats.extend(self._extract_m3u8_formats(src, video_id, 'mp4', preference=-2))
+                continue
+            formats.append({
+                'url': src,
+                'format_id': format_id,
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'like_count': like_count,
+            'view_count': view_count,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/radiode.py b/youtube_dl/extractor/radiode.py
index f95bc9454..aa5f6f8ad 100644
--- a/youtube_dl/extractor/radiode.py
+++ b/youtube_dl/extractor/radiode.py
@@ -1,7 +1,5 @@
from __future__ import unicode_literals
-import json
-
from .common import InfoExtractor
@@ -10,13 +8,13 @@ class RadioDeIE(InfoExtractor):
_VALID_URL = r'https?://(?P<id>.+?)\.(?:radio\.(?:de|at|fr|pt|es|pl|it)|rad\.io)'
_TEST = {
'url': 'http://ndr2.radio.de/',
- 'md5': '3b4cdd011bc59174596b6145cda474a4',
'info_dict': {
'id': 'ndr2',
'ext': 'mp3',
'title': 're:^NDR 2 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'description': 'md5:591c49c702db1a33751625ebfb67f273',
'thumbnail': 're:^https?://.*\.png',
+ 'is_live': True,
},
'params': {
'skip_download': True,
@@ -25,16 +23,15 @@ class RadioDeIE(InfoExtractor):
def _real_extract(self, url):
radio_id = self._match_id(url)
-
webpage = self._download_webpage(url, radio_id)
+ jscode = self._search_regex(
+ r"'components/station/stationService':\s*\{\s*'?station'?:\s*(\{.*?\s*\}),\n",
+ webpage, 'broadcast')
- broadcast = json.loads(self._search_regex(
- r'_getBroadcast\s*=\s*function\(\s*\)\s*{\s*return\s+({.+?})\s*;\s*}',
- webpage, 'broadcast'))
-
+ broadcast = self._parse_json(jscode, radio_id)
title = self._live_title(broadcast['name'])
description = broadcast.get('description') or broadcast.get('shortDescription')
- thumbnail = broadcast.get('picture4Url') or broadcast.get('picture4TransUrl')
+ thumbnail = broadcast.get('picture4Url') or broadcast.get('picture4TransUrl') or broadcast.get('logo100x100')
formats = [{
'url': stream['streamUrl'],
diff --git a/youtube_dl/extractor/radiojavan.py b/youtube_dl/extractor/radiojavan.py
new file mode 100644
index 000000000..884c28420
--- /dev/null
+++ b/youtube_dl/extractor/radiojavan.py
@@ -0,0 +1,67 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import(
+ unified_strdate,
+ str_to_int,
+)
+
+
+class RadioJavanIE(InfoExtractor):
+    # Extractor for music videos on radiojavan.com.
+    _VALID_URL = r'https?://(?:www\.)?radiojavan\.com/videos/video/(?P<id>[^/]+)/?'
+    _TEST = {
+        'url': 'http://www.radiojavan.com/videos/video/chaartaar-ashoobam',
+        'md5': 'e85208ffa3ca8b83534fca9fe19af95b',
+        'info_dict': {
+            'id': 'chaartaar-ashoobam',
+            'ext': 'mp4',
+            'title': 'Chaartaar - Ashoobam',
+            'thumbnail': 're:^https?://.*\.jpe?g$',
+            'upload_date': '20150215',
+            'view_count': int,
+            'like_count': int,
+            'dislike_count': int,
+        }
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict for the video page at ``url``.
+
+        Formats come from the page's ``RJ.video<height>p = '<path>'``
+        assignments. Title and thumbnail come from Open Graph tags, and the
+        date and counters are optional.
+        """
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        formats = []
+        for height, video_path in re.findall(
+                r"RJ\.video(\d+)p\s*=\s*'/?([^']+)'", webpage):
+            formats.append({
+                'url': 'https://media.rdjavan.com/media/music_video/%s' % video_path,
+                'format_id': '%sp' % height,
+                'height': int(height),
+            })
+        self._sort_formats(formats)
+
+        title = self._og_search_title(webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        date_text = self._search_regex(
+            r'class="date_added">Date added: ([^<]+)<',
+            webpage, 'upload date', fatal=False)
+        views_text = self._search_regex(
+            r'class="views">Plays: ([\d,]+)',
+            webpage, 'view count', fatal=False)
+        likes_text = self._search_regex(
+            r'class="rating">([\d,]+) likes',
+            webpage, 'like count', fatal=False)
+        dislikes_text = self._search_regex(
+            r'class="rating">([\d,]+) dislikes',
+            webpage, 'dislike count', fatal=False)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'upload_date': unified_strdate(date_text),
+            'view_count': str_to_int(views_text),
+            'like_count': str_to_int(likes_text),
+            'dislike_count': str_to_int(dislikes_text),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py
index aa26b7e0b..7ff1d06c4 100644
--- a/youtube_dl/extractor/rai.py
+++ b/youtube_dl/extractor/rai.py
@@ -2,9 +2,10 @@ from __future__ import unicode_literals
import re
-from .subtitles import SubtitlesInfoExtractor
+from .common import InfoExtractor
from ..compat import (
compat_urllib_parse,
+ compat_urlparse,
)
from ..utils import (
parse_duration,
@@ -12,8 +13,8 @@ from ..utils import (
)
-class RaiIE(SubtitlesInfoExtractor):
- _VALID_URL = r'(?P<url>http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html)'
+class RaiIE(InfoExtractor):
+ _VALID_URL = r'(?P<url>(?P<host>http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it))/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html)'
_TESTS = [
{
'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html',
@@ -62,42 +63,93 @@ class RaiIE(SubtitlesInfoExtractor):
'description': 'Edizione delle ore 20:30 ',
}
},
+ {
+ 'url': 'http://www.ilcandidato.rai.it/dl/ray/media/Il-Candidato---Primo-episodio-Le-Primarie-28e5525a-b495-45e8-a7c3-bc48ba45d2b6.html',
+ 'md5': '02b64456f7cc09f96ff14e7dd489017e',
+ 'info_dict': {
+ 'id': '28e5525a-b495-45e8-a7c3-bc48ba45d2b6',
+ 'ext': 'flv',
+ 'title': 'Il Candidato - Primo episodio: "Le Primarie"',
+ 'description': 'Primo appuntamento con "Il candidato" con Filippo Timi, alias Piero Zucca presidente!',
+ 'uploader': 'RaiTre',
+ }
+ },
+ {
+ 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html',
+ 'md5': '037104d2c14132887e5e4cf114569214',
+ 'info_dict': {
+ 'id': '0c7a664b-d0f4-4b2c-8835-3f82e46f433e',
+ 'ext': 'flv',
+ 'title': 'Il pacco',
+ 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a',
+ 'uploader': 'RaiTre',
+ 'upload_date': '20141221',
+ },
+ }
]
+    def _extract_relinker_url(self, webpage):
+        """Look up the relinker URL embedded in a Rai page.
+
+        Two patterns are handed to ``_search_regex``: the ``videourl``
+        meta attribute and the ``videoURL`` / ``videoURL_MP4`` JavaScript
+        variable. With ``default=None`` a miss yields None instead of an
+        error; the result is passed through ``_proto_relative_url``.
+        """
+        relinker_patterns = [
+            r'name="videourl" content="([^"]+)"',
+            r'var\s+videoURL(?:_MP4)?\s*=\s*"([^"]+)"',
+        ]
+        raw_url = self._search_regex(
+            relinker_patterns, webpage, 'relinker url', default=None)
+        return self._proto_relative_url(raw_url)
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
+ host = mobj.group('host')
- media = self._download_json('%s?json' % mobj.group('url'), video_id, 'Downloading video JSON')
+ webpage = self._download_webpage(url, video_id)
- title = media.get('name')
- description = media.get('desc')
- thumbnail = media.get('image_300') or media.get('image_medium') or media.get('image')
- duration = parse_duration(media.get('length'))
- uploader = media.get('author')
- upload_date = unified_strdate(media.get('date'))
+ relinker_url = self._extract_relinker_url(webpage)
- formats = []
+ if not relinker_url:
+ iframe_url = self._search_regex(
+ [r'<iframe[^>]+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"',
+ r'drawMediaRaiTV\(["\'](.+?)["\']'],
+ webpage, 'iframe')
+ if not iframe_url.startswith('http'):
+ iframe_url = compat_urlparse.urljoin(url, iframe_url)
+ webpage = self._download_webpage(
+ iframe_url, video_id)
+ relinker_url = self._extract_relinker_url(webpage)
- for format_id in ['wmv', 'm3u8', 'mediaUri', 'h264']:
- media_url = media.get(format_id)
- if not media_url:
- continue
- formats.append({
+ relinker = self._download_json(
+ '%s&output=47' % relinker_url, video_id)
+
+ media_url = relinker['video'][0]
+ ct = relinker.get('ct')
+ if ct == 'f4m':
+ formats = self._extract_f4m_formats(
+ media_url + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id)
+ else:
+ formats = [{
'url': media_url,
- 'format_id': format_id,
- 'ext': 'mp4',
- })
+ 'format_id': ct,
+ }]
- if self._downloader.params.get('listsubtitles', False):
- page = self._download_webpage(url, video_id)
- self._list_available_subtitles(video_id, page)
- return
+ json_link = self._html_search_meta(
+ 'jsonlink', webpage, 'JSON link', default=None)
+ if json_link:
+ media = self._download_json(
+ host + json_link, video_id, 'Downloading video JSON')
+ title = media.get('name')
+ description = media.get('desc')
+ thumbnail = media.get('image_300') or media.get('image_medium') or media.get('image')
+ duration = parse_duration(media.get('length'))
+ uploader = media.get('author')
+ upload_date = unified_strdate(media.get('date'))
+ else:
+ title = (self._search_regex(
+ r'var\s+videoTitolo\s*=\s*"(.+?)";',
+ webpage, 'title', default=None) or self._og_search_title(webpage)).replace('\\"', '"')
+ description = self._og_search_description(webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
+ duration = None
+ uploader = self._html_search_meta('Editore', webpage, 'uploader')
+ upload_date = unified_strdate(self._html_search_meta(
+ 'item-date', webpage, 'upload date', default=None))
- subtitles = {}
- if self._have_to_download_any_subtitles:
- page = self._download_webpage(url, video_id)
- subtitles = self.extract_subtitles(video_id, page)
+ subtitles = self.extract_subtitles(video_id, webpage)
return {
'id': video_id,
@@ -111,7 +163,7 @@ class RaiIE(SubtitlesInfoExtractor):
'subtitles': subtitles,
}
- def _get_available_subtitles(self, video_id, webpage):
+ def _get_subtitles(self, video_id, webpage):
subtitles = {}
m = re.search(r'<meta name="closedcaption" content="(?P<captions>[^"]+)"', webpage)
if m:
@@ -120,5 +172,8 @@ class RaiIE(SubtitlesInfoExtractor):
SRT_EXT = '.srt'
if captions.endswith(STL_EXT):
captions = captions[:-len(STL_EXT)] + SRT_EXT
- subtitles['it'] = 'http://www.rai.tv%s' % compat_urllib_parse.quote(captions)
+ subtitles['it'] = [{
+ 'ext': 'srt',
+ 'url': 'http://www.rai.tv%s' % compat_urllib_parse.quote(captions),
+ }]
return subtitles
diff --git a/youtube_dl/extractor/rds.py b/youtube_dl/extractor/rds.py
new file mode 100644
index 000000000..796adfdf9
--- /dev/null
+++ b/youtube_dl/extractor/rds.py
@@ -0,0 +1,73 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_duration,
+ parse_iso8601,
+)
+
+
+class RDSIE(InfoExtractor):
+    """Extractor for video pages on rds.ca.
+
+    The media URL, upload date and duration are read from ``itemprop``
+    attributes in the page markup; title, description and thumbnail come
+    from the ``_og_search_*`` helpers first, with markup fallbacks.
+    """
+    IE_DESC = 'RDS.ca'
+    _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P<display_id>[^/]+)-(?P<id>\d+\.\d+)'
+
+    _TESTS = [{
+        'url': 'http://www.rds.ca/videos/football/nfl/fowler-jr-prend-la-direction-de-jacksonville-3.1132799',
+        'info_dict': {
+            'id': '3.1132799',
+            'display_id': 'fowler-jr-prend-la-direction-de-jacksonville',
+            'ext': 'mp4',
+            'title': 'Fowler Jr. prend la direction de Jacksonville',
+            'description': 'Dante Fowler Jr. est le troisième choix du repêchage 2015 de la NFL. ',
+            'timestamp': 1430397346,
+            'upload_date': '20150430',
+            'duration': 154.354,
+            'age_limit': 0,
+        }
+    }, {
+        'url': 'http://www.rds.ca/vid%C3%A9os/un-voyage-positif-3.877934',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        """Return the info dict for the RDS video page at ``url``.
+
+        Extraction fails if the ``contentURL`` itemprop is missing, or if
+        neither the og title nor the ``title`` meta tag can be found.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        # TODO: extract f4m from 9c9media.com
+        video_url = self._search_regex(
+            r'<span[^>]+itemprop="contentURL"[^>]+content="([^"]+)"',
+            webpage, 'video url')
+
+        # _og_search_title is fatal unless told otherwise, which made the
+        # meta-tag fallback below unreachable. default=None lets a missing
+        # og:title fall through; the fallback itself stays fatal so a page
+        # with no title at all still fails.
+        title = self._og_search_title(webpage, default=None) or self._html_search_meta(
+            'title', webpage, 'title', fatal=True)
+        description = self._og_search_description(webpage) or self._html_search_meta(
+            'description', webpage, 'description')
+        thumbnail = self._og_search_thumbnail(webpage) or self._search_regex(
+            [r'<link[^>]+itemprop="thumbnailUrl"[^>]+href="([^"]+)"',
+             r'<span[^>]+itemprop="thumbnailUrl"[^>]+content="([^"]+)"'],
+            webpage, 'thumbnail', fatal=False)
+        timestamp = parse_iso8601(self._search_regex(
+            r'<span[^>]+itemprop="uploadDate"[^>]+content="([^"]+)"',
+            webpage, 'upload date', fatal=False))
+        duration = parse_duration(self._search_regex(
+            r'<span[^>]+itemprop="duration"[^>]+content="([^"]+)"',
+            webpage, 'duration', fatal=False))
+        age_limit = self._family_friendly_search(webpage)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'url': video_url,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'duration': duration,
+            'age_limit': age_limit,
+        }
diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py
index 846b76c81..d6054d717 100644
--- a/youtube_dl/extractor/redtube.py
+++ b/youtube_dl/extractor/redtube.py
@@ -1,17 +1,19 @@
from __future__ import unicode_literals
from .common import InfoExtractor
+from ..utils import ExtractorError
class RedTubeIE(InfoExtractor):
_VALID_URL = r'http://(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
_TEST = {
'url': 'http://www.redtube.com/66418',
+ 'md5': '7b8c22b5e7098a3e1c09709df1126d2d',
'info_dict': {
'id': '66418',
'ext': 'mp4',
- "title": "Sucked on a toilet",
- "age_limit": 18,
+ 'title': 'Sucked on a toilet',
+ 'age_limit': 18,
}
}
@@ -19,6 +21,9 @@ class RedTubeIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
+ if any(s in webpage for s in ['video-deleted-info', '>This video has been removed']):
+ raise ExtractorError('Video %s has been removed' % video_id, expected=True)
+
video_url = self._html_search_regex(
r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL')
video_title = self._html_search_regex(
diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py
index dce64e151..e42b319a3 100644
--- a/youtube_dl/extractor/rtbf.py
+++ b/youtube_dl/extractor/rtbf.py
@@ -1,49 +1,70 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-import json
-
from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ unescapeHTML,
+)
class RTBFIE(InfoExtractor):
- _VALID_URL = r'https?://www.rtbf.be/video/[^\?]+\?id=(?P<id>\d+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?rtbf\.be/(?:video/[^?]+\?.*\bid=|ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=)(?P<id>\d+)'
+ _TESTS = [{
'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274',
'md5': '799f334ddf2c0a582ba80c44655be570',
'info_dict': {
'id': '1921274',
'ext': 'mp4',
'title': 'Les Diables au coeur (épisode 2)',
- 'description': 'Football - Diables Rouges',
'duration': 3099,
- 'timestamp': 1398456336,
- 'upload_date': '20140425',
}
- }
+ }, {
+ # geo restricted
+ 'url': 'http://www.rtbf.be/ouftivi/heros/detail_scooby-doo-mysteres-associes?id=1097&videoId=2057442',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.rtbf.be/ouftivi/niouzz?videoId=2055858',
+ 'only_matching': True,
+ }]
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ _QUALITIES = [
+ ('mobile', 'mobile'),
+ ('web', 'SD'),
+ ('url', 'MD'),
+ ('high', 'HD'),
+ ]
- page = self._download_webpage('https://www.rtbf.be/video/embed?id=%s' % video_id, video_id)
+    def _real_extract(self, url):
+        """Extract an RTBF video through its embed page.
+
+        The embed page's ``data-media`` attribute holds HTML-escaped JSON.
+        Items whose provider is YouTube are delegated to the Youtube
+        extractor; otherwise one format is emitted for every
+        ``_QUALITIES`` key that has a URL in ``data['sources']``.
+        """
+        video_id = self._match_id(url)
-        data = json.loads(self._html_search_regex(
-            r'<div class="js-player-embed(?: player-embed)?" data-video="([^"]+)"', page, 'data video'))['data']
+        webpage = self._download_webpage(
+            'http://www.rtbf.be/video/embed?id=%s' % video_id, video_id)
-        video_url = data.get('downloadUrl') or data.get('url')
+        data = self._parse_json(
+            unescapeHTML(self._search_regex(
+                r'data-media="([^"]+)"', webpage, 'data video')),
+            video_id)
-        if data['provider'].lower() == 'youtube':
+        # 'provider' may be absent or null; fall back to '' so that
+        # .lower() is never called on None.
+        if (data.get('provider') or '').lower() == 'youtube':
+            video_url = data.get('downloadUrl') or data.get('url')
             return self.url_result(video_url, 'Youtube')
+        formats = []
+        for key, format_id in self._QUALITIES:
+            format_url = data['sources'].get(key)
+            if format_url:
+                formats.append({
+                    'format_id': format_id,
+                    'url': format_url,
+                })
         return {
             'id': video_id,
-            'url': video_url,
+            'formats': formats,
             'title': data['title'],
             'description': data.get('description') or data.get('subtitle'),
-            'thumbnail': data['thumbnail']['large'],
+            'thumbnail': data.get('thumbnail'),
             'duration': data.get('duration') or data.get('realDuration'),
-            'timestamp': data['created'],
-            'view_count': data['viewCount'],
+            'timestamp': int_or_none(data.get('created')),
+            'view_count': int_or_none(data.get('viewCount')),
         }
diff --git a/youtube_dl/extractor/rte.py b/youtube_dl/extractor/rte.py
index 04158b993..d9cfbf180 100644
--- a/youtube_dl/extractor/rte.py
+++ b/youtube_dl/extractor/rte.py
@@ -9,16 +9,16 @@ from ..utils import (
class RteIE(InfoExtractor):
- _VALID_URL = r'http?://(?:www\.)?rte\.ie/player/[^/]{2,3}/show/(?P<id>[0-9]+)/'
+ _VALID_URL = r'https?://(?:www\.)?rte\.ie/player/[^/]{2,3}/show/[^/]+/(?P<id>[0-9]+)'
_TEST = {
- 'url': 'http://www.rte.ie/player/de/show/10363114/',
+ 'url': 'http://www.rte.ie/player/ie/show/iwitness-862/10478715/',
'info_dict': {
- 'id': '10363114',
+ 'id': '10478715',
'ext': 'mp4',
- 'title': 'One News',
+ 'title': 'Watch iWitness online',
'thumbnail': 're:^https?://.*\.jpg$',
- 'description': 'The One O\'Clock News followed by Weather.',
- 'duration': 436.844,
+ 'description': 'iWitness : The spirit of Ireland, one voice and one minute at a time.',
+ 'duration': 60.046,
},
'params': {
'skip_download': 'f4m fails with --test atm'
diff --git a/youtube_dl/extractor/rtl2.py b/youtube_dl/extractor/rtl2.py
index 72cd80498..25f7faf76 100644
--- a/youtube_dl/extractor/rtl2.py
+++ b/youtube_dl/extractor/rtl2.py
@@ -1,6 +1,7 @@
# encoding: utf-8
from __future__ import unicode_literals
+import re
from .common import InfoExtractor
@@ -8,22 +9,28 @@ class RTL2IE(InfoExtractor):
_VALID_URL = r'http?://(?:www\.)?rtl2\.de/[^?#]*?/(?P<id>[^?#/]*?)(?:$|/(?:$|[?#]))'
_TESTS = [{
'url': 'http://www.rtl2.de/sendung/grip-das-motormagazin/folge/folge-203-0',
- 'md5': 'bfcc179030535b08dc2b36b469b5adc7',
'info_dict': {
'id': 'folge-203-0',
'ext': 'f4v',
'title': 'GRIP sucht den Sommerkönig',
'description': 'Matthias, Det und Helge treten gegeneinander an.'
},
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
}, {
'url': 'http://www.rtl2.de/sendung/koeln-50667/video/5512-anna/21040-anna-erwischt-alex/',
- 'md5': 'ffcd517d2805b57ce11a58a2980c2b02',
'info_dict': {
'id': '21040-anna-erwischt-alex',
'ext': 'mp4',
'title': 'Anna erwischt Alex!',
'description': 'Anna ist Alex\' Tochter bei Köln 50667.'
},
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
}]
def _real_extract(self, url):
@@ -34,12 +41,18 @@ class RTL2IE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- vico_id = self._html_search_regex(
- r'vico_id\s*:\s*([0-9]+)', webpage, 'vico_id')
- vivi_id = self._html_search_regex(
- r'vivi_id\s*:\s*([0-9]+)', webpage, 'vivi_id')
+ mobj = re.search(
+ r'<div[^>]+data-collection="(?P<vico_id>\d+)"[^>]+data-video="(?P<vivi_id>\d+)"',
+ webpage)
+ if mobj:
+ vico_id = mobj.group('vico_id')
+ vivi_id = mobj.group('vivi_id')
+ else:
+ vico_id = self._html_search_regex(
+ r'vico_id\s*:\s*([0-9]+)', webpage, 'vico_id')
+ vivi_id = self._html_search_regex(
+ r'vivi_id\s*:\s*([0-9]+)', webpage, 'vivi_id')
info_url = 'http://www.rtl2.de/video/php/get_video.php?vico_id=' + vico_id + '&vivi_id=' + vivi_id
- webpage = self._download_webpage(info_url, '')
info = self._download_json(info_url, video_id)
video_info = info['video']
diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py
index a3ca79f2c..543d94417 100644
--- a/youtube_dl/extractor/rtlnl.py
+++ b/youtube_dl/extractor/rtlnl.py
@@ -1,16 +1,25 @@
+# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import parse_duration
+from ..utils import (
+ int_or_none,
+ parse_duration,
+)
-class RtlXlIE(InfoExtractor):
- IE_NAME = 'rtlxl.nl'
- _VALID_URL = r'https?://(www\.)?rtlxl\.nl/#!/[^/]+/(?P<uuid>[^/?]+)'
+class RtlNlIE(InfoExtractor):
+ IE_NAME = 'rtl.nl'
+ IE_DESC = 'rtl.nl and rtlxl.nl'
+ _VALID_URL = r'''(?x)
+ https?://(?:www\.)?
+ (?:
+ rtlxl\.nl/\#!/[^/]+/|
+ rtl\.nl/system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html\b.+?\buuid=
+ )
+ (?P<id>[0-9a-f-]+)'''
- _TEST = {
+ _TESTS = [{
'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/6e4203a6-0a5e-3596-8424-c599a59e0677',
'md5': 'cc16baa36a6c169391f0764fa6b16654',
'info_dict': {
@@ -22,29 +31,72 @@ class RtlXlIE(InfoExtractor):
'upload_date': '20140814',
'duration': 576.880,
},
- }
+ }, {
+ 'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed/autoplay=false',
+ 'md5': 'dea7474214af1271d91ef332fb8be7ea',
+ 'info_dict': {
+ 'id': '84ae5571-ac25-4225-ae0c-ef8d9efb2aed',
+ 'ext': 'mp4',
+ 'timestamp': 1424039400,
+ 'title': 'RTL Nieuws - Nieuwe beelden Kopenhagen: chaos direct na aanslag',
+ 'thumbnail': 're:^https?://screenshots\.rtl\.nl/system/thumb/sz=[0-9]+x[0-9]+/uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed$',
+ 'upload_date': '20150215',
+ 'description': 'Er zijn nieuwe beelden vrijgegeven die vlak na de aanslag in Kopenhagen zijn gemaakt. Op de video is goed te zien hoe omstanders zich bekommeren om één van de slachtoffers, terwijl de eerste agenten ter plaatse komen.',
+ }
+ }, {
+ # empty synopsis and missing episodes (see https://github.com/rg3/youtube-dl/issues/6275)
+ 'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a/autoplay=false',
+ 'info_dict': {
+ 'id': 'f536aac0-1dc3-4314-920e-3bd1c5b3811a',
+ 'ext': 'mp4',
+ 'title': 'RTL Nieuws - Meer beelden van overval juwelier',
+ 'thumbnail': 're:^https?://screenshots\.rtl\.nl/system/thumb/sz=[0-9]+x[0-9]+/uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a$',
+ 'timestamp': 1437233400,
+ 'upload_date': '20150718',
+ 'duration': 30.474,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # encrypted m3u8 streams, georestricted
+ 'url': 'http://www.rtlxl.nl/#!/afl-2-257632/52a74543-c504-4cde-8aa8-ec66fe8d68a7',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.rtl.nl/system/videoplayer/derden/embed.html#!/uuid=bb0353b0-d6a4-1dad-90e9-18fe75b8d1f0',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- uuid = mobj.group('uuid')
-
+ uuid = self._match_id(url)
info = self._download_json(
- 'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=flash/' % uuid,
+ 'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=adaptive/' % uuid,
uuid)
material = info['material'][0]
- episode_info = info['episodes'][0]
+ title = info['abstracts'][0]['name']
+ subtitle = material.get('title')
+ if subtitle:
+ title += ' - %s' % subtitle
+ description = material.get('synopsis')
- progname = info['abstracts'][0]['name']
- subtitle = material['title'] or info['episodes'][0]['name']
+ meta = info.get('meta', {})
- # Use unencrypted m3u8 streams (See https://github.com/rg3/youtube-dl/issues/4118)
- videopath = material['videopath'].replace('.f4m', '.m3u8')
- m3u8_url = 'http://manifest.us.rtl.nl' + videopath
+ # m3u8 streams are encrypted and may not be handled properly by older ffmpeg/avconv.
+ # To workaround this previously adaptive -> flash trick was used to obtain
+ # unencrypted m3u8 streams (see https://github.com/rg3/youtube-dl/issues/4118)
+ # and bypass georestrictions as well.
+ # Currently, unencrypted m3u8 playlists are (intentionally?) invalid and therefore
+ # unusable albeit can be fixed by simple string replacement (see
+ # https://github.com/rg3/youtube-dl/pull/6337)
+ # Since recent ffmpeg and avconv handle encrypted streams just fine encrypted
+ # streams are used now.
+ videopath = material['videopath']
+ m3u8_url = meta.get('videohost', 'http://manifest.us.rtl.nl') + videopath
formats = self._extract_m3u8_formats(m3u8_url, uuid, ext='mp4')
- video_urlpart = videopath.split('/flash/')[1][:-5]
+ video_urlpart = videopath.split('/adaptive/')[1][:-5]
PG_URL_TEMPLATE = 'http://pg.us.rtl.nl/rtlxl/network/%s/progressive/%s.mp4'
formats.extend([
@@ -58,14 +110,29 @@ class RtlXlIE(InfoExtractor):
'quality': 0,
}
])
-
self._sort_formats(formats)
+ thumbnails = []
+
+ for p in ('poster_base_url', '"thumb_base_url"'):
+ if not meta.get(p):
+ continue
+
+ thumbnails.append({
+ 'url': self._proto_relative_url(meta[p] + uuid),
+ 'width': int_or_none(self._search_regex(
+ r'/sz=([0-9]+)', meta[p], 'thumbnail width', fatal=False)),
+ 'height': int_or_none(self._search_regex(
+ r'/sz=[0-9]+x([0-9]+)',
+ meta[p], 'thumbnail height', fatal=False))
+ })
+
return {
'id': uuid,
- 'title': '%s - %s' % (progname, subtitle),
+ 'title': title,
'formats': formats,
'timestamp': material['original_date'],
- 'description': episode_info['synopsis'],
+ 'description': description,
'duration': parse_duration(material.get('duration')),
+ 'thumbnails': thumbnails,
}
diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py
deleted file mode 100644
index fd93cc66f..000000000
--- a/youtube_dl/extractor/rtlnow.py
+++ /dev/null
@@ -1,174 +0,0 @@
-# encoding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
- clean_html,
- unified_strdate,
- int_or_none,
-)
-
-
-class RTLnowIE(InfoExtractor):
- """Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW, VOX NOW and n-tv NOW"""
- _VALID_URL = r'''(?x)
- (?:https?://)?
- (?P<url>
- (?P<domain>
- rtl-now\.rtl\.de|
- rtl2now\.rtl2\.de|
- (?:www\.)?voxnow\.de|
- (?:www\.)?rtlnitronow\.de|
- (?:www\.)?superrtlnow\.de|
- (?:www\.)?n-tvnow\.de)
- /+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?
- (?:container_id|film_id)=(?P<video_id>[0-9]+)&
- player=1(?:&season=[0-9]+)?(?:&.*)?
- )'''
-
- _TESTS = [
- {
- 'url': 'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1',
- 'info_dict': {
- 'id': '90419',
- 'ext': 'flv',
- 'title': 'Ahornallee - Folge 1 - Der Einzug',
- 'description': 'md5:ce843b6b5901d9a7f7d04d1bbcdb12de',
- 'upload_date': '20070416',
- 'duration': 1685,
- },
- 'params': {
- 'skip_download': True,
- },
- 'skip': 'Only works from Germany',
- },
- {
- 'url': 'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5',
- 'info_dict': {
- 'id': '69756',
- 'ext': 'flv',
- 'title': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit u.a.',
- 'description': 'md5:3fb247005ed21a935ffc82b7dfa70cf0',
- 'thumbnail': 'http://autoimg.static-fra.de/rtl2now/219850/1500x1500/image2.jpg',
- 'upload_date': '20120519',
- 'duration': 1245,
- },
- 'params': {
- 'skip_download': True,
- },
- 'skip': 'Only works from Germany',
- },
- {
- 'url': 'http://www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17',
- 'info_dict': {
- 'id': '13883',
- 'ext': 'flv',
- 'title': 'Voxtours - Südafrika-Reporter II',
- 'description': 'md5:de7f8d56be6fd4fed10f10f57786db00',
- 'upload_date': '20090627',
- 'duration': 1800,
- },
- 'params': {
- 'skip_download': True,
- },
- },
- {
- 'url': 'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1',
- 'info_dict': {
- 'id': '99205',
- 'ext': 'flv',
- 'title': 'Medicopter 117 - Angst!',
- 'description': 're:^Im Therapiezentrum \'Sonnalm\' kommen durch eine Unachtsamkeit die für die B.handlung mit Phobikern gehaltenen Voglespinnen frei\. Eine Ausreißerin',
- 'thumbnail': 'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg',
- 'upload_date': '20080928',
- 'duration': 2691,
- },
- 'params': {
- 'skip_download': True,
- },
- },
- {
- 'url': 'http://rtl-now.rtl.de/der-bachelor/folge-4.php?film_id=188729&player=1&season=5',
- 'info_dict': {
- 'id': '188729',
- 'ext': 'flv',
- 'upload_date': '20150204',
- 'description': 'md5:5e1ce23095e61a79c166d134b683cecc',
- 'title': 'Der Bachelor - Folge 4',
- }
- }, {
- 'url': 'http://www.n-tvnow.de/deluxe-alles-was-spass-macht/thema-ua-luxushotel-fuer-vierbeiner.php?container_id=153819&player=1&season=0',
- 'only_matching': True,
- },
- ]
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_page_url = 'http://%s/' % mobj.group('domain')
- video_id = mobj.group('video_id')
-
- webpage = self._download_webpage('http://' + mobj.group('url'), video_id)
-
- mobj = re.search(r'(?s)<div style="margin-left: 20px; font-size: 13px;">(.*?)<div id="playerteaser">', webpage)
- if mobj:
- raise ExtractorError(clean_html(mobj.group(1)), expected=True)
-
- title = self._og_search_title(webpage)
- description = self._og_search_description(webpage)
- thumbnail = self._og_search_thumbnail(webpage, default=None)
-
- upload_date = unified_strdate(self._html_search_meta('uploadDate', webpage, 'upload date'))
-
- mobj = re.search(r'<meta itemprop="duration" content="PT(?P<seconds>\d+)S" />', webpage)
- duration = int(mobj.group('seconds')) if mobj else None
-
- playerdata_url = self._html_search_regex(
- r"'playerdata': '(?P<playerdata_url>[^']+)'", webpage, 'playerdata_url')
-
- playerdata = self._download_xml(playerdata_url, video_id, 'Downloading player data XML')
-
- videoinfo = playerdata.find('./playlist/videoinfo')
-
- formats = []
- for filename in videoinfo.findall('filename'):
- mobj = re.search(r'(?P<url>rtmpe://(?:[^/]+/){2})(?P<play_path>.+)', filename.text)
- if mobj:
- fmt = {
- 'url': mobj.group('url'),
- 'play_path': 'mp4:' + mobj.group('play_path'),
- 'page_url': video_page_url,
- 'player_url': video_page_url + 'includes/vodplayer.swf',
- }
- else:
- mobj = re.search(r'.*/(?P<hoster>[^/]+)/videos/(?P<play_path>.+)\.f4m', filename.text)
- if mobj:
- fmt = {
- 'url': 'rtmpe://fmspay-fra2.rtl.de/' + mobj.group('hoster'),
- 'play_path': 'mp4:' + mobj.group('play_path'),
- 'page_url': url,
- 'player_url': video_page_url + 'includes/vodplayer.swf',
- }
- else:
- fmt = {
- 'url': filename.text,
- }
- fmt.update({
- 'width': int_or_none(filename.get('width')),
- 'height': int_or_none(filename.get('height')),
- 'vbr': int_or_none(filename.get('bitrate')),
- 'ext': 'flv',
- })
- formats.append(fmt)
-
- return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'upload_date': upload_date,
- 'duration': duration,
- 'formats': formats,
- }
diff --git a/youtube_dl/extractor/rtp.py b/youtube_dl/extractor/rtp.py
index ecf4939cd..82b323cdd 100644
--- a/youtube_dl/extractor/rtp.py
+++ b/youtube_dl/extractor/rtp.py
@@ -18,6 +18,10 @@ class RTPIE(InfoExtractor):
'description': 'As paixões musicais de António Cartaxo e António Macedo',
'thumbnail': 're:^https?://.*\.jpg',
},
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
}, {
'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas',
'only_matching': True,
diff --git a/youtube_dl/extractor/rts.py b/youtube_dl/extractor/rts.py
index d0981115d..12639f08b 100644
--- a/youtube_dl/extractor/rts.py
+++ b/youtube_dl/extractor/rts.py
@@ -19,7 +19,16 @@ from ..utils import (
class RTSIE(InfoExtractor):
IE_DESC = 'RTS.ch'
- _VALID_URL = r'https?://(?:www\.)?rts\.ch/(?:(?:[^/]+/){2,}(?P<id>[0-9]+)-(?P<display_id>.+?)\.html|play/tv/[^/]+/video/(?P<display_id_new>.+?)\?id=(?P<id_new>[0-9]+))'
+ _VALID_URL = r'''(?x)
+ (?:
+ rts:(?P<rts_id>\d+)|
+ https?://
+ (?:www\.)?rts\.ch/
+ (?:
+ (?:[^/]+/){2,}(?P<id>[0-9]+)-(?P<display_id>.+?)\.html|
+ play/tv/[^/]+/video/(?P<display_id_new>.+?)\?id=(?P<id_new>[0-9]+)
+ )
+ )'''
_TESTS = [
{
@@ -123,6 +132,15 @@ class RTSIE(InfoExtractor):
},
},
{
+ # article with videos on rhs
+ 'url': 'http://www.rts.ch/sport/hockey/6693917-hockey-davos-decroche-son-31e-titre-de-champion-de-suisse.html',
+ 'info_dict': {
+ 'id': '6693917',
+ 'title': 'Hockey: Davos décroche son 31e titre de champion de Suisse',
+ },
+ 'playlist_mincount': 5,
+ },
+ {
'url': 'http://www.rts.ch/play/tv/le-19h30/video/le-chantier-du-nouveau-parlement-vaudois-a-permis-une-trouvaille-historique?id=6348280',
'only_matching': True,
}
@@ -130,7 +148,7 @@ class RTSIE(InfoExtractor):
def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
- video_id = m.group('id') or m.group('id_new')
+ video_id = m.group('rts_id') or m.group('id') or m.group('id_new')
display_id = m.group('display_id') or m.group('display_id_new')
def download_json(internal_id):
@@ -143,6 +161,15 @@ class RTSIE(InfoExtractor):
# video_id extracted out of URL is not always a real id
if 'video' not in all_info and 'audio' not in all_info:
page = self._download_webpage(url, display_id)
+
+ # article with videos on rhs
+ videos = re.findall(
+ r'<article[^>]+class="content-item"[^>]*>\s*<a[^>]+data-video-urn="urn:rts:video:(\d+)"',
+ page)
+ if videos:
+ entries = [self.url_result('rts:%s' % video_urn, 'RTS') for video_urn in videos]
+ return self.playlist_result(entries, video_id, self._og_search_title(page))
+
internal_id = self._html_search_regex(
r'<(?:video|audio) data-id="([0-9]+)"', page,
'internal video id')
@@ -190,6 +217,7 @@ class RTSIE(InfoExtractor):
'tbr': media['rate'] or extract_bitrate(media['url']),
} for media in info['media'] if media.get('rate')])
+ self._check_formats(formats, video_id)
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py
index 3469d9578..603d7bd00 100644
--- a/youtube_dl/extractor/rtve.py
+++ b/youtube_dl/extractor/rtve.py
@@ -7,13 +7,17 @@ import time
from .common import InfoExtractor
from ..utils import (
- struct_unpack,
+ ExtractorError,
+ float_or_none,
remove_end,
+ sanitized_Request,
+ std_headers,
+ struct_unpack,
)
def _decrypt_url(png):
- encrypted_data = base64.b64decode(png)
+ encrypted_data = base64.b64decode(png.encode('utf-8'))
text_index = encrypted_data.find(b'tEXt')
text_chunk = encrypted_data[text_index - 4:]
length = struct_unpack('!I', text_chunk[:4])[0]
@@ -66,6 +70,7 @@ class RTVEALaCartaIE(InfoExtractor):
'id': '2491869',
'ext': 'mp4',
'title': 'Balonmano - Swiss Cup masculina. Final: España-Suecia',
+ 'duration': 5024.566,
},
}, {
'note': 'Live stream',
@@ -81,27 +86,34 @@ class RTVEALaCartaIE(InfoExtractor):
'only_matching': True,
}]
+    def _real_initialize(self):
+        """Fetch the RTVE "manager" value and keep it in ``self._manager``.
+
+        The request path ends with the base64 encoding of the default
+        User-Agent header taken from ``std_headers``.
+        """
+        encoded_agent = base64.b64encode(
+            std_headers['User-Agent'].encode('utf-8')).decode('utf-8')
+        loki_url = 'http://www.rtve.es/odin/loki/' + encoded_agent
+        self._manager = self._download_json(
+            loki_url, None, 'Fetching manager info')['manager']
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
info = self._download_json(
'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id,
video_id)['page']['items'][0]
- png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % video_id
- png = self._download_webpage(png_url, video_id, 'Downloading url information')
+ if info['state'] == 'DESPU':
+ raise ExtractorError('The video is no longer available', expected=True)
+ png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id)
+ png_request = sanitized_Request(png_url)
+ png_request.add_header('Referer', url)
+ png = self._download_webpage(png_request, video_id, 'Downloading url information')
video_url = _decrypt_url(png)
if not video_url.endswith('.f4m'):
- auth_url = video_url.replace(
+ video_url = video_url.replace(
'resources/', 'auth/resources/'
).replace('.net.rtve', '.multimedia.cdn.rtve')
- video_path = self._download_webpage(
- auth_url, video_id, 'Getting video url')
- # Use mvod.akcdn instead of flash.akamaihd.multimedia.cdn to get
- # the right Content-Length header and the mp4 format
- video_url = (
- 'http://mvod.akcdn.rtve.es/{0}&v=2.6.8'
- '&fp=MAC%2016,0,0,296&r=MRUGG&g=OEOJWFXNFGCP'.format(video_path)
- )
+
+ subtitles = None
+ if info.get('sbtFile') is not None:
+ subtitles = self.extract_subtitles(video_id, info['sbtFile'])
return {
'id': video_id,
@@ -109,6 +121,57 @@ class RTVEALaCartaIE(InfoExtractor):
'url': video_url,
'thumbnail': info.get('image'),
'page_url': url,
+ 'subtitles': subtitles,
+ 'duration': float_or_none(info.get('duration'), scale=1000),
+ }
+
+    def _get_subtitles(self, video_id, sub_file):
+        """Build the subtitles dict from the JSON listing at ``sub_file + '.json'``.
+
+        Each listed item contributes one entry: its 'lang' maps to a
+        single-element list holding its 'src' as a vtt URL.
+        """
+        listing = self._download_json(
+            sub_file + '.json', video_id, 'Downloading subtitles info')
+        subtitles = {}
+        for item in listing['page']['items']:
+            lang = item['lang']
+            subtitles[lang] = [{'ext': 'vtt', 'url': item['src']}]
+        return subtitles
+
+
+class RTVEInfantilIE(InfoExtractor):
+    """Extractor for rtve.es/infantil series video pages."""
+    IE_NAME = 'rtve.es:infantil'
+    IE_DESC = 'RTVE infantil'
+    _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/(?P<show>[^/]*)/video/(?P<short_title>[^/]*)/(?P<id>[0-9]+)/'
+
+    _TESTS = [{
+        'url': 'http://www.rtve.es/infantil/serie/cleo/video/maneras-vivir/3040283/',
+        'md5': '915319587b33720b8e0357caaa6617e6',
+        'info_dict': {
+            'id': '3040283',
+            'ext': 'mp4',
+            'title': 'Maneras de vivir',
+            'thumbnail': 'http://www.rtve.es/resources/jpg/6/5/1426182947956.JPG',
+            'duration': 357.958,
+        },
+    }]
+
+    def _real_extract(self, url):
+        """Return the info dict for the infantil video page at ``url``.
+
+        Metadata comes from the alacarta JSON config; the media URL is
+        decoded from a PNG addressed by the player id found in the page.
+        """
+        video_id = self._match_id(url)
+        config_url = 'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id
+        item = self._download_json(config_url, video_id)['page']['items'][0]
+
+        page_html = self._download_webpage(url, video_id)
+        player_id = self._search_regex(
+            r' id="vidplayer([0-9]+)"', page_html, 'internal video ID')
+
+        # The PNG is keyed by the in-page player id, not by the URL's id.
+        encoded_png = self._download_webpage(
+            'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % player_id,
+            video_id, 'Downloading url information')
+        media_url = _decrypt_url(encoded_png)
+
+        return {
+            'id': video_id,
+            'ext': 'mp4',
+            'title': item['title'],
+            'url': media_url,
+            'thumbnail': item.get('image'),
+            'duration': float_or_none(item.get('duration'), scale=1000),
         }
diff --git a/youtube_dl/extractor/rtvnh.py b/youtube_dl/extractor/rtvnh.py
new file mode 100644
index 000000000..7c9d4b0cd
--- /dev/null
+++ b/youtube_dl/extractor/rtvnh.py
@@ -0,0 +1,47 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class RTVNHIE(InfoExtractor):
+    """Extractor for rtvnh.nl video pages."""
+    _VALID_URL = r'https?://(?:www\.)?rtvnh\.nl/video/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.rtvnh.nl/video/131946',
+        'md5': '6e1d0ab079e2a00b6161442d3ceacfc1',
+        'info_dict': {
+            'id': '131946',
+            'ext': 'mp4',
+            'title': 'Grote zoektocht in zee bij Zandvoort naar vermiste vrouw',
+            'thumbnail': 're:^https?:.*\.jpg$'
+        }
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict for the video addressed by ``url``.
+
+        Formats come from the SMIL manifest (non-fatal) plus the entries of
+        ``meta['source']['fb']``: ``hls`` entries are expanded through their
+        m3u8 playlist, entries with an empty type are used as direct URLs.
+
+        Raises ExtractorError (expected) when the metadata JSON does not
+        report status 200.
+        """
+        video_id = self._match_id(url)
+
+        meta = self._parse_json(self._download_webpage(
+            'http://www.rtvnh.nl/video/json?m=' + video_id, video_id), video_id)
+
+        status = meta.get('status')
+        if status != 200:
+            # %s, not %d: a response without a "status" key gives None here,
+            # and %d would raise TypeError instead of this error.
+            raise ExtractorError(
+                '%s returned error code %s' % (self.IE_NAME, status), expected=True)
+
+        formats = self._extract_smil_formats(
+            'http://www.rtvnh.nl/video/smil?m=' + video_id, video_id, fatal=False)
+
+        for item in meta['source']['fb']:
+            if item.get('type') == 'hls':
+                formats.extend(self._extract_m3u8_formats(
+                    item['file'], video_id, ext='mp4', entry_protocol='m3u8_native'))
+            elif item.get('type') == '':
+                formats.append({'url': item['file']})
+
+        return {
+            'id': video_id,
+            'title': meta['title'].strip(),
+            'thumbnail': meta.get('image'),
+            'formats': formats
+        }
diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py
index 5b1c3577a..6b09550b0 100644
--- a/youtube_dl/extractor/rutube.py
+++ b/youtube_dl/extractor/rutube.py
@@ -9,7 +9,7 @@ from ..compat import (
compat_str,
)
from ..utils import (
- ExtractorError,
+ determine_ext,
unified_strdate,
)
@@ -30,6 +30,7 @@ class RutubeIE(InfoExtractor):
'uploader': 'NTDRussian',
'uploader_id': '29790',
'upload_date': '20131016',
+ 'age_limit': 0,
},
'params': {
# It requires ffmpeg (m3u8 download)
@@ -50,10 +51,25 @@ class RutubeIE(InfoExtractor):
'http://rutube.ru/api/play/options/%s/?format=json' % video_id,
video_id, 'Downloading options JSON')
- m3u8_url = options['video_balancer'].get('m3u8')
- if m3u8_url is None:
- raise ExtractorError('Couldn\'t find m3u8 manifest url')
- formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')
+ formats = []
+ for format_id, format_url in options['video_balancer'].items():
+ ext = determine_ext(format_url)
+ if ext == 'm3u8':
+ m3u8_formats = self._extract_m3u8_formats(
+ format_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)
+ if m3u8_formats:
+ formats.extend(m3u8_formats)
+ elif ext == 'f4m':
+ f4m_formats = self._extract_f4m_formats(
+ format_url, video_id, f4m_id=format_id, fatal=False)
+ if f4m_formats:
+ formats.extend(f4m_formats)
+ else:
+ formats.append({
+ 'url': format_url,
+ 'format_id': format_id,
+ })
+ self._sort_formats(formats)
return {
'id': video['id'],
@@ -73,9 +89,9 @@ class RutubeIE(InfoExtractor):
class RutubeEmbedIE(InfoExtractor):
IE_NAME = 'rutube:embed'
IE_DESC = 'Rutube embedded videos'
- _VALID_URL = 'https?://rutube\.ru/video/embed/(?P<id>[0-9]+)'
+ _VALID_URL = 'https?://rutube\.ru/(?:video|play)/embed/(?P<id>[0-9]+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=',
'info_dict': {
'id': 'a10e53b86e8f349080f718582ce4c661',
@@ -89,7 +105,10 @@ class RutubeEmbedIE(InfoExtractor):
'params': {
'skip_download': 'Requires ffmpeg',
},
- }
+ }, {
+ 'url': 'http://rutube.ru/play/embed/8083783',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
embed_id = self._match_id(url)
diff --git a/youtube_dl/extractor/rutv.py b/youtube_dl/extractor/rutv.py
index ef766237b..d9df06861 100644
--- a/youtube_dl/extractor/rutv.py
+++ b/youtube_dl/extractor/rutv.py
@@ -84,18 +84,27 @@ class RUTVIE(InfoExtractor):
'title': 'Сочи-2014. Биатлон. Индивидуальная гонка. Мужчины ',
'description': 'md5:9e0ed5c9d2fa1efbfdfed90c9a6d179c',
},
+ 'skip': 'Translation has finished',
+ },
+ {
+ 'url': 'http://player.rutv.ru/iframe/live/id/21/showZoomBtn/false/isPlay/true/',
+ 'info_dict': {
+ 'id': '21',
+ 'ext': 'mp4',
+ 'title': 're:^Россия 24. Прямой эфир [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'is_live': True,
+ },
'params': {
- # rtmp download
+ # m3u8 download
'skip_download': True,
},
- 'skip': 'Translation has finished',
},
]
@classmethod
def _extract_url(cls, webpage):
mobj = re.search(
- r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.rutv\.ru/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage)
+ r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage)
if mobj:
return mobj.group('url')
@@ -119,8 +128,10 @@ class RUTVIE(InfoExtractor):
elif video_path.startswith('index/iframe/cast_id'):
video_type = 'live'
+ is_live = video_type == 'live'
+
json_data = self._download_json(
- 'http://player.rutv.ru/iframe/%splay/id/%s' % ('live-' if video_type == 'live' else '', video_id),
+ 'http://player.rutv.ru/iframe/%splay/id/%s' % ('live-' if is_live else '', video_id),
video_id, 'Downloading JSON')
if json_data['errors']:
@@ -147,6 +158,7 @@ class RUTVIE(InfoExtractor):
for transport, links in media['sources'].items():
for quality, url in links.items():
+ preference = -1 if priority_transport == transport else -2
if transport == 'rtmp':
mobj = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>.+)$', url)
if not mobj:
@@ -160,9 +172,11 @@ class RUTVIE(InfoExtractor):
'rtmp_live': True,
'ext': 'flv',
'vbr': int(quality),
+ 'preference': preference,
}
elif transport == 'm3u8':
- formats.extend(self._extract_m3u8_formats(url, video_id, 'mp4'))
+ formats.extend(self._extract_m3u8_formats(
+ url, video_id, 'mp4', preference=preference, m3u8_id='hls'))
continue
else:
fmt = {
@@ -172,21 +186,18 @@ class RUTVIE(InfoExtractor):
'width': width,
'height': height,
'format_id': '%s-%s' % (transport, quality),
- 'preference': -1 if priority_transport == transport else -2,
})
formats.append(fmt)
- if not formats:
- raise ExtractorError('No media links available for %s' % video_id)
-
self._sort_formats(formats)
return {
'id': video_id,
- 'title': title,
+ 'title': self._live_title(title) if is_live else title,
'description': description,
'thumbnail': thumbnail,
'view_count': view_count,
'duration': duration,
'formats': formats,
+ 'is_live': is_live,
}
diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py
new file mode 100644
index 000000000..e417bf661
--- /dev/null
+++ b/youtube_dl/extractor/ruutu.py
@@ -0,0 +1,103 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_urlparse
+from ..utils import (
+ determine_ext,
+ int_or_none,
+ xpath_attr,
+ xpath_text,
+)
+
+
+class RuutuIE(InfoExtractor):
+    """Extractor for ruutu.fi video pages."""
+    _VALID_URL = r'https?://(?:www\.)?ruutu\.fi/video/(?P<id>\d+)'
+    _TESTS = [
+        {
+            'url': 'http://www.ruutu.fi/video/2058907',
+            'md5': 'ab2093f39be1ca8581963451b3c0234f',
+            'info_dict': {
+                'id': '2058907',
+                'ext': 'mp4',
+                'title': 'Oletko aina halunnut tietää mitä tapahtuu vain hetki ennen lähetystä? - Nyt se selvisi!',
+                'description': 'md5:cfc6ccf0e57a814360df464a91ff67d6',
+                'thumbnail': 're:^https?://.*\.jpg$',
+                'duration': 114,
+                'age_limit': 0,
+            },
+        },
+        {
+            'url': 'http://www.ruutu.fi/video/2057306',
+            'md5': '065a10ae4d5b8cfd9d0c3d332465e3d9',
+            'info_dict': {
+                'id': '2057306',
+                'ext': 'mp4',
+                'title': 'Superpesis: katso koko kausi Ruudussa',
+                'description': 'md5:da2736052fef3b2bd5e0005e63c25eac',
+                'thumbnail': 're:^https?://.*\.jpg$',
+                'duration': 40,
+                'age_limit': 0,
+            },
+        },
+    ]
+
+    def _real_extract(self, url):
+        """Return the info dict for the video addressed by ``url``.
+
+        All metadata and media URLs are read from the media XML document
+        downloaded for ``video_id``.
+        """
+        video_id = self._match_id(url)
+
+        video_xml = self._download_xml(
+            'http://gatling.ruutu.fi/media-xml-cache?id=%s' % video_id, video_id)
+
+        formats = []
+        # URLs already turned into formats; the same URL may appear under several nodes.
+        processed_urls = []
+
+        def extract_formats(node):
+            """Append a format for every usable ``*File`` element below ``node``."""
+            for child in node:
+                if child.tag.endswith('Files'):
+                    # Container element: descend into it.
+                    extract_formats(child)
+                elif child.tag.endswith('File'):
+                    video_url = child.text
+                    if (not video_url or video_url in processed_urls or
+                            any(p in video_url for p in ('NOT_USED', 'NOT-USED'))):
+                        # Skip only this entry. Returning here would drop every
+                        # remaining sibling as soon as one empty, duplicate or
+                        # placeholder URL is met.
+                        continue
+                    processed_urls.append(video_url)
+                    ext = determine_ext(video_url)
+                    if ext == 'm3u8':
+                        m3u8_formats = self._extract_m3u8_formats(
+                            video_url, video_id, 'mp4', m3u8_id='hls', fatal=False)
+                        if m3u8_formats:
+                            formats.extend(m3u8_formats)
+                    elif ext == 'f4m':
+                        f4m_formats = self._extract_f4m_formats(
+                            video_url, video_id, f4m_id='hds', fatal=False)
+                        if f4m_formats:
+                            formats.extend(f4m_formats)
+                    else:
+                        proto = compat_urllib_parse_urlparse(video_url).scheme
+                        # Direct files are kept only for HTTP* elements or rtmp URLs.
+                        if not child.tag.startswith('HTTP') and proto != 'rtmp':
+                            continue
+                        preference = -1 if proto == 'rtmp' else 1
+                        label = child.get('label')
+                        tbr = int_or_none(child.get('bitrate'))
+                        width, height = [int_or_none(x) for x in child.get('resolution', 'x').split('x')[:2]]
+                        formats.append({
+                            'format_id': '%s-%s' % (proto, label if label else tbr),
+                            'url': video_url,
+                            'width': width,
+                            'height': height,
+                            'tbr': tbr,
+                            'preference': preference,
+                        })
+
+        extract_formats(video_xml.find('./Clip'))
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': xpath_attr(video_xml, './/Behavior/Program', 'program_name', 'title', fatal=True),
+            'description': xpath_attr(video_xml, './/Behavior/Program', 'description', 'description'),
+            'thumbnail': xpath_attr(video_xml, './/Behavior/Startpicture', 'href', 'thumbnail'),
+            'duration': int_or_none(xpath_text(video_xml, './/Runtime', 'duration')),
+            'age_limit': int_or_none(xpath_text(video_xml, './/AgeLimit', 'age limit')),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py
new file mode 100644
index 000000000..919704261
--- /dev/null
+++ b/youtube_dl/extractor/safari.py
@@ -0,0 +1,156 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .brightcove import BrightcoveLegacyIE
+
+from ..compat import compat_urllib_parse
+from ..utils import (
+ ExtractorError,
+ sanitized_Request,
+ smuggle_url,
+ std_headers,
+)
+
+
+class SafariBaseIE(InfoExtractor):
+    """Shared login handling for the safaribooksonline.com extractors."""
+    _LOGIN_URL = 'https://www.safaribooksonline.com/accounts/login/'
+    _SUCCESSFUL_LOGIN_REGEX = r'<a href="/accounts/logout/"[^>]*>Sign Out</a>'
+    _NETRC_MACHINE = 'safari'
+
+    _API_BASE = 'https://www.safaribooksonline.com/api/v1/book'
+    _API_FORMAT = 'json'
+
+    # Stored on SafariBaseIE itself, so every subclass sees the same flag.
+    LOGGED_IN = False
+
+    def _real_initialize(self):
+        """Log in unless an earlier extractor instance already did."""
+        # We only need to log in once for courses or individual videos
+        if not self.LOGGED_IN:
+            self._login()
+            SafariBaseIE.LOGGED_IN = True
+
+    def _login(self):
+        """Submit the login form with the configured credentials.
+
+        Raises through ``raise_login_required`` when no username is
+        configured, and ExtractorError (expected) when the response page
+        does not contain the sign-out link.
+        """
+        (username, password) = self._get_login_info()
+        if username is None:
+            self.raise_login_required('safaribooksonline.com account is required')
+
+        # Work on a copy: std_headers is a shared module-level dict, and adding
+        # Referer to it directly would leak into every later request.
+        headers = std_headers.copy()
+        if 'Referer' not in headers:
+            headers['Referer'] = self._LOGIN_URL
+
+        login_page = self._download_webpage(
+            self._LOGIN_URL, None,
+            'Downloading login form')
+
+        csrf = self._html_search_regex(
+            r"name='csrfmiddlewaretoken'\s+value='([^']+)'",
+            login_page, 'csrf token')
+
+        login_form = {
+            'csrfmiddlewaretoken': csrf,
+            'email': username,
+            'password1': password,
+            'login': 'Sign In',
+            'next': '',
+        }
+
+        # POST data must be bytes on Python 3; the urlencoded form is plain
+        # ASCII, so encoding it is a no-op on Python 2.
+        request = sanitized_Request(
+            self._LOGIN_URL,
+            compat_urllib_parse.urlencode(login_form).encode('utf-8'),
+            headers=headers)
+        login_page = self._download_webpage(
+            request, None, 'Logging in as %s' % username)
+
+        if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None:
+            raise ExtractorError(
+                'Login failed; make sure your credentials are correct and try again.',
+                expected=True)
+
+        self.to_screen('Login successful')
+
+
+class SafariIE(SafariBaseIE):
+    """Extractor for a single video part of a safaribooksonline.com course."""
+    IE_NAME = 'safari'
+    IE_DESC = 'safaribooksonline.com online video'
+    _VALID_URL = r'''(?x)https?://
+                            (?:www\.)?safaribooksonline\.com/
+                                (?:
+                                    library/view/[^/]+|
+                                    api/v1/book
+                                )/
+                                (?P<course_id>[^/]+)/
+                                (?:chapter(?:-content)?/)?
+                                (?P<part>part\d+)\.html
+    '''
+
+    _TESTS = [{
+        'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html',
+        'md5': '5b0c4cc1b3c1ba15dda7344085aa5592',
+        'info_dict': {
+            'id': '2842601850001',
+            'ext': 'mp4',
+            'title': 'Introduction',
+        },
+        'skip': 'Requires safaribooksonline account credentials',
+    }, {
+        'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html',
+        'only_matching': True,
+    }, {
+        # non-digits in course id
+        'url': 'https://www.safaribooksonline.com/library/view/create-a-nodejs/100000006A0210/part00.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        """Hand the part's embedded Brightcove video over to BrightcoveLegacy.
+
+        The chapter content is fetched through the API, the Brightcove URL is
+        located in it, and the original ``url`` is smuggled along as Referer.
+        Raises ExtractorError (expected) when no Brightcove URL is found.
+        """
+        course_id, part = re.match(self._VALID_URL, url).group('course_id', 'part')
+
+        chapter_url = '%s/%s/chapter-content/%s.html' % (self._API_BASE, course_id, part)
+        chapter_page = self._download_webpage(chapter_url, part)
+
+        brightcove_url = BrightcoveLegacyIE._extract_brightcove_url(chapter_page)
+        if brightcove_url:
+            return self.url_result(
+                smuggle_url(brightcove_url, {'Referer': url}), 'BrightcoveLegacy')
+
+        raise ExtractorError('Could not extract Brightcove URL from %s' % url, expected=True)
+
+
+class SafariCourseIE(SafariBaseIE):
+    """Extractor that turns a safaribooksonline.com course into a playlist."""
+    IE_NAME = 'safari:course'
+    IE_DESC = 'safaribooksonline.com online courses'
+
+    _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)/(?P<id>[^/]+)/?(?:[#?]|$)'
+
+    _TESTS = [{
+        'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/',
+        'info_dict': {
+            'id': '9780133392838',
+            'title': 'Hadoop Fundamentals LiveLessons',
+        },
+        'playlist_count': 22,
+        'skip': 'Requires safaribooksonline account credentials',
+    }, {
+        'url': 'https://www.safaribooksonline.com/api/v1/book/9781449396459/?override_format=json',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        """Return a playlist with one Safari entry per chapter of the course.
+
+        Raises ExtractorError (expected) when the course JSON has no
+        ``chapters`` key.
+        """
+        course_id = self._match_id(url)
+
+        api_url = '%s/%s/?override_format=%s' % (
+            self._API_BASE, course_id, self._API_FORMAT)
+        course = self._download_json(api_url, course_id, 'Downloading course JSON')
+
+        if 'chapters' not in course:
+            raise ExtractorError(
+                'No chapters found for course %s' % course_id, expected=True)
+
+        # Each chapter value is passed on as a URL for the Safari extractor.
+        entries = []
+        for chapter_url in course['chapters']:
+            entries.append(self.url_result(chapter_url, 'Safari'))
+
+        return self.playlist_result(entries, course_id, course['title'])
diff --git a/youtube_dl/extractor/sandia.py b/youtube_dl/extractor/sandia.py
new file mode 100644
index 000000000..759898a49
--- /dev/null
+++ b/youtube_dl/extractor/sandia.py
@@ -0,0 +1,115 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+import json
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+ int_or_none,
+ js_to_json,
+ mimetype2ext,
+ sanitized_Request,
+ unified_strdate,
+)
+
+
+class SandiaIE(InfoExtractor):
+    """Extractor for Mediasite presentations on digitalops.sandia.gov."""
+    IE_DESC = 'Sandia National Laboratories'
+    _VALID_URL = r'https?://digitalops\.sandia\.gov/Mediasite/Play/(?P<id>[0-9a-f]+)'
+    _TEST = {
+        'url': 'http://digitalops.sandia.gov/Mediasite/Play/24aace4429fc450fb5b38cdbf424a66e1d',
+        'md5': '9422edc9b9a60151727e4b6d8bef393d',
+        'info_dict': {
+            'id': '24aace4429fc450fb5b38cdbf424a66e1d',
+            'ext': 'mp4',
+            'title': 'Xyce Software Training - Section 1',
+            'description': 're:(?s)SAND Number: SAND 2013-7800.{200,}',
+            'upload_date': '20120904',
+            'duration': 7794,
+        }
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict for the presentation addressed by ``url``.
+
+        Everything is read from ``Mediasite.PlaybackManifest.<key> = ...;``
+        assignments in the player JavaScript: the video URLs, the slide list
+        (exposed as one extra ``slides`` format) and the metadata fields.
+        """
+        video_id = self._match_id(url)
+
+        req = sanitized_Request(url)
+        req.add_header('Cookie', 'MediasitePlayerCaps=ClientPlugins=4')
+        webpage = self._download_webpage(req, video_id)
+
+        js_path = self._search_regex(
+            r'<script type="text/javascript" src="(/Mediasite/FileServer/Presentation/[^"]+)"',
+            webpage, 'JS code URL')
+        js_url = compat_urlparse.urljoin(url, js_path)
+
+        js_code = self._download_webpage(
+            js_url, video_id, note='Downloading player')
+
+        def extract_str(key, **args):
+            """Return the raw right-hand side assigned to manifest ``key``."""
+            return self._search_regex(
+                r'Mediasite\.PlaybackManifest\.%s\s*=\s*(.+);\s*?\n' % re.escape(key),
+                js_code, key, **args)
+
+        def extract_data(key, **args):
+            """Return manifest ``key`` parsed as JSON, or None if it was not found."""
+            data_json = extract_str(key, **args)
+            if data_json is None:
+                return data_json
+            return self._parse_json(
+                data_json, video_id, transform_source=js_to_json)
+
+        formats = []
+        # VideoUrls[0], VideoUrls[1], ... are probed until one is missing.
+        for i in itertools.count():
+            fd = extract_data('VideoUrls[%d]' % i, default=None)
+            if fd is None:
+                break
+            formats.append({
+                'format_id': '%s' % i,
+                'format_note': fd['MimeType'].partition('/')[2],
+                'ext': mimetype2ext(fd['MimeType']),
+                'url': fd['Location'],
+                'protocol': 'f4m' if fd['MimeType'] == 'video/x-mp4-fragmented' else None,
+            })
+        # Sorted here, before the slideshow pseudo-format is added, and again below.
+        self._sort_formats(formats)
+
+        slide_baseurl = compat_urlparse.urljoin(
+            url, extract_data('SlideBaseUrl'))
+        # Turns a "{0:D4}"-style placeholder into the "%04d" form used below.
+        # NOTE(review): [0-9+] matches one character; [0-9]+ may have been meant -- confirm.
+        slide_template = slide_baseurl + re.sub(
+            r'\{0:D?([0-9+])\}', r'%0\1d', extract_data('SlideImageFileNameTemplate'))
+        slides = []
+        last_slide_time = 0
+        # Slides are numbered from 1 and probed until one is missing.
+        for i in itertools.count(1):
+            sd = extract_str('Slides[%d]' % i, default=None)
+            if sd is None:
+                break
+            timestamp = int_or_none(self._search_regex(
+                r'^Mediasite\.PlaybackManifest\.CreateSlide\("[^"]*"\s*,\s*([0-9]+),',
+                sd, 'slide %s timestamp' % i, fatal=False))
+            if timestamp is None:
+                # The timestamp lookup is non-fatal, so it may be missing:
+                # leave the duration unknown and keep the previous reference
+                # time rather than failing on None arithmetic.
+                duration = None
+            else:
+                duration = timestamp - last_slide_time
+                last_slide_time = timestamp
+            slides.append({
+                'url': slide_template % i,
+                'duration': duration,
+            })
+        formats.append({
+            'format_id': 'slides',
+            'protocol': 'slideshow',
+            'url': json.dumps(slides),
+            'preference': -10000,  # Downloader not yet written
+        })
+        self._sort_formats(formats)
+
+        title = extract_data('Title')
+        description = extract_data('Description', fatal=False)
+        duration = int_or_none(extract_data(
+            'Duration', fatal=False), scale=1000)
+        upload_date = unified_strdate(extract_data('AirDate', fatal=False))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'formats': formats,
+            'upload_date': upload_date,
+            'duration': duration,
+        }
diff --git a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py
index b8775c2f9..d6ee2d9e2 100644
--- a/youtube_dl/extractor/sbs.py
+++ b/youtube_dl/extractor/sbs.py
@@ -1,18 +1,12 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
-import json
-import re
from .common import InfoExtractor
-from ..utils import (
- js_to_json,
- remove_end,
-)
class SBSIE(InfoExtractor):
IE_DESC = 'sbs.com.au'
- _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/ondemand/video/(?:single/)?(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand|news)/video/(?:single/)?(?P<id>[0-9]+)'
_TESTS = [{
# Original URL is handled by the generic IE which finds the iframe:
@@ -22,38 +16,36 @@ class SBSIE(InfoExtractor):
'info_dict': {
'id': '320403011771',
'ext': 'mp4',
- 'title': 'Dingo Conservation',
- 'description': 'Dingoes are on the brink of extinction; most of the animals we think are dingoes are in fact crossbred with wild dogs. This family run a dingo conservation park to prevent their extinction',
+ 'title': 'Dingo Conservation (The Feed)',
+ 'description': 'md5:f250a9856fca50d22dec0b5b8015f8a5',
'thumbnail': 're:http://.*\.jpg',
+ 'duration': 308,
},
- 'add_ies': ['generic'],
}, {
'url': 'http://www.sbs.com.au/ondemand/video/320403011771/Dingo-Conservation-The-Feed',
'only_matching': True,
+ }, {
+ 'url': 'http://www.sbs.com.au/news/video/471395907773/The-Feed-July-9',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- webpage = self._download_webpage(url, video_id)
+ video_id = self._match_id(url)
- release_urls_json = js_to_json(self._search_regex(
- r'(?s)playerParams\.releaseUrls\s*=\s*(\{.*?\n\});\n',
- webpage, ''))
- release_urls = json.loads(release_urls_json)
- theplatform_url = (
- release_urls.get('progressive') or release_urls.get('standard'))
+ webpage = self._download_webpage(
+ 'http://www.sbs.com.au/ondemand/video/single/%s?context=web' % video_id, video_id)
- title = remove_end(self._og_search_title(webpage), ' (The Feed)')
- description = self._html_search_meta('description', webpage)
- thumbnail = self._og_search_thumbnail(webpage)
+ player_params = self._parse_json(
+ self._search_regex(
+ r'(?s)var\s+playerParams\s*=\s*({.+?});', webpage, 'playerParams'),
+ video_id)
+
+ urls = player_params['releaseUrls']
+ theplatform_url = (urls.get('progressive') or urls.get('standard') or
+ urls.get('html') or player_params['relatedItemsURL'])
return {
'_type': 'url_transparent',
'id': video_id,
'url': theplatform_url,
-
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
}
diff --git a/youtube_dl/extractor/screenwavemedia.py b/youtube_dl/extractor/screenwavemedia.py
index 6c9fdb7c1..05f93904c 100644
--- a/youtube_dl/extractor/screenwavemedia.py
+++ b/youtube_dl/extractor/screenwavemedia.py
@@ -7,12 +7,13 @@ from .common import InfoExtractor
from ..utils import (
int_or_none,
unified_strdate,
+ js_to_json,
)
class ScreenwaveMediaIE(InfoExtractor):
- _VALID_URL = r'http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=(?P<id>.+)'
-
+ _VALID_URL = r'https?://player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?.*\bid=(?P<id>[A-Za-z0-9-]+)'
+ EMBED_PATTERN = r'src=(["\'])(?P<url>(?:https?:)?//player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?.*\bid=.+?)\1'
_TESTS = [{
'url': 'http://player.screenwavemedia.com/play/play.php?playerdiv=videoarea&companiondiv=squareAd&id=Cinemassacre-19911',
'only_matching': True,
@@ -20,58 +21,73 @@ class ScreenwaveMediaIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- playerdata = self._download_webpage(url, video_id, 'Downloading player webpage')
+
+ playerdata = self._download_webpage(
+ 'http://player.screenwavemedia.com/player.php?id=%s' % video_id,
+ video_id, 'Downloading player webpage')
vidtitle = self._search_regex(
r'\'vidtitle\'\s*:\s*"([^"]+)"', playerdata, 'vidtitle').replace('\\/', '/')
- vidurl = self._search_regex(
- r'\'vidurl\'\s*:\s*"([^"]+)"', playerdata, 'vidurl').replace('\\/', '/')
-
- videolist_url = None
-
- mobj = re.search(r"'videoserver'\s*:\s*'(?P<videoserver>[^']+)'", playerdata)
- if mobj:
- videoserver = mobj.group('videoserver')
- mobj = re.search(r'\'vidid\'\s*:\s*"(?P<vidid>[^\']+)"', playerdata)
- vidid = mobj.group('vidid') if mobj else video_id
- videolist_url = 'http://%s/vod/smil:%s.smil/jwplayer.smil' % (videoserver, vidid)
- else:
- mobj = re.search(r"file\s*:\s*'(?P<smil>http.+?/jwplayer\.smil)'", playerdata)
- if mobj:
- videolist_url = mobj.group('smil')
-
- if videolist_url:
- videolist = self._download_xml(videolist_url, video_id, 'Downloading videolist XML')
- formats = []
- baseurl = vidurl[:vidurl.rfind('/') + 1]
- for video in videolist.findall('.//video'):
- src = video.get('src')
- if not src:
+
+ playerconfig = self._download_webpage(
+ 'http://player.screenwavemedia.com/player.js',
+ video_id, 'Downloading playerconfig webpage')
+
+ videoserver = self._search_regex(r'SWMServer\s*=\s*"([\d\.]+)"', playerdata, 'videoserver')
+
+ sources = self._parse_json(
+ js_to_json(
+ re.sub(
+ r'(?s)/\*.*?\*/', '',
+ self._search_regex(
+ r"sources\s*:\s*(\[[^\]]+?\])", playerconfig,
+ 'sources',
+ ).replace(
+ "' + thisObj.options.videoserver + '",
+ videoserver
+ ).replace(
+ "' + playerVidId + '",
+ video_id
+ )
+ )
+ ),
+ video_id, fatal=False
+ )
+
+ # Fallback to hardcoded sources if JS changes again
+ if not sources:
+ self.report_warning('Falling back to a hardcoded list of streams')
+ sources = [{
+ 'file': 'http://%s/vod/%s_%s.mp4' % (videoserver, video_id, format_id),
+ 'type': 'mp4',
+ 'label': format_label,
+ } for format_id, format_label in (
+ ('low', '144p Low'), ('med', '160p Med'), ('high', '360p High'), ('hd1', '720p HD1'))]
+ sources.append({
+ 'file': 'http://%s/vod/smil:%s.smil/playlist.m3u8' % (videoserver, video_id),
+ 'type': 'hls',
+ })
+
+ formats = []
+ for source in sources:
+ if source['type'] == 'hls':
+ formats.extend(self._extract_m3u8_formats(source['file'], video_id))
+ else:
+ file_ = source.get('file')
+ if not file_:
continue
- file_ = src.partition(':')[-1]
- width = int_or_none(video.get('width'))
- height = int_or_none(video.get('height'))
- bitrate = int_or_none(video.get('system-bitrate'), scale=1000)
- format = {
- 'url': baseurl + file_,
- 'format_id': src.rpartition('.')[0].rpartition('_')[-1],
- }
- if width or height:
- format.update({
- 'tbr': bitrate,
- 'width': width,
- 'height': height,
- })
- else:
- format.update({
- 'abr': bitrate,
- 'vcodec': 'none',
- })
- formats.append(format)
- else:
- formats = [{
- 'url': vidurl,
- }]
+ format_label = source.get('label')
+ format_id = self._search_regex(
+ r'_(.+?)\.[^.]+$', file_, 'format id', default=None)
+ height = int_or_none(self._search_regex(
+ r'^(\d+)[pP]', format_label, 'height', default=None))
+ formats.append({
+ 'url': source['file'],
+ 'format_id': format_id,
+ 'format': format_label,
+ 'ext': source.get('type'),
+ 'height': height,
+ })
self._sort_formats(formats)
return {
@@ -81,60 +97,6 @@ class ScreenwaveMediaIE(InfoExtractor):
}
-class CinemassacreIE(InfoExtractor):
- _VALID_URL = 'https?://(?:www\.)?cinemassacre\.com/(?P<date_y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/(?P<display_id>[^?#/]+)'
- _TESTS = [
- {
- 'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/',
- 'md5': 'fde81fbafaee331785f58cd6c0d46190',
- 'info_dict': {
- 'id': 'Cinemassacre-19911',
- 'ext': 'mp4',
- 'upload_date': '20121110',
- 'title': '“Angry Video Game Nerd: The Movie” – Trailer',
- 'description': 'md5:fb87405fcb42a331742a0dce2708560b',
- },
- },
- {
- 'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940',
- 'md5': 'd72f10cd39eac4215048f62ab477a511',
- 'info_dict': {
- 'id': 'Cinemassacre-521be8ef82b16',
- 'ext': 'mp4',
- 'upload_date': '20131002',
- 'title': 'The Mummy’s Hand (1940)',
- },
- }
- ]
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- display_id = mobj.group('display_id')
- video_date = mobj.group('date_y') + mobj.group('date_m') + mobj.group('date_d')
-
- webpage = self._download_webpage(url, display_id)
-
- playerdata_url = self._search_regex(
- r'src="(http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"',
- webpage, 'player data URL')
- video_title = self._html_search_regex(
- r'<title>(?P<title>.+?)\|', webpage, 'title')
- video_description = self._html_search_regex(
- r'<div class="entry-content">(?P<description>.+?)</div>',
- webpage, 'description', flags=re.DOTALL, fatal=False)
- video_thumbnail = self._og_search_thumbnail(webpage)
-
- return {
- '_type': 'url_transparent',
- 'display_id': display_id,
- 'title': video_title,
- 'description': video_description,
- 'upload_date': video_date,
- 'thumbnail': video_thumbnail,
- 'url': playerdata_url,
- }
-
-
class TeamFourIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?teamfourstar\.com/video/(?P<id>[a-z0-9\-]+)/?'
_TEST = {
@@ -153,7 +115,7 @@ class TeamFourIE(InfoExtractor):
webpage = self._download_webpage(url, display_id)
playerdata_url = self._search_regex(
- r'src="(http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"',
+ r'src="(http://player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?[^"]*\bid=.+?)"',
webpage, 'player data URL')
video_title = self._html_search_regex(
diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py
new file mode 100644
index 000000000..474ebb49b
--- /dev/null
+++ b/youtube_dl/extractor/senateisvp.py
@@ -0,0 +1,145 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ unsmuggle_url,
+)
+from ..compat import (
+ compat_parse_qs,
+ compat_urlparse,
+)
+
+
+class SenateISVPIE(InfoExtractor):
+    """Extractor for the senate.gov ISVP player (www.senate.gov/isvp)."""
+    # Rows of [committee key, stream number, Akamai base URL].
+    _COMM_MAP = [
+        ["ag", "76440", "http://ag-f.akamaihd.net"],
+        ["aging", "76442", "http://aging-f.akamaihd.net"],
+        ["approps", "76441", "http://approps-f.akamaihd.net"],
+        ["armed", "76445", "http://armed-f.akamaihd.net"],
+        ["banking", "76446", "http://banking-f.akamaihd.net"],
+        ["budget", "76447", "http://budget-f.akamaihd.net"],
+        ["cecc", "76486", "http://srs-f.akamaihd.net"],
+        ["commerce", "80177", "http://commerce1-f.akamaihd.net"],
+        ["csce", "75229", "http://srs-f.akamaihd.net"],
+        ["dpc", "76590", "http://dpc-f.akamaihd.net"],
+        ["energy", "76448", "http://energy-f.akamaihd.net"],
+        ["epw", "76478", "http://epw-f.akamaihd.net"],
+        ["ethics", "76449", "http://ethics-f.akamaihd.net"],
+        ["finance", "76450", "http://finance-f.akamaihd.net"],
+        ["foreign", "76451", "http://foreign-f.akamaihd.net"],
+        ["govtaff", "76453", "http://govtaff-f.akamaihd.net"],
+        ["help", "76452", "http://help-f.akamaihd.net"],
+        ["indian", "76455", "http://indian-f.akamaihd.net"],
+        ["intel", "76456", "http://intel-f.akamaihd.net"],
+        ["intlnarc", "76457", "http://intlnarc-f.akamaihd.net"],
+        ["jccic", "85180", "http://jccic-f.akamaihd.net"],
+        ["jec", "76458", "http://jec-f.akamaihd.net"],
+        ["judiciary", "76459", "http://judiciary-f.akamaihd.net"],
+        ["rpc", "76591", "http://rpc-f.akamaihd.net"],
+        ["rules", "76460", "http://rules-f.akamaihd.net"],
+        ["saa", "76489", "http://srs-f.akamaihd.net"],
+        ["smbiz", "76461", "http://smbiz-f.akamaihd.net"],
+        ["srs", "75229", "http://srs-f.akamaihd.net"],
+        ["uscc", "76487", "http://srs-f.akamaihd.net"],
+        ["vetaff", "76462", "http://vetaff-f.akamaihd.net"],
+        ["arch", "", "http://ussenate-f.akamaihd.net/"]
+    ]
+    # NOTE(review): this is _IE_NAME, not IE_NAME, and nothing in this class reads it -- confirm intent.
+    _IE_NAME = 'senate.gov'
+    _VALID_URL = r'http://www\.senate\.gov/isvp/?\?(?P<qs>.+)'
+    _TESTS = [{
+        'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png',
+        'info_dict': {
+            'id': 'judiciary031715',
+            'ext': 'flv',
+            'title': 'Integrated Senate Video Player',
+            'thumbnail': 're:^https?://.*\.(?:jpg|png)$',
+        }
+    }, {
+        'url': 'http://www.senate.gov/isvp/?type=live&comm=commerce&filename=commerce011514.mp4&auto_play=false',
+        'info_dict': {
+            'id': 'commerce011514',
+            'ext': 'flv',
+            'title': 'Integrated Senate Video Player'
+        }
+    }, {
+        'url': 'http://www.senate.gov/isvp/?type=arch&comm=intel&filename=intel090613&hc_location=ufi',
+        # checksum differs each time
+        'info_dict': {
+            'id': 'intel090613',
+            'ext': 'mp4',
+            'title': 'Integrated Senate Video Player'
+        }
+    }, {
+        # From http://www.c-span.org/video/?96791-1
+        'url': 'http://www.senate.gov/isvp?type=live&comm=banking&filename=banking012715',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _search_iframe_url(webpage):
+        """Return the src of the first senate.gov ISVP iframe in ``webpage``, or None."""
+        mobj = re.search(
+            r"<iframe[^>]+src=['\"](?P<url>http://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]",
+            webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _get_info_for_comm(self, committee):
+        """Return ``[stream_num, domain]`` for ``committee``, or None if it is not in _COMM_MAP."""
+        for entry in self._COMM_MAP:
+            if entry[0] == committee:
+                return entry[1:]
+
+    def _real_extract(self, url):
+        """Return the info dict for the player URL ``url``.
+
+        ``type=arch`` URLs yield a single direct MP4 format; any other type
+        yields the formats of the f4m and m3u8 manifests built from the
+        committee's stream number and domain. A smuggled ``force_title``
+        replaces the page title.
+
+        Raises ExtractorError (expected) when filename, type or comm is
+        missing from the query string, or when the committee is unknown.
+        """
+        url, smuggled_data = unsmuggle_url(url, {})
+
+        qs = compat_parse_qs(re.match(self._VALID_URL, url).group('qs'))
+        if not qs.get('filename') or not qs.get('type') or not qs.get('comm'):
+            raise ExtractorError('Invalid URL', expected=True)
+
+        # Strip a literal ".mp4" suffix only; an unescaped dot would also eat
+        # any other character that happens to precede a trailing "mp4".
+        video_id = re.sub(r'\.mp4$', '', qs['filename'][0])
+
+        webpage = self._download_webpage(url, video_id)
+
+        if smuggled_data.get('force_title'):
+            title = smuggled_data['force_title']
+        else:
+            title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, video_id)
+        poster = qs.get('poster')
+        thumbnail = poster[0] if poster else None
+
+        video_type = qs['type'][0]
+        # Archived videos always use the "arch" row, whatever comm says.
+        committee = video_type if video_type == 'arch' else qs['comm'][0]
+        comm_info = self._get_info_for_comm(committee)
+        if not comm_info:
+            # Unpacking None below would fail with an opaque TypeError.
+            raise ExtractorError(
+                'Unknown committee %s' % committee, expected=True)
+        stream_num, domain = comm_info
+
+        formats = []
+        if video_type == 'arch':
+            filename = video_id if '.' in video_id else video_id + '.mp4'
+            formats = [{
+                # All parameters in the query string are necessary to prevent a 403 error
+                'url': compat_urlparse.urljoin(domain, filename) + '?v=3.1.0&fp=&r=&g=',
+            }]
+        else:
+            hdcore_sign = 'hdcore=3.1.0'
+            url_params = (domain, video_id, stream_num)
+            f4m_url = '%s/z/%s_1@%s/manifest.f4m?' % url_params + hdcore_sign
+            m3u8_url = '%s/i/%s_1@%s/master.m3u8' % url_params
+            for entry in self._extract_f4m_formats(f4m_url, video_id, f4m_id='f4m'):
+                # URLs without the extra param induce a 404 error
+                entry.update({'extra_param_to_segment_url': hdcore_sign})
+                formats.append(entry)
+            for entry in self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='m3u8'):
+                # A "-p" / "-b" marker right before ".m3u8" is appended to the format id.
+                mobj = re.search(r'(?P<tag>(?:-p|-b))\.m3u8', entry['url'])
+                if mobj:
+                    entry['format_id'] += mobj.group('tag')
+                formats.append(entry)
+
+            self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/sexykarma.py b/youtube_dl/extractor/sexykarma.py
index 6446d26dc..e33483674 100644
--- a/youtube_dl/extractor/sexykarma.py
+++ b/youtube_dl/extractor/sexykarma.py
@@ -29,6 +29,7 @@ class SexyKarmaIE(InfoExtractor):
'view_count': int,
'comment_count': int,
'categories': list,
+ 'age_limit': 18,
}
}, {
'url': 'http://www.sexykarma.com/gonewild/video/pot-pixie-tribute-8Id6EZPbuHf.html',
diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py
new file mode 100644
index 000000000..f76fb12c0
--- /dev/null
+++ b/youtube_dl/extractor/shahid.py
@@ -0,0 +1,107 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ parse_iso8601,
+)
+
+
+class ShahidIE(InfoExtractor):
+ _VALID_URL = r'https?://shahid\.mbc\.net/ar/episode/(?P<id>\d+)/?'
+ _TESTS = [{
+ 'url': 'https://shahid.mbc.net/ar/episode/90574/%D8%A7%D9%84%D9%85%D9%84%D9%83-%D8%B9%D8%A8%D8%AF%D8%A7%D9%84%D9%84%D9%87-%D8%A7%D9%84%D8%A5%D9%86%D8%B3%D8%A7%D9%86-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-3.html',
+ 'info_dict': {
+ 'id': '90574',
+ 'ext': 'mp4',
+ 'title': 'الملك عبدالله الإنسان الموسم 1 كليب 3',
+ 'description': 'الفيلم الوثائقي - الملك عبد الله الإنسان',
+ 'duration': 2972,
+ 'timestamp': 1422057420,
+ 'upload_date': '20150123',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ }
+ }, {
+ # shahid plus subscriber only
+ 'url': 'https://shahid.mbc.net/ar/episode/90511/%D9%85%D8%B1%D8%A7%D9%8A%D8%A7-2011-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1.html',
+ 'only_matching': True
+ }]
+
+    def _handle_error(self, response):
+        """Raise an expected ExtractorError if *response* is a dict with a truthy 'error' entry.
+
+        Non-dict responses and responses without an error are ignored.
+        """
+        error = response.get('error') if isinstance(response, dict) else None
+        if not error:
+            return
+        # 'error' is joined via .values(), i.e. it is treated as a mapping of messages
+        message = '%s returned error: %s' % (self.IE_NAME, '\n'.join(error.values()))
+        raise ExtractorError(message, expected=True)
+
+    def _download_json(self, url, video_id, note='Downloading JSON metadata'):
+        """Download JSON via the base class, unwrap its 'data' member and check it for errors.
+
+        Returns the value stored under 'data'; raises through _handle_error
+        when that value reports an error.
+        """
+        payload = super(ShahidIE, self)._download_json(url, video_id, note)
+        data = payload['data']
+        self._handle_error(data)
+        return data
+
+    def _real_extract(self, url):
+        """Extract formats and metadata for one Shahid episode.
+
+        Downloads the episode page, reads optional overrides from its
+        ``flashvars`` object, then queries the player endpoint for the HLS
+        manifest URL and the API endpoint for the episode metadata.
+
+        Returns the info dict for the episode. ``title`` is mandatory (a
+        ``KeyError`` is raised if the API response lacks it); all other
+        metadata fields are optional.
+        """
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        # Defaults; any of them may be overridden by the page's flashvars
+        api_vars = {
+            'id': video_id,
+            'type': 'player',
+            'url': 'http://api.shahid.net/api/v1_1',
+            'playerType': 'episode',
+        }
+
+        flashvars = self._search_regex(
+            r'var\s+flashvars\s*=\s*({[^}]+})', webpage, 'flashvars', default=None)
+        if flashvars:
+            # Iterate over a snapshot of the keys since values are reassigned below
+            for key in list(api_vars):
+                value = self._search_regex(
+                    r'\b%s\s*:\s*(?P<q>["\'])(?P<value>.+?)(?P=q)' % key,
+                    flashvars, key, default=None, group='value')
+                if value:
+                    api_vars[key] = value
+
+        player = self._download_json(
+            'https://shahid.mbc.net/arContent/getPlayerContent-param-.id-%s.type-%s.html'
+            % (video_id, api_vars['type']), video_id, 'Downloading player JSON')
+
+        formats = self._extract_m3u8_formats(player['url'], video_id, 'mp4')
+        # Order formats worst-to-best so that format selection picks the right variant
+        self._sort_formats(formats)
+
+        video = self._download_json(
+            '%s/%s/%s?%s' % (
+                api_vars['url'], api_vars['playerType'], api_vars['id'],
+                compat_urllib_parse.urlencode({
+                    'apiKey': 'sh@hid0nlin3',
+                    'hash': 'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=',
+                })),
+            video_id, 'Downloading video JSON')
+
+        # The metadata is nested under a key named after the player type
+        video = video[api_vars['playerType']]
+
+        title = video['title']
+        description = video.get('description')
+        thumbnail = video.get('thumbnailUrl')
+        duration = int_or_none(video.get('duration'))
+        timestamp = parse_iso8601(video.get('referenceDate'))
+        categories = [
+            category['name']
+            for category in video.get('genres', []) if 'name' in category]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'timestamp': timestamp,
+            'categories': categories,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py
index 26ced716e..8eda3c864 100644
--- a/youtube_dl/extractor/shared.py
+++ b/youtube_dl/extractor/shared.py
@@ -1,31 +1,39 @@
from __future__ import unicode_literals
-import re
import base64
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
- compat_urllib_request,
-)
+from ..compat import compat_urllib_parse
from ..utils import (
ExtractorError,
int_or_none,
+ sanitized_Request,
)
class SharedIE(InfoExtractor):
- _VALID_URL = r'http://shared\.sx/(?P<id>[\da-z]{10})'
+ IE_DESC = 'shared.sx and vivo.sx'
+ _VALID_URL = r'http://(?:shared|vivo)\.sx/(?P<id>[\da-z]{10})'
- _TEST = {
+ _TESTS = [{
'url': 'http://shared.sx/0060718775',
'md5': '106fefed92a8a2adb8c98e6a0652f49b',
'info_dict': {
'id': '0060718775',
'ext': 'mp4',
'title': 'Bmp4',
+ 'filesize': 1720110,
+ },
+ }, {
+ 'url': 'http://vivo.sx/d7ddda0e78',
+ 'md5': '15b3af41be0b4fe01f4df075c2678b2c',
+ 'info_dict': {
+ 'id': 'd7ddda0e78',
+ 'ext': 'mp4',
+ 'title': 'Chicken',
+ 'filesize': 528031,
},
- }
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -35,9 +43,8 @@ class SharedIE(InfoExtractor):
raise ExtractorError(
'Video %s does not exist' % video_id, expected=True)
- download_form = dict(re.findall(
- r'<input type="hidden" name="([^"]+)" value="([^"]*)"', webpage))
- request = compat_urllib_request.Request(
+ download_form = self._hidden_inputs(webpage)
+ request = sanitized_Request(
url, compat_urllib_parse.urlencode(download_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
@@ -47,7 +54,7 @@ class SharedIE(InfoExtractor):
video_url = self._html_search_regex(
r'data-url="([^"]+)"', video_page, 'video URL')
title = base64.b64decode(self._html_search_meta(
- 'full:title', webpage, 'title')).decode('utf-8')
+ 'full:title', webpage, 'title').encode('utf-8')).decode('utf-8')
filesize = int_or_none(self._html_search_meta(
'full:size', webpage, 'file size', fatal=False))
thumbnail = self._html_search_regex(
diff --git a/youtube_dl/extractor/sharesix.py b/youtube_dl/extractor/sharesix.py
index ac3e3adf2..f1ea9bdb2 100644
--- a/youtube_dl/extractor/sharesix.py
+++ b/youtube_dl/extractor/sharesix.py
@@ -4,12 +4,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
- compat_urllib_request,
-)
+from ..compat import compat_urllib_parse
from ..utils import (
parse_duration,
+ sanitized_Request,
)
@@ -50,7 +48,7 @@ class ShareSixIE(InfoExtractor):
'method_free': 'Free'
}
post = compat_urllib_parse.urlencode(fields)
- req = compat_urllib_request.Request(url, post)
+ req = sanitized_Request(url, post)
req.add_header('Content-type', 'application/x-www-form-urlencoded')
webpage = self._download_webpage(req, video_id,
diff --git a/youtube_dl/extractor/sina.py b/youtube_dl/extractor/sina.py
index 0891a441f..b2258a0f6 100644
--- a/youtube_dl/extractor/sina.py
+++ b/youtube_dl/extractor/sina.py
@@ -4,10 +4,8 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_request,
- compat_urllib_parse,
-)
+from ..compat import compat_urllib_parse
+from ..utils import sanitized_Request
class SinaIE(InfoExtractor):
@@ -61,7 +59,7 @@ class SinaIE(InfoExtractor):
if mobj.group('token') is not None:
# The video id is in the redirected url
self.to_screen('Getting video id')
- request = compat_urllib_request.Request(url)
+ request = sanitized_Request(url)
request.get_method = lambda: 'HEAD'
(_, urlh) = self._download_webpage_handle(request, 'NA', False)
return self._real_extract(urlh.geturl())
diff --git a/youtube_dl/extractor/slideshare.py b/youtube_dl/extractor/slideshare.py
index 9f79ff5c1..0b717a1e4 100644
--- a/youtube_dl/extractor/slideshare.py
+++ b/youtube_dl/extractor/slideshare.py
@@ -30,7 +30,7 @@ class SlideshareIE(InfoExtractor):
page_title = mobj.group('title')
webpage = self._download_webpage(url, page_title)
slideshare_obj = self._search_regex(
- r'var\s+slideshare_object\s*=\s*({.*?});\s*var\s+user_info\s*=',
+ r'\$\.extend\(slideshare_object,\s*(\{.*?\})\);',
webpage, 'slideshare object')
info = json.loads(slideshare_obj)
if info['slideshow']['type'] != 'video':
diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py
index 24746a09a..30210c8a3 100644
--- a/youtube_dl/extractor/smotri.py
+++ b/youtube_dl/extractor/smotri.py
@@ -7,13 +7,11 @@ import hashlib
import uuid
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
- compat_urllib_request,
-)
+from ..compat import compat_urllib_parse
from ..utils import (
ExtractorError,
int_or_none,
+ sanitized_Request,
unified_strdate,
)
@@ -53,7 +51,7 @@ class SmotriIE(InfoExtractor):
'thumbnail': 'http://frame4.loadup.ru/03/ed/57591.2.3.jpg',
},
},
- # video-password
+ # video-password, not approved by moderator
{
'url': 'http://smotri.com/video/view/?id=v1390466a13c',
'md5': 'f6331cef33cad65a0815ee482a54440b',
@@ -71,7 +69,24 @@ class SmotriIE(InfoExtractor):
},
'skip': 'Video is not approved by moderator',
},
- # age limit + video-password
+ # video-password
+ {
+ 'url': 'http://smotri.com/video/view/?id=v6984858774#',
+ 'md5': 'f11e01d13ac676370fc3b95b9bda11b0',
+ 'info_dict': {
+ 'id': 'v6984858774',
+ 'ext': 'mp4',
+ 'title': 'Дача Солженицина ПАРОЛЬ 223322',
+ 'uploader': 'psavari1',
+ 'uploader_id': 'psavari1',
+ 'upload_date': '20081103',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ 'params': {
+ 'videopassword': '223322',
+ },
+ },
+ # age limit + video-password, not approved by moderator
{
'url': 'http://smotri.com/video/view/?id=v15408898bcf',
'md5': '91e909c9f0521adf5ee86fbe073aad70',
@@ -90,19 +105,22 @@ class SmotriIE(InfoExtractor):
},
'skip': 'Video is not approved by moderator',
},
- # not approved by moderator, but available
+ # age limit + video-password
{
- 'url': 'http://smotri.com/video/view/?id=v28888533b73',
- 'md5': 'f44bc7adac90af518ef1ecf04893bb34',
+ 'url': 'http://smotri.com/video/view/?id=v7780025814',
+ 'md5': 'b4599b068422559374a59300c5337d72',
'info_dict': {
- 'id': 'v28888533b73',
+ 'id': 'v7780025814',
'ext': 'mp4',
- 'title': 'Russian Spies Killed By ISIL Child Soldier',
- 'uploader': 'Mopeder',
- 'uploader_id': 'mopeder',
- 'duration': 71,
- 'thumbnail': 'http://frame9.loadup.ru/d7/32/2888853.2.3.jpg',
- 'upload_date': '20150114',
+ 'title': 'Sexy Beach (пароль 123)',
+ 'uploader': 'вАся',
+ 'uploader_id': 'asya_prosto',
+ 'upload_date': '20081218',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'age_limit': 18,
+ },
+ 'params': {
+ 'videopassword': '123'
},
},
# swf player
@@ -152,7 +170,11 @@ class SmotriIE(InfoExtractor):
'getvideoinfo': '1',
}
- request = compat_urllib_request.Request(
+ video_password = self._downloader.params.get('videopassword', None)
+ if video_password:
+ video_form['pass'] = hashlib.md5(video_password.encode('utf-8')).hexdigest()
+
+ request = sanitized_Request(
'http://smotri.com/video/view/url/bot/', compat_urllib_parse.urlencode(video_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
@@ -161,13 +183,18 @@ class SmotriIE(InfoExtractor):
video_url = video.get('_vidURL') or video.get('_vidURL_mp4')
if not video_url:
- if video.get('_moderate_no') or not video.get('moderated'):
+ if video.get('_moderate_no'):
raise ExtractorError(
'Video %s has not been approved by moderator' % video_id, expected=True)
if video.get('error'):
raise ExtractorError('Video %s does not exist' % video_id, expected=True)
+ if video.get('_pass_protected') == 1:
+ msg = ('Invalid video password' if video_password
+ else 'This video is protected by a password, use the --video-password option')
+ raise ExtractorError(msg, expected=True)
+
title = video['title']
thumbnail = video['_imgURL']
upload_date = unified_strdate(video['added'])
@@ -301,10 +328,7 @@ class SmotriBroadcastIE(InfoExtractor):
(username, password) = self._get_login_info()
if username is None:
- raise ExtractorError(
- 'Erotic broadcasts allowed only for registered users, '
- 'use --username and --password options to provide account credentials.',
- expected=True)
+ self.raise_login_required('Erotic broadcasts allowed only for registered users')
login_form = {
'login-hint53': '1',
@@ -313,7 +337,7 @@ class SmotriBroadcastIE(InfoExtractor):
'password': password,
}
- request = compat_urllib_request.Request(
+ request = sanitized_Request(
broadcast_url + '/?no_redirect=1', compat_urllib_parse.urlencode(login_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
broadcast_page = self._download_webpage(
diff --git a/youtube_dl/extractor/snagfilms.py b/youtube_dl/extractor/snagfilms.py
new file mode 100644
index 000000000..6977afb27
--- /dev/null
+++ b/youtube_dl/extractor/snagfilms.py
@@ -0,0 +1,181 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ clean_html,
+ determine_ext,
+ int_or_none,
+ js_to_json,
+ parse_duration,
+)
+
+
+class SnagFilmsEmbedIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:(?:www|embed)\.)?snagfilms\.com/embed/player\?.*\bfilmId=(?P<id>[\da-f-]{36})'
+ _TESTS = [{
+ 'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500',
+ 'md5': '2924e9215c6eff7a55ed35b72276bd93',
+ 'info_dict': {
+ 'id': '74849a00-85a9-11e1-9660-123139220831',
+ 'ext': 'mp4',
+ 'title': '#whilewewatch',
+ }
+ }, {
+ # invalid labels, 360p is better than 480p
+ 'url': 'http://www.snagfilms.com/embed/player?filmId=17ca0950-a74a-11e0-a92a-0026bb61d036',
+ 'md5': '882fca19b9eb27ef865efeeaed376a48',
+ 'info_dict': {
+ 'id': '17ca0950-a74a-11e0-a92a-0026bb61d036',
+ 'ext': 'mp4',
+ 'title': 'Life in Limbo',
+ }
+ }, {
+ 'url': 'http://www.snagfilms.com/embed/player?filmId=0000014c-de2f-d5d6-abcf-ffef58af0017',
+ 'only_matching': True,
+ }]
+
+    @staticmethod
+    def _extract_url(webpage):
+        """Return the URL of the first SnagFilms embed-player iframe in *webpage*, or None."""
+        iframe = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:embed\.)?snagfilms\.com/embed/player.+?)\1',
+            webpage)
+        return iframe.group('url') if iframe else None
+
+    def _real_extract(self, url):
+        """Extract the title and all playable formats from a SnagFilms embed player page.
+
+        Raises an expected ExtractorError when the page reports that the film
+        is geo-blocked.
+        """
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        if '>This film is not playable in your area.<' in webpage:
+            raise ExtractorError(
+                'Film %s is not playable in your area.' % video_id, expected=True)
+
+        # The player config embeds a JavaScript array literal of sources
+        sources_js = self._search_regex(
+            r'(?s)sources:\s*(\[.+?\]),', webpage, 'json')
+        sources = self._parse_json(js_to_json(sources_js), video_id)
+
+        formats = []
+        for source in sources:
+            file_ = source.get('file')
+            if not file_:
+                continue
+            type_ = source.get('type')
+            ext = determine_ext(file_)
+            format_id = source.get('label') or ext
+            if type_ == 'm3u8' and ext == 'm3u8':
+                # HLS manifest: expand into its variant streams
+                formats.extend(self._extract_m3u8_formats(
+                    file_, video_id, 'mp4', m3u8_id='hls'))
+                continue
+            # Progressive file: bitrate comes from the file name, height from a label such as '480p'
+            bitrate = int_or_none(self._search_regex(
+                [r'(\d+)kbps', r'_\d{1,2}x\d{1,2}_(\d{3,})\.%s' % ext],
+                file_, 'bitrate', default=None))
+            height = int_or_none(self._search_regex(
+                r'^(\d+)[pP]$', format_id, 'height', default=None))
+            formats.append({
+                'url': file_,
+                'format_id': format_id,
+                'tbr': bitrate,
+                'height': height,
+            })
+        self._sort_formats(formats)
+
+        title = self._search_regex(
+            [r"title\s*:\s*'([^']+)'", r'<title>([^<]+)</title>'],
+            webpage, 'title')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+        }
+
+
+class SnagFilmsIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?snagfilms\.com/(?:films/title|show)/(?P<id>[^?#]+)'
+ _TESTS = [{
+ 'url': 'http://www.snagfilms.com/films/title/lost_for_life',
+ 'md5': '19844f897b35af219773fd63bdec2942',
+ 'info_dict': {
+ 'id': '0000014c-de2f-d5d6-abcf-ffef58af0017',
+ 'display_id': 'lost_for_life',
+ 'ext': 'mp4',
+ 'title': 'Lost for Life',
+ 'description': 'md5:fbdacc8bb6b455e464aaf98bc02e1c82',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 4489,
+ 'categories': ['Documentary', 'Crime', 'Award Winning', 'Festivals']
+ }
+ }, {
+ 'url': 'http://www.snagfilms.com/show/the_world_cut_project/india',
+ 'md5': 'e6292e5b837642bbda82d7f8bf3fbdfd',
+ 'info_dict': {
+ 'id': '00000145-d75c-d96e-a9c7-ff5c67b20000',
+ 'display_id': 'the_world_cut_project/india',
+ 'ext': 'mp4',
+ 'title': 'India',
+ 'description': 'md5:5c168c5a8f4719c146aad2e0dfac6f5f',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 979,
+ 'categories': ['Documentary', 'Sports', 'Politics']
+ }
+ }, {
+ # Film is not playable in your area.
+ 'url': 'http://www.snagfilms.com/films/title/inside_mecca',
+ 'only_matching': True,
+ }, {
+ # Film is not available.
+ 'url': 'http://www.snagfilms.com/show/augie_alone/flirting',
+ 'only_matching': True,
+ }]
+
+    def _real_extract(self, url):
+        """Resolve a SnagFilms film/show page to its embed player and collect metadata.
+
+        Metadata is taken from the ``Snag.page.data`` JSON entry whose film id
+        matches the one found in the page; if no entry matches, it is scraped
+        from the HTML instead. The result is a ``url_transparent`` entry
+        pointing at the embed player URL for the film.
+
+        Raises an expected ExtractorError when the page reports that the film
+        is not available.
+        """
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        if ">Sorry, the Film you're looking for is not available.<" in webpage:
+            raise ExtractorError(
+                'Film %s is not available.' % display_id, expected=True)
+
+        film_id = self._search_regex(r'filmId=([\da-f-]{36})"', webpage, 'film id')
+
+        # Raw string: the pattern contains regex escapes (\. \s \[) that are
+        # invalid escape sequences in a regular string literal
+        snag = self._parse_json(
+            self._search_regex(
+                r'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag'),
+            display_id)
+
+        for item in snag:
+            if item.get('data', {}).get('film', {}).get('id') == film_id:
+                data = item['data']['film']
+                title = data['title']
+                description = clean_html(data.get('synopsis'))
+                thumbnail = data.get('image')
+                duration = int_or_none(data.get('duration') or data.get('runtime'))
+                categories = [
+                    category['title'] for category in data.get('categories', [])
+                    if category.get('title')]
+                break
+        else:
+            # No JSON entry matched the film id (loop ended without break):
+            # fall back to scraping the HTML
+            title = self._search_regex(
+                r'itemprop="title">([^<]+)<', webpage, 'title')
+            description = self._html_search_regex(
+                r'(?s)<div itemprop="description" class="film-synopsis-inner ">(.+?)</div>',
+                webpage, 'description', default=None) or self._og_search_description(webpage)
+            thumbnail = self._og_search_thumbnail(webpage)
+            duration = parse_duration(self._search_regex(
+                r'<span itemprop="duration" class="film-duration strong">([^<]+)<',
+                webpage, 'duration', fatal=False))
+            categories = re.findall(r'<a href="/movies/[^"]+">([^<]+)</a>', webpage)
+
+        return {
+            '_type': 'url_transparent',
+            'url': 'http://embed.snagfilms.com/embed/player?filmId=%s' % film_id,
+            'id': film_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'categories': categories,
+        }
diff --git a/youtube_dl/extractor/sockshare.py b/youtube_dl/extractor/sockshare.py
deleted file mode 100644
index 7d3c0e937..000000000
--- a/youtube_dl/extractor/sockshare.py
+++ /dev/null
@@ -1,84 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from ..compat import (
- compat_urllib_parse,
- compat_urllib_request,
-)
-from ..utils import (
- determine_ext,
- ExtractorError,
-)
-
-from .common import InfoExtractor
-
-
-class SockshareIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?sockshare\.com/file/(?P<id>[0-9A-Za-z]+)'
- _FILE_DELETED_REGEX = r'This file doesn\'t exist, or has been removed\.</div>'
- _TEST = {
- 'url': 'http://www.sockshare.com/file/437BE28B89D799D7',
- 'md5': '9d0bf1cfb6dbeaa8d562f6c97506c5bd',
- 'info_dict': {
- 'id': '437BE28B89D799D7',
- 'title': 'big_buck_bunny_720p_surround.avi',
- 'ext': 'avi',
- 'thumbnail': 're:^http://.*\.jpg$',
- }
- }
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- url = 'http://sockshare.com/file/%s' % video_id
- webpage = self._download_webpage(url, video_id)
-
- if re.search(self._FILE_DELETED_REGEX, webpage) is not None:
- raise ExtractorError('Video %s does not exist' % video_id,
- expected=True)
-
- confirm_hash = self._html_search_regex(r'''(?x)<input\s+
- type="hidden"\s+
- value="([^"]*)"\s+
- name="hash"
- ''', webpage, 'hash')
-
- fields = {
- "hash": confirm_hash,
- "confirm": "Continue as Free User"
- }
-
- post = compat_urllib_parse.urlencode(fields)
- req = compat_urllib_request.Request(url, post)
- # Apparently, this header is required for confirmation to work.
- req.add_header('Host', 'www.sockshare.com')
- req.add_header('Content-type', 'application/x-www-form-urlencoded')
-
- webpage = self._download_webpage(
- req, video_id, 'Downloading video page')
-
- video_url = self._html_search_regex(
- r'<a href="([^"]*)".+class="download_file_link"',
- webpage, 'file url')
- video_url = "http://www.sockshare.com" + video_url
- title = self._html_search_regex((
- r'<h1>(.+)<strong>',
- r'var name = "([^"]+)";'),
- webpage, 'title', default=None)
- thumbnail = self._html_search_regex(
- r'<img\s+src="([^"]*)".+?name="bg"',
- webpage, 'thumbnail')
-
- formats = [{
- 'format_id': 'sd',
- 'url': video_url,
- 'ext': determine_ext(title),
- }]
-
- return {
- 'id': video_id,
- 'title': title,
- 'thumbnail': thumbnail,
- 'formats': formats,
- }
diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py
index c04791997..daf6ad555 100644
--- a/youtube_dl/extractor/sohu.py
+++ b/youtube_dl/extractor/sohu.py
@@ -4,22 +4,89 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from .common import compat_str
+from ..compat import (
+ compat_str,
+ compat_urllib_parse,
+)
+from ..utils import (
+ ExtractorError,
+ sanitized_Request,
+)
class SohuIE(InfoExtractor):
_VALID_URL = r'https?://(?P<mytv>my\.)?tv\.sohu\.com/.+?/(?(mytv)|n)(?P<id>\d+)\.shtml.*?'
- _TEST = {
+ _TESTS = [{
+ 'note': 'This video is available only in Mainland China',
'url': 'http://tv.sohu.com/20130724/n382479172.shtml#super',
- 'md5': 'bde8d9a6ffd82c63a1eefaef4eeefec7',
+ 'md5': '29175c8cadd8b5cc4055001e85d6b372',
'info_dict': {
'id': '382479172',
'ext': 'mp4',
'title': 'MV:Far East Movement《The Illest》',
},
- 'skip': 'Only available from China',
- }
+ 'skip': 'On available in China',
+ }, {
+ 'url': 'http://tv.sohu.com/20150305/n409385080.shtml',
+ 'md5': '699060e75cf58858dd47fb9c03c42cfb',
+ 'info_dict': {
+ 'id': '409385080',
+ 'ext': 'mp4',
+ 'title': '《2015湖南卫视羊年元宵晚会》唐嫣《花好月圆》',
+ }
+ }, {
+ 'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml',
+ 'md5': '9bf34be48f2f4dadcb226c74127e203c',
+ 'info_dict': {
+ 'id': '78693464',
+ 'ext': 'mp4',
+ 'title': '【爱范品】第31期:MWC见不到的奇葩手机',
+ }
+ }, {
+ 'note': 'Multipart video',
+ 'url': 'http://my.tv.sohu.com/pl/8384802/78910339.shtml',
+ 'info_dict': {
+ 'id': '78910339',
+ 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
+ },
+ 'playlist': [{
+ 'md5': 'bdbfb8f39924725e6589c146bc1883ad',
+ 'info_dict': {
+ 'id': '78910339_part1',
+ 'ext': 'mp4',
+ 'duration': 294,
+ 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
+ }
+ }, {
+ 'md5': '3e1f46aaeb95354fd10e7fca9fc1804e',
+ 'info_dict': {
+ 'id': '78910339_part2',
+ 'ext': 'mp4',
+ 'duration': 300,
+ 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
+ }
+ }, {
+ 'md5': '8407e634175fdac706766481b9443450',
+ 'info_dict': {
+ 'id': '78910339_part3',
+ 'ext': 'mp4',
+ 'duration': 150,
+ 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
+ }
+ }]
+ }, {
+ 'note': 'Video with title containing dash',
+ 'url': 'http://my.tv.sohu.com/us/249884221/78932792.shtml',
+ 'info_dict': {
+ 'id': '78932792',
+ 'ext': 'mp4',
+ 'title': 'youtube-dl testing video',
+ },
+ 'params': {
+ 'skip_download': True
+ }
+ }]
def _real_extract(self, url):
@@ -29,8 +96,14 @@ class SohuIE(InfoExtractor):
else:
base_data_url = 'http://hot.vrs.sohu.com/vrs_flash.action?vid='
+ req = sanitized_Request(base_data_url + vid_id)
+
+ cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')
+ if cn_verification_proxy:
+ req.add_header('Ytdl-request-proxy', cn_verification_proxy)
+
return self._download_json(
- base_data_url + vid_id, video_id,
+ req, video_id,
'Downloading JSON data for %s' % vid_id)
mobj = re.match(self._VALID_URL, url)
@@ -38,15 +111,22 @@ class SohuIE(InfoExtractor):
mytv = mobj.group('mytv') is not None
webpage = self._download_webpage(url, video_id)
- raw_title = self._html_search_regex(
- r'(?s)<title>(.+?)</title>',
- webpage, 'video title')
- title = raw_title.partition('-')[0].strip()
+
+ title = re.sub(r' - 搜狐视频$', '', self._og_search_title(webpage))
vid = self._html_search_regex(
r'var vid ?= ?["\'](\d+)["\']',
webpage, 'video path')
vid_data = _fetch_data(vid, mytv)
+ if vid_data['play'] != 1:
+ if vid_data.get('status') == 12:
+ raise ExtractorError(
+ 'Sohu said: There\'s something wrong in the video.',
+ expected=True)
+ else:
+ raise ExtractorError(
+ 'Sohu said: The video is only licensed to users in Mainland China.',
+ expected=True)
formats_json = {}
for format_id in ('nor', 'high', 'super', 'ori', 'h2644k', 'h2654k'):
@@ -63,21 +143,41 @@ class SohuIE(InfoExtractor):
formats = []
for format_id, format_data in formats_json.items():
allot = format_data['allot']
- prot = format_data['prot']
data = format_data['data']
clips_url = data['clipsURL']
su = data['su']
- part_str = self._download_webpage(
- 'http://%s/?prot=%s&file=%s&new=%s' %
- (allot, prot, clips_url[i], su[i]),
- video_id,
- 'Downloading %s video URL part %d of %d'
- % (format_id, i + 1, part_count))
+ video_url = 'newflv.sohu.ccgslb.net'
+ cdnId = None
+ retries = 0
+
+ while 'newflv.sohu.ccgslb.net' in video_url:
+ params = {
+ 'prot': 9,
+ 'file': clips_url[i],
+ 'new': su[i],
+ 'prod': 'flash',
+ }
- part_info = part_str.split('|')
- video_url = '%s%s?key=%s' % (part_info[0], su[i], part_info[3])
+ if cdnId is not None:
+ params['idc'] = cdnId
+
+ download_note = 'Downloading %s video URL part %d of %d' % (
+ format_id, i + 1, part_count)
+
+ if retries > 0:
+ download_note += ' (retry #%d)' % retries
+ part_info = self._parse_json(self._download_webpage(
+ 'http://%s/?%s' % (allot, compat_urllib_parse.urlencode(params)),
+ video_id, download_note), video_id)
+
+ video_url = part_info['url']
+ cdnId = part_info.get('nid')
+
+ retries += 1
+ if retries > 5:
+ raise ExtractorError('Failed to get video URL')
formats.append({
'url': video_url,
@@ -101,9 +201,10 @@ class SohuIE(InfoExtractor):
info['id'] = video_id
else:
info = {
- '_type': 'playlist',
+ '_type': 'multi_video',
'entries': playlist,
'id': video_id,
+ 'title': title,
}
return info
diff --git a/youtube_dl/extractor/soompi.py b/youtube_dl/extractor/soompi.py
new file mode 100644
index 000000000..5da66ca9e
--- /dev/null
+++ b/youtube_dl/extractor/soompi.py
@@ -0,0 +1,146 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .crunchyroll import CrunchyrollIE
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ remove_start,
+ xpath_text,
+)
+
+
+class SoompiBaseIE(InfoExtractor):
+    def _get_episodes(self, webpage, episode_filter=None):
+        """Parse the ``VIDEOS = [...]`` JSON array embedded in *webpage*.
+
+        Returns a list of the entries for which *episode_filter* is truthy;
+        with no filter, only truthy entries are kept.
+        """
+        episodes_json = self._search_regex(
+            r'VIDEOS\s*=\s*(\[.+?\]);', webpage, 'episodes JSON')
+        episodes = self._parse_json(episodes_json, None)
+        if episode_filter is None:
+            # Same semantics as filter(None, ...): drop falsy entries
+            return [episode for episode in episodes if episode]
+        return [episode for episode in episodes if episode_filter(episode)]
+
+
+class SoompiIE(SoompiBaseIE, CrunchyrollIE):
+ IE_NAME = 'soompi'
+ _VALID_URL = r'https?://tv\.soompi\.com/(?:en/)?watch/(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'http://tv.soompi.com/en/watch/29235',
+ 'info_dict': {
+ 'id': '29235',
+ 'ext': 'mp4',
+ 'title': 'Episode 1096',
+ 'description': '2015-05-20'
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }]
+
+    def _get_episode(self, webpage, video_id):
+        """Return the first episode entry in *webpage* whose 'id' equals *video_id*.
+
+        Raises IndexError when no entry matches.
+        """
+        matching = self._get_episodes(
+            webpage, lambda episode: episode['id'] == video_id)
+        return matching[0]
+
+    def _get_subtitles(self, config, video_id):
+        """Decrypt and convert the subtitles preloaded in the *config* XML.
+
+        The ``subtitles/subtitle`` elements map subtitle ids to titles; the
+        ``subtitle`` elements carry the encrypted payload (``data`` and ``iv``).
+        Entries with no known title, or missing id, iv or data, are skipped.
+
+        Returns a dict keyed by subtitle title.
+        """
+        sub_langs = {}
+        for subtitle in config.findall('./{default}preload/subtitles/subtitle'):
+            sub_langs[subtitle.attrib['id']] = subtitle.attrib['title']
+
+        subtitles = {}
+        for s in config.findall('./{default}preload/subtitle'):
+            lang_code = sub_langs.get(s.attrib['id'])
+            if not lang_code:
+                continue
+            sub_id = s.get('id')
+            data = xpath_text(s, './data', 'data')
+            iv = xpath_text(s, './iv', 'iv')
+            # Check sub_id, not the builtin ``id`` (which is always truthy)
+            if not sub_id or not iv or not data:
+                continue
+            subtitle = self._decrypt_subtitles(data, iv, sub_id).decode('utf-8')
+            subtitles[lang_code] = self._extract_subtitles(subtitle)
+        return subtitles
+
+    def _real_extract(self, url):
+        """Extract formats, subtitles and metadata for one Soompi TV episode.
+
+        For every quality id referenced in the episode page, the matching
+        config XML is downloaded and its HLS manifest expanded into formats.
+        Subtitles are read from the last config XML downloaded.
+
+        When the page request fails with HTTP 403 and the error body contains
+        a block message, that message is raised as an expected ExtractorError;
+        any other download error is re-raised unchanged.
+        """
+        video_id = self._match_id(url)
+
+        try:
+            webpage = self._download_webpage(
+                url, video_id, 'Downloading episode page')
+        except ExtractorError as ee:
+            if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
+                # The error body is raw bytes; decode it so the str regex
+                # below can be applied on Python 3 as well
+                webpage = ee.cause.read().decode('utf-8', 'replace')
+                block_message = self._html_search_regex(
+                    r'(?s)<div class="block-message">(.+?)</div>', webpage,
+                    'block message', default=None)
+                if block_message:
+                    raise ExtractorError(block_message, expected=True)
+            raise
+
+        formats = []
+        config = None
+        for format_id in re.findall(r'\?quality=([0-9a-zA-Z]+)', webpage):
+            config = self._download_xml(
+                'http://tv.soompi.com/en/show/_/%s-config.xml?mode=hls&quality=%s' % (video_id, format_id),
+                video_id, 'Downloading %s XML' % format_id)
+            m3u8_url = xpath_text(
+                config, './{default}preload/stream_info/file',
+                '%s m3u8 URL' % format_id)
+            if not m3u8_url:
+                continue
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, video_id, 'mp4', m3u8_id=format_id))
+        self._sort_formats(formats)
+
+        episode = self._get_episode(webpage, video_id)
+
+        title = episode['name']
+        description = episode.get('description')
+        duration = int_or_none(episode.get('duration'))
+
+        thumbnails = [{
+            'id': thumbnail_id,
+            'url': thumbnail_url,
+        } for thumbnail_id, thumbnail_url in episode.get('img_url', {}).items()]
+
+        subtitles = self.extract_subtitles(config, video_id)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnails': thumbnails,
+            'duration': duration,
+            'formats': formats,
+            'subtitles': subtitles
+        }
+
+
+class SoompiShowIE(SoompiBaseIE):
+ IE_NAME = 'soompi:show'
+ _VALID_URL = r'https?://tv\.soompi\.com/en/shows/(?P<id>[0-9a-zA-Z\-_]+)'
+ _TESTS = [{
+ 'url': 'http://tv.soompi.com/en/shows/liar-game',
+ 'info_dict': {
+ 'id': 'liar-game',
+ 'title': 'Liar Game',
+ 'description': 'md5:52c02bce0c1a622a95823591d0589b66',
+ },
+ 'playlist_count': 14,
+ }]
+
+    def _real_extract(self, url):
+        """Build a playlist with one Soompi watch-page entry per episode listed on a show page."""
+        show_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            url, show_id, 'Downloading show page')
+
+        # The og:title has its 'SoompiTV | ' site prefix removed
+        og_title = self._og_search_title(webpage)
+        title = remove_start(og_title, 'SoompiTV | ')
+        description = self._og_search_description(webpage)
+
+        entries = []
+        for episode in self._get_episodes(webpage):
+            watch_url = 'http://tv.soompi.com/en/watch/%s' % episode['id']
+            entries.append(self.url_result(watch_url, 'Soompi'))
+
+        return self.playlist_result(entries, show_id, title, description)
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py
index c5284fa67..02e64e094 100644
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -4,13 +4,17 @@ from __future__ import unicode_literals
import re
import itertools
-from .common import InfoExtractor
+from .common import (
+ InfoExtractor,
+ SearchInfoExtractor
+)
from ..compat import (
compat_str,
compat_urlparse,
compat_urllib_parse,
)
from ..utils import (
+ encode_dict,
ExtractorError,
int_or_none,
unified_strdate,
@@ -29,7 +33,7 @@ class SoundcloudIE(InfoExtractor):
_VALID_URL = r'''(?x)^(?:https?://)?
(?:(?:(?:www\.|m\.)?soundcloud\.com/
(?P<uploader>[\w\d-]+)/
- (?!sets/|likes/?(?:$|[?#]))
+ (?!(?:tracks|sets(?:/[^/?#]+)?|reposts|likes|spotlight)/?(?:$|[?#]))
(?P<title>[\w\d-]+)/?
(?P<token>[^?]+?)?(?:[?].*)?$)
|(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+)
@@ -113,7 +117,7 @@ class SoundcloudIE(InfoExtractor):
},
]
- _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28'
+ _CLIENT_ID = '02gUJC0hH2ct1EGOcYXQIzRFU91c72Ea'
_IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf'
def report_resolve(self, video_id):
@@ -180,7 +184,7 @@ class SoundcloudIE(InfoExtractor):
'format_id': key,
'url': url,
'play_path': 'mp3:' + path,
- 'ext': ext,
+ 'ext': 'flv',
'vcodec': 'none',
})
@@ -200,8 +204,9 @@ class SoundcloudIE(InfoExtractor):
if f['format_id'].startswith('rtmp'):
f['protocol'] = 'rtmp'
- self._sort_formats(formats)
- result['formats'] = formats
+ self._check_formats(formats, track_id)
+ self._sort_formats(formats)
+ result['formats'] = formats
return result
@@ -220,7 +225,12 @@ class SoundcloudIE(InfoExtractor):
info_json_url += "&secret_token=" + token
elif mobj.group('player'):
query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
- return self.url_result(query['url'][0])
+ real_url = query['url'][0]
+ # If the token is in the query of the original url we have to
+ # manually add it
+ if 'secret_token' in query:
+ real_url += '?secret_token=' + query['secret_token'][0]
+ return self.url_result(real_url)
else:
# extract uploader (which is in the url)
uploader = mobj.group('uploader')
@@ -241,7 +251,7 @@ class SoundcloudIE(InfoExtractor):
class SoundcloudSetIE(SoundcloudIE):
- _VALID_URL = r'https?://(?:www\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[\w\d-]+)(?:/(?P<token>[^?/]+))?'
+ _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[\w\d-]+)(?:/(?P<token>[^?/]+))?'
IE_NAME = 'soundcloud:set'
_TESTS = [{
'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep',
@@ -273,70 +283,153 @@ class SoundcloudSetIE(SoundcloudIE):
info = self._download_json(resolv_url, full_title)
if 'errors' in info:
- for err in info['errors']:
- self._downloader.report_error('unable to download video webpage: %s' % compat_str(err['error_message']))
- return
+ msgs = (compat_str(err['error_message']) for err in info['errors'])
+ raise ExtractorError('unable to download video webpage: %s' % ','.join(msgs))
+
+ entries = [self.url_result(track['permalink_url'], 'Soundcloud') for track in info['tracks']]
return {
'_type': 'playlist',
- 'entries': [self._extract_info_dict(track, secret_token=token) for track in info['tracks']],
+ 'entries': entries,
'id': '%s' % info['id'],
'title': info['title'],
}
class SoundcloudUserIE(SoundcloudIE):
- _VALID_URL = r'https?://(www\.)?soundcloud\.com/(?P<user>[^/]+)/?((?P<rsrc>tracks|likes)/?)?(\?.*)?$'
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:(?:www|m)\.)?soundcloud\.com/
+ (?P<user>[^/]+)
+ (?:/
+ (?P<rsrc>tracks|sets|reposts|likes|spotlight)
+ )?
+ /?(?:[?#].*)?$
+ '''
IE_NAME = 'soundcloud:user'
_TESTS = [{
- 'url': 'https://soundcloud.com/the-concept-band',
+ 'url': 'https://soundcloud.com/the-akashic-chronicler',
+ 'info_dict': {
+ 'id': '114582580',
+ 'title': 'The Akashic Chronicler (All)',
+ },
+ 'playlist_mincount': 111,
+ }, {
+ 'url': 'https://soundcloud.com/the-akashic-chronicler/tracks',
+ 'info_dict': {
+ 'id': '114582580',
+ 'title': 'The Akashic Chronicler (Tracks)',
+ },
+ 'playlist_mincount': 50,
+ }, {
+ 'url': 'https://soundcloud.com/the-akashic-chronicler/sets',
'info_dict': {
- 'id': '9615865',
- 'title': 'The Royal Concept',
+ 'id': '114582580',
+ 'title': 'The Akashic Chronicler (Playlists)',
},
- 'playlist_mincount': 12
+ 'playlist_mincount': 3,
}, {
- 'url': 'https://soundcloud.com/the-concept-band/likes',
+ 'url': 'https://soundcloud.com/the-akashic-chronicler/reposts',
'info_dict': {
- 'id': '9615865',
- 'title': 'The Royal Concept',
+ 'id': '114582580',
+ 'title': 'The Akashic Chronicler (Reposts)',
+ },
+ 'playlist_mincount': 7,
+ }, {
+ 'url': 'https://soundcloud.com/the-akashic-chronicler/likes',
+ 'info_dict': {
+ 'id': '114582580',
+ 'title': 'The Akashic Chronicler (Likes)',
+ },
+ 'playlist_mincount': 321,
+ }, {
+ 'url': 'https://soundcloud.com/grynpyret/spotlight',
+ 'info_dict': {
+ 'id': '7098329',
+ 'title': 'Grynpyret (Spotlight)',
},
'playlist_mincount': 1,
}]
+ _API_BASE = 'https://api.soundcloud.com'
+ _API_V2_BASE = 'https://api-v2.soundcloud.com'
+
+ _BASE_URL_MAP = {
+ 'all': '%s/profile/soundcloud:users:%%s' % _API_V2_BASE,
+ 'tracks': '%s/users/%%s/tracks' % _API_BASE,
+ 'sets': '%s/users/%%s/playlists' % _API_V2_BASE,
+ 'reposts': '%s/profile/soundcloud:users:%%s/reposts' % _API_V2_BASE,
+ 'likes': '%s/users/%%s/likes' % _API_V2_BASE,
+ 'spotlight': '%s/users/%%s/spotlight' % _API_V2_BASE,
+ }
+
+ _TITLE_MAP = {
+ 'all': 'All',
+ 'tracks': 'Tracks',
+ 'sets': 'Playlists',
+ 'reposts': 'Reposts',
+ 'likes': 'Likes',
+ 'spotlight': 'Spotlight',
+ }
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
uploader = mobj.group('user')
- resource = mobj.group('rsrc')
- if resource is None:
- resource = 'tracks'
- elif resource == 'likes':
- resource = 'favorites'
url = 'http://soundcloud.com/%s/' % uploader
resolv_url = self._resolv_url(url)
user = self._download_json(
resolv_url, uploader, 'Downloading user info')
- base_url = 'http://api.soundcloud.com/users/%s/%s.json?' % (uploader, resource)
+
+ resource = mobj.group('rsrc') or 'all'
+ base_url = self._BASE_URL_MAP[resource] % user['id']
+
+ next_href = None
entries = []
for i in itertools.count():
- data = compat_urllib_parse.urlencode({
- 'offset': i * 50,
- 'limit': 50,
- 'client_id': self._CLIENT_ID,
- })
- new_entries = self._download_json(
- base_url + data, uploader, 'Downloading track page %s' % (i + 1))
- if len(new_entries) == 0:
+ if not next_href:
+ data = compat_urllib_parse.urlencode({
+ 'offset': i * 50,
+ 'limit': 50,
+ 'client_id': self._CLIENT_ID,
+ 'linked_partitioning': '1',
+ 'representation': 'speedy',
+ })
+ next_href = base_url + '?' + data
+
+ response = self._download_json(
+ next_href, uploader, 'Downloading track page %s' % (i + 1))
+
+ collection = response['collection']
+
+ if not collection:
self.to_screen('%s: End page received' % uploader)
break
- entries.extend(self._extract_info_dict(e, quiet=True) for e in new_entries)
+
+ def resolve_permalink_url(candidates):
+ for cand in candidates:
+ if isinstance(cand, dict):
+ permalink_url = cand.get('permalink_url')
+ if permalink_url and permalink_url.startswith('http'):
+ return permalink_url
+
+ for e in collection:
+ permalink_url = resolve_permalink_url((e, e.get('track'), e.get('playlist')))
+ if permalink_url:
+ entries.append(self.url_result(permalink_url))
+
+ if 'next_href' in response:
+ next_href = response['next_href']
+ if not next_href:
+ break
+ else:
+ next_href = None
return {
'_type': 'playlist',
'id': compat_str(user['id']),
- 'title': user['username'],
+ 'title': '%s (%s)' % (user['username'], self._TITLE_MAP[resource]),
'entries': entries,
}
@@ -371,9 +464,7 @@ class SoundcloudPlaylistIE(SoundcloudIE):
data = self._download_json(
base_url + data, playlist_id, 'Downloading playlist')
- entries = [
- self._extract_info_dict(t, quiet=True, secret_token=token)
- for t in data['tracks']]
+ entries = [self.url_result(track['permalink_url'], 'Soundcloud') for track in data['tracks']]
return {
'_type': 'playlist',
@@ -382,3 +473,60 @@ class SoundcloudPlaylistIE(SoundcloudIE):
'description': data.get('description'),
'entries': entries,
}
+
+
+class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE):
+ IE_NAME = 'soundcloud:search'
+ IE_DESC = 'Soundcloud search'
+ _MAX_RESULTS = float('inf')
+ _TESTS = [{
+ 'url': 'scsearch15:post-avant jazzcore',
+ 'info_dict': {
+ 'title': 'post-avant jazzcore',
+ },
+ 'playlist_count': 15,
+ }]
+
+ _SEARCH_KEY = 'scsearch'
+ _MAX_RESULTS_PER_PAGE = 200
+ _DEFAULT_RESULTS_PER_PAGE = 50
+ _API_V2_BASE = 'https://api-v2.soundcloud.com'
+
+ def _get_collection(self, endpoint, collection_id, **query):
+ limit = min(
+ query.get('limit', self._DEFAULT_RESULTS_PER_PAGE),
+ self._MAX_RESULTS_PER_PAGE)
+ query['limit'] = limit
+ query['client_id'] = self._CLIENT_ID
+ query['linked_partitioning'] = '1'
+ query['offset'] = 0
+ data = compat_urllib_parse.urlencode(encode_dict(query))
+ next_url = '{0}{1}?{2}'.format(self._API_V2_BASE, endpoint, data)
+
+ collected_results = 0
+
+ for i in itertools.count(1):
+ response = self._download_json(
+ next_url, collection_id, 'Downloading page {0}'.format(i),
+ 'Unable to download API page')
+
+ collection = response.get('collection', [])
+ if not collection:
+ break
+
+ collection = list(filter(bool, collection))
+ collected_results += len(collection)
+
+ for item in collection:
+ yield self.url_result(item['uri'], SoundcloudIE.ie_key())
+
+ if not collection or collected_results >= limit:
+ break
+
+ next_url = response.get('next_href')
+ if not next_url:
+ break
+
+ def _get_n_results(self, query, n):
+ tracks = self._get_collection('/search/tracks', query, limit=n, q=query)
+ return self.playlist_result(tracks, playlist_title=query)
diff --git a/youtube_dl/extractor/soundgasm.py b/youtube_dl/extractor/soundgasm.py
index a4f8ce6c3..3a4ddf57e 100644
--- a/youtube_dl/extractor/soundgasm.py
+++ b/youtube_dl/extractor/soundgasm.py
@@ -7,6 +7,7 @@ from .common import InfoExtractor
class SoundgasmIE(InfoExtractor):
+ IE_NAME = 'soundgasm'
_VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<user>[0-9a-zA-Z_\-]+)/(?P<title>[0-9a-zA-Z_\-]+)'
_TEST = {
'url': 'http://soundgasm.net/u/ytdl/Piano-sample',
@@ -38,3 +39,26 @@ class SoundgasmIE(InfoExtractor):
'title': audio_title,
'description': description
}
+
+
+class SoundgasmProfileIE(InfoExtractor):
+ IE_NAME = 'soundgasm:profile'
+ _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<id>[^/]+)/?(?:\#.*)?$'
+ _TEST = {
+ 'url': 'http://soundgasm.net/u/ytdl',
+ 'info_dict': {
+ 'id': 'ytdl',
+ },
+ 'playlist_count': 1,
+ }
+
+ def _real_extract(self, url):
+ profile_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, profile_id)
+
+ entries = [
+ self.url_result(audio_url, 'Soundgasm')
+ for audio_url in re.findall(r'href="([^"]+/u/%s/[^"]+)' % profile_id, webpage)]
+
+ return self.playlist_result(entries, profile_id)
diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py
index c20397b3d..87b650468 100644
--- a/youtube_dl/extractor/southpark.py
+++ b/youtube_dl/extractor/southpark.py
@@ -1,3 +1,4 @@
+# encoding: utf-8
from __future__ import unicode_literals
from .mtv import MTVServicesInfoExtractor
@@ -5,7 +6,7 @@ from .mtv import MTVServicesInfoExtractor
class SouthParkIE(MTVServicesInfoExtractor):
IE_NAME = 'southpark.cc.com'
- _VALID_URL = r'https?://(www\.)?(?P<url>southpark\.cc\.com/(clips|full-episodes)/(?P<id>.+?)(\?|#|$))'
+ _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/(?:clips|full-episodes)/(?P<id>.+?)(\?|#|$))'
_FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss'
@@ -20,9 +21,20 @@ class SouthParkIE(MTVServicesInfoExtractor):
}]
-class SouthparkDeIE(SouthParkIE):
+class SouthParkEsIE(SouthParkIE):
+ IE_NAME = 'southpark.cc.com:español'
+ _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/episodios-en-espanol/(?P<id>.+?)(\?|#|$))'
+ _LANG = 'es'
+
+ _TESTS = [{
+ 'url': 'http://southpark.cc.com/episodios-en-espanol/s01e01-cartman-consigue-una-sonda-anal#source=351c1323-0b96-402d-a8b9-40d01b2e9bde&position=1&sort=!airdate',
+ 'playlist_count': 4,
+ }]
+
+
+class SouthParkDeIE(SouthParkIE):
IE_NAME = 'southpark.de'
- _VALID_URL = r'https?://(www\.)?(?P<url>southpark\.de/(clips|alle-episoden)/(?P<id>.+?)(\?|#|$))'
+ _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.de/(?:clips|alle-episoden)/(?P<id>.+?)(\?|#|$))'
_FEED_URL = 'http://www.southpark.de/feeds/video-player/mrss/'
_TESTS = [{
@@ -33,4 +45,34 @@ class SouthparkDeIE(SouthParkIE):
'title': 'The Government Won\'t Respect My Privacy',
'description': 'Cartman explains the benefits of "Shitter" to Stan, Kyle and Craig.',
},
+ }, {
+ # non-ASCII characters in initial URL
+ 'url': 'http://www.southpark.de/alle-episoden/s18e09-hashtag-aufwärmen',
+ 'playlist_count': 4,
+ }, {
+ # non-ASCII characters in redirect URL
+ 'url': 'http://www.southpark.de/alle-episoden/s18e09',
+ 'playlist_count': 4,
+ }]
+
+
+class SouthParkNlIE(SouthParkIE):
+ IE_NAME = 'southpark.nl'
+ _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.nl/(?:clips|full-episodes)/(?P<id>.+?)(\?|#|$))'
+ _FEED_URL = 'http://www.southpark.nl/feeds/video-player/mrss/'
+
+ _TESTS = [{
+ 'url': 'http://www.southpark.nl/full-episodes/s18e06-freemium-isnt-free',
+ 'playlist_count': 4,
+ }]
+
+
+class SouthParkDkIE(SouthParkIE):
+ IE_NAME = 'southparkstudios.dk'
+ _VALID_URL = r'https?://(?:www\.)?(?P<url>southparkstudios\.dk/(?:clips|full-episodes)/(?P<id>.+?)(\?|#|$))'
+ _FEED_URL = 'http://www.southparkstudios.dk/feeds/video-player/mrss/'
+
+ _TESTS = [{
+ 'url': 'http://www.southparkstudios.dk/full-episodes/s18e07-grounded-vindaloop',
+ 'playlist_count': 4,
}]
diff --git a/youtube_dl/extractor/space.py b/youtube_dl/extractor/space.py
index c2d0d36a6..ebb5d6ec0 100644
--- a/youtube_dl/extractor/space.py
+++ b/youtube_dl/extractor/space.py
@@ -3,14 +3,14 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from .brightcove import BrightcoveIE
+from .brightcove import BrightcoveLegacyIE
from ..utils import RegexNotFoundError, ExtractorError
class SpaceIE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:www|m)\.)?space\.com/\d+-(?P<title>[^/\.\?]*?)-video\.html'
_TEST = {
- 'add_ie': ['Brightcove'],
+ 'add_ie': ['BrightcoveLegacy'],
'url': 'http://www.space.com/23373-huge-martian-landforms-detail-revealed-by-european-probe-video.html',
'info_dict': {
'id': '2780937028001',
@@ -31,8 +31,8 @@ class SpaceIE(InfoExtractor):
brightcove_url = self._og_search_video_url(webpage)
except RegexNotFoundError:
# Other videos works fine with the info from the object
- brightcove_url = BrightcoveIE._extract_brightcove_url(webpage)
+ brightcove_url = BrightcoveLegacyIE._extract_brightcove_url(webpage)
if brightcove_url is None:
raise ExtractorError(
'The webpage does not contain a video', expected=True)
- return self.url_result(brightcove_url, BrightcoveIE.ie_key())
+ return self.url_result(brightcove_url, BrightcoveLegacyIE.ie_key())
diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py
new file mode 100644
index 000000000..7f060b15b
--- /dev/null
+++ b/youtube_dl/extractor/spankbang.py
@@ -0,0 +1,60 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class SpankBangIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:(?:www|[a-z]{2})\.)?spankbang\.com/(?P<id>[\da-z]+)/video'
+ _TEST = {
+ 'url': 'http://spankbang.com/3vvn/video/fantasy+solo',
+ 'md5': '1cc433e1d6aa14bc376535b8679302f7',
+ 'info_dict': {
+ 'id': '3vvn',
+ 'ext': 'mp4',
+ 'title': 'fantasy solo',
+ 'description': 'dillion harper masturbates on a bed',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'uploader': 'silly2587',
+ 'age_limit': 18,
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ stream_key = self._html_search_regex(
+ r'''var\s+stream_key\s*=\s*['"](.+?)['"]''',
+ webpage, 'stream key')
+
+ formats = [{
+ 'url': 'http://spankbang.com/_%s/%s/title/%sp__mp4' % (video_id, stream_key, height),
+ 'ext': 'mp4',
+ 'format_id': '%sp' % height,
+ 'height': int(height),
+ } for height in re.findall(r'<span[^>]+q_(\d+)p', webpage)]
+ self._sort_formats(formats)
+
+ title = self._html_search_regex(
+ r'(?s)<h1>(.+?)</h1>', webpage, 'title')
+ description = self._search_regex(
+ r'class="desc"[^>]*>([^<]+)',
+ webpage, 'description', default=None)
+ thumbnail = self._og_search_thumbnail(webpage)
+ uploader = self._search_regex(
+ r'class="user"[^>]*>([^<]+)',
+ webpage, 'uploader', fatal=False)
+
+ age_limit = self._rta_search(webpage)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'uploader': uploader,
+ 'formats': formats,
+ 'age_limit': age_limit,
+ }
diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py
index b936202f6..692fd78e8 100644
--- a/youtube_dl/extractor/spankwire.py
+++ b/youtube_dl/extractor/spankwire.py
@@ -4,11 +4,11 @@ import re
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse,
+ compat_urllib_parse_unquote,
compat_urllib_parse_urlparse,
- compat_urllib_request,
)
from ..utils import (
+ sanitized_Request,
str_to_int,
unified_strdate,
)
@@ -16,8 +16,9 @@ from ..aes import aes_decrypt_text
class SpankwireIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?(?P<url>spankwire\.com/[^/]*/video(?P<videoid>[0-9]+)/?)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?(?P<url>spankwire\.com/[^/]*/video(?P<id>[0-9]+)/?)'
+ _TESTS = [{
+ # download URL pattern: */<height>P_<tbr>K_<video_id>.mp4
'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/',
'md5': '8bbfde12b101204b39e4b9fe7eb67095',
'info_dict': {
@@ -27,24 +28,37 @@ class SpankwireIE(InfoExtractor):
'description': 'Crazy Bitch X rated music video.',
'uploader': 'oreusz',
'uploader_id': '124697',
- 'upload_date': '20070508',
+ 'upload_date': '20070507',
'age_limit': 18,
}
- }
+ }, {
+ # download URL pattern: */mp4_<format_id>_<video_id>.mp4
+ 'url': 'http://www.spankwire.com/Titcums-Compiloation-I/video1921551/',
+ 'md5': '09b3c20833308b736ae8902db2f8d7e6',
+ 'info_dict': {
+ 'id': '1921551',
+ 'ext': 'mp4',
+ 'title': 'Titcums Compiloation I',
+ 'description': 'cum on tits',
+ 'uploader': 'dannyh78999',
+ 'uploader_id': '3056053',
+ 'upload_date': '20150822',
+ 'age_limit': 18,
+ },
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('videoid')
- url = 'http://www.' + mobj.group('url')
+ video_id = mobj.group('id')
- req = compat_urllib_request.Request(url)
+ req = sanitized_Request('http://www.' + mobj.group('url'))
req.add_header('Cookie', 'age_verified=1')
webpage = self._download_webpage(req, video_id)
title = self._html_search_regex(
r'<h1>([^<]+)', webpage, 'title')
description = self._html_search_regex(
- r'<div\s+id="descriptionContent">([^<]+)<',
+ r'(?s)<div\s+id="descriptionContent">(.+?)</div>',
webpage, 'description', fatal=False)
thumbnail = self._html_search_regex(
r'playerData\.screenShot\s*=\s*["\']([^"\']+)["\']',
@@ -54,7 +68,7 @@ class SpankwireIE(InfoExtractor):
r'by:\s*<a [^>]*>(.+?)</a>',
webpage, 'uploader', fatal=False)
uploader_id = self._html_search_regex(
- r'by:\s*<a href="/Profile\.aspx\?.*?UserId=(\d+).*?"',
+ r'by:\s*<a href="/(?:user/viewProfile|Profile\.aspx)\?.*?UserId=(\d+).*?"',
webpage, 'uploader id', fatal=False)
upload_date = unified_strdate(self._html_search_regex(
r'</a> on (.+?) at \d+:\d+',
@@ -64,14 +78,15 @@ class SpankwireIE(InfoExtractor):
r'<div id="viewsCounter"><span>([\d,\.]+)</span> views</div>',
webpage, 'view count', fatal=False))
comment_count = str_to_int(self._html_search_regex(
- r'Comments<span[^>]+>\s*\(([\d,\.]+)\)</span>',
+ r'<span\s+id="spCommentCount"[^>]*>([\d,\.]+)</span>',
webpage, 'comment count', fatal=False))
- video_urls = list(map(
- compat_urllib_parse.unquote,
- re.findall(r'playerData\.cdnPath[0-9]{3,}\s*=\s*["\']([^"\']+)["\']', webpage)))
+ videos = re.findall(
+ r'playerData\.cdnPath([0-9]{3,})\s*=\s*(?:encodeURIComponent\()?["\']([^"\']+)["\']', webpage)
+ heights = [int(video[0]) for video in videos]
+ video_urls = list(map(compat_urllib_parse_unquote, [video[1] for video in videos]))
if webpage.find('flashvars\.encrypted = "true"') != -1:
- password = self._html_search_regex(
+ password = self._search_regex(
r'flashvars\.video_title = "([^"]+)',
webpage, 'password').replace('+', ' ')
video_urls = list(map(
@@ -79,21 +94,22 @@ class SpankwireIE(InfoExtractor):
video_urls))
formats = []
- for video_url in video_urls:
+ for height, video_url in zip(heights, video_urls):
path = compat_urllib_parse_urlparse(video_url).path
- format = path.split('/')[4].split('_')[:2]
- resolution, bitrate_str = format
- format = "-".join(format)
- height = int(resolution.rstrip('Pp'))
- tbr = int(bitrate_str.rstrip('Kk'))
- formats.append({
+ _, quality = path.split('/')[4].split('_')[:2]
+ f = {
'url': video_url,
- 'resolution': resolution,
- 'format': format,
- 'tbr': tbr,
'height': height,
- 'format_id': format,
- })
+ }
+ tbr = self._search_regex(r'^(\d+)[Kk]$', quality, 'tbr', default=None)
+ if tbr:
+ f.update({
+ 'tbr': int(tbr),
+ 'format_id': '%dp' % height,
+ })
+ else:
+ f['format_id'] = quality
+ formats.append(f)
self._sort_formats(formats)
age_limit = self._rta_search(webpage)
diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py
index b868241d5..5bd3c0087 100644
--- a/youtube_dl/extractor/spiegel.py
+++ b/youtube_dl/extractor/spiegel.py
@@ -9,7 +9,7 @@ from .spiegeltv import SpiegeltvIE
class SpiegelIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<id>[0-9]+)(?:-embed)?(?:\.html)?(?:#.*)?$'
+ _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<id>[0-9]+)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$'
_TESTS = [{
'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html',
'md5': '2c2754212136f35fb4b19767d242f66e',
@@ -39,6 +39,9 @@ class SpiegelIE(InfoExtractor):
'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.',
'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"',
}
+ }, {
+ 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html',
+ 'only_matching': True,
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py
index 98cf92d89..034bd47ff 100644
--- a/youtube_dl/extractor/spiegeltv.py
+++ b/youtube_dl/extractor/spiegeltv.py
@@ -2,7 +2,11 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import float_or_none
+from ..compat import compat_urllib_parse_urlparse
+from ..utils import (
+ determine_ext,
+ float_or_none,
+)
class SpiegeltvIE(InfoExtractor):
@@ -17,7 +21,7 @@ class SpiegeltvIE(InfoExtractor):
'thumbnail': 're:http://.*\.jpg$',
},
'params': {
- # rtmp download
+ # m3u8 download
'skip_download': True,
}
}, {
@@ -51,9 +55,43 @@ class SpiegeltvIE(InfoExtractor):
is_wide = media_json['is_wide']
server_json = self._download_json(
- 'http://www.spiegel.tv/streaming_servers/', video_id,
- note='Downloading server information')
- server = server_json[0]['endpoint']
+ 'http://spiegeltv-prod-static.s3.amazonaws.com/projectConfigs/projectConfig.json',
+ video_id, note='Downloading server information')
+
+ format = '16x9' if is_wide else '4x3'
+
+ formats = []
+ for streamingserver in server_json['streamingserver']:
+ endpoint = streamingserver.get('endpoint')
+ if not endpoint:
+ continue
+ play_path = 'mp4:%s_spiegeltv_0500_%s.m4v' % (uuid, format)
+ if endpoint.startswith('rtmp'):
+ formats.append({
+ 'url': endpoint,
+ 'format_id': 'rtmp',
+ 'app': compat_urllib_parse_urlparse(endpoint).path[1:],
+ 'play_path': play_path,
+ 'player_path': 'http://prod-static.spiegel.tv/frontend-076.swf',
+ 'ext': 'flv',
+ 'rtmp_live': True,
+ })
+ elif determine_ext(endpoint) == 'm3u8':
+ formats.append({
+ 'url': endpoint.replace('[video]', play_path),
+ 'ext': 'm4v',
+ 'format_id': 'hls', # Prefer hls since it allows to workaround georestriction
+ 'protocol': 'm3u8',
+ 'preference': 1,
+ 'http_headers': {
+ 'Accept-Encoding': 'deflate', # gzip causes trouble on the server side
+ },
+ })
+ else:
+ formats.append({
+ 'url': endpoint,
+ })
+ self._check_formats(formats, video_id)
thumbnails = []
for image in media_json['images']:
@@ -65,16 +103,12 @@ class SpiegeltvIE(InfoExtractor):
description = media_json['subtitle']
duration = float_or_none(media_json.get('duration_in_ms'), scale=1000)
- format = '16x9' if is_wide else '4x3'
-
- url = server + 'mp4:' + uuid + '_spiegeltv_0500_' + format + '.m4v'
return {
'id': video_id,
'title': title,
- 'url': url,
- 'ext': 'm4v',
'description': description,
'duration': duration,
- 'thumbnails': thumbnails
+ 'thumbnails': thumbnails,
+ 'formats': formats,
}
diff --git a/youtube_dl/extractor/spike.py b/youtube_dl/extractor/spike.py
index e529bb55c..182f286df 100644
--- a/youtube_dl/extractor/spike.py
+++ b/youtube_dl/extractor/spike.py
@@ -5,7 +5,7 @@ from .mtv import MTVServicesInfoExtractor
class SpikeIE(MTVServicesInfoExtractor):
_VALID_URL = r'''(?x)https?://
- (?:www\.spike\.com/(?:video-clips|(?:full-)?episodes)/.+|
+ (?:www\.spike\.com/(?:video-(?:clips|playlists)|(?:full-)?episodes)/.+|
m\.spike\.com/videos/video\.rbml\?id=(?P<id>[^&]+))
'''
_TEST = {
diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py
index becdf658f..86d509ae5 100644
--- a/youtube_dl/extractor/sportbox.py
+++ b/youtube_dl/extractor/sportbox.py
@@ -4,37 +4,36 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_urlparse
from ..utils import (
- parse_duration,
- parse_iso8601,
+ unified_strdate,
)
class SportBoxIE(InfoExtractor):
- _VALID_URL = r'https?://news\.sportbox\.ru/Vidy_sporta/(?:[^/]+/)+spbvideo_NI\d+_(?P<display_id>.+)'
- _TESTS = [
- {
- 'url': 'http://news.sportbox.ru/Vidy_sporta/Avtosport/Rossijskij/spbvideo_NI483529_Gonka-2-zaezd-Obyedinenniy-2000-klassi-Turing-i-S',
- 'md5': 'ff56a598c2cf411a9a38a69709e97079',
- 'info_dict': {
- 'id': '80822',
- 'ext': 'mp4',
- 'title': 'Гонка 2 заезд ««Объединенный 2000»: классы Туринг и Супер-продакшн',
- 'description': 'md5:81715fa9c4ea3d9e7915dc8180c778ed',
- 'thumbnail': 're:^https?://.*\.jpg$',
- 'timestamp': 1411896237,
- 'upload_date': '20140928',
- 'duration': 4846,
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
- }, {
- 'url': 'http://news.sportbox.ru/Vidy_sporta/billiard/spbvideo_NI486287_CHempionat-mira-po-dinamichnoy-piramide-4',
- 'only_matching': True,
- }
- ]
+ _VALID_URL = r'https?://news\.sportbox\.ru/(?:[^/]+/)+spbvideo_NI\d+_(?P<display_id>.+)'
+ _TESTS = [{
+ 'url': 'http://news.sportbox.ru/Vidy_sporta/Avtosport/Rossijskij/spbvideo_NI483529_Gonka-2-zaezd-Obyedinenniy-2000-klassi-Turing-i-S',
+ 'md5': 'ff56a598c2cf411a9a38a69709e97079',
+ 'info_dict': {
+ 'id': '80822',
+ 'ext': 'mp4',
+ 'title': 'Гонка 2 заезд ««Объединенный 2000»: классы Туринг и Супер-продакшн',
+ 'description': 'md5:3d72dc4a006ab6805d82f037fdc637ad',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'upload_date': '20140928',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://news.sportbox.ru/Vidy_sporta/billiard/spbvideo_NI486287_CHempionat-mira-po-dinamichnoy-piramide-4',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://news.sportbox.ru/video/no_ads/spbvideo_NI536574_V_Novorossijske_proshel_detskij_turnir_Pole_slavy_bojevoj?ci=211355',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -42,35 +41,75 @@ class SportBoxIE(InfoExtractor):
webpage = self._download_webpage(url, display_id)
- video_id = self._search_regex(
- r'src="/vdl/player/media/(\d+)"', webpage, 'video id')
+ player = self._search_regex(
+ r'src="/?(vdl/player/[^"]+)"', webpage, 'player')
+
+ title = self._html_search_regex(
+ [r'"nodetitle"\s*:\s*"([^"]+)"', r'class="node-header_{1,2}title">([^<]+)'],
+ webpage, 'title')
+ description = self._og_search_description(webpage) or self._html_search_meta(
+ 'description', webpage, 'description')
+ thumbnail = self._og_search_thumbnail(webpage)
+ upload_date = unified_strdate(self._html_search_meta(
+ 'dateCreated', webpage, 'upload date'))
+
+ return {
+ '_type': 'url_transparent',
+ 'url': compat_urlparse.urljoin(url, '/%s' % player),
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ }
- player = self._download_webpage(
- 'http://news.sportbox.ru/vdl/player/media/%s' % video_id,
- display_id, 'Downloading player webpage')
+
+class SportBoxEmbedIE(InfoExtractor):
+ _VALID_URL = r'https?://news\.sportbox\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'http://news.sportbox.ru/vdl/player/ci/211355',
+ 'info_dict': {
+ 'id': '211355',
+ 'ext': 'mp4',
+ 'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://news.sportbox.ru/vdl/player?nid=370908&only_player=1&autostart=false&playeri=2&height=340&width=580',
+ 'only_matching': True,
+ }]
+
+ @staticmethod
+ def _extract_urls(webpage):
+ return re.findall(
+ r'<iframe[^>]+src="(https?://news\.sportbox\.ru/vdl/player[^"]+)"',
+ webpage)
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
hls = self._search_regex(
- r"var\s+original_hls_file\s*=\s*'([^']+)'", player, 'hls file')
+ r"sportboxPlayer\.jwplayer_common_params\.file\s*=\s*['\"]([^'\"]+)['\"]",
+ webpage, 'hls file')
- formats = self._extract_m3u8_formats(hls, display_id, 'mp4')
+ formats = self._extract_m3u8_formats(hls, video_id, 'mp4')
- title = self._html_search_regex(
- r'<h1 itemprop="name">([^<]+)</h1>', webpage, 'title')
- description = self._html_search_regex(
- r'(?s)<div itemprop="description">(.+?)</div>', webpage, 'description', fatal=False)
- thumbnail = self._og_search_thumbnail(webpage)
- timestamp = parse_iso8601(self._search_regex(
- r'<span itemprop="uploadDate">([^<]+)</span>', webpage, 'timestamp', fatal=False))
- duration = parse_duration(self._html_search_regex(
- r'<meta itemprop="duration" content="PT([^"]+)">', webpage, 'duration', fatal=False))
+ title = self._search_regex(
+ r'sportboxPlayer\.node_title\s*=\s*"([^"]+)"', webpage, 'title')
+
+ thumbnail = self._search_regex(
+ r'sportboxPlayer\.jwplayer_common_params\.image\s*=\s*"([^"]+)"',
+ webpage, 'thumbnail', default=None)
return {
'id': video_id,
- 'display_id': display_id,
'title': title,
- 'description': description,
'thumbnail': thumbnail,
- 'timestamp': timestamp,
- 'duration': duration,
'formats': formats,
}
diff --git a/youtube_dl/extractor/sportdeutschland.py b/youtube_dl/extractor/sportdeutschland.py
index 1a57aebf1..ebb75f059 100644
--- a/youtube_dl/extractor/sportdeutschland.py
+++ b/youtube_dl/extractor/sportdeutschland.py
@@ -4,11 +4,9 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_request,
-)
from ..utils import (
parse_iso8601,
+ sanitized_Request,
)
@@ -38,10 +36,12 @@ class SportDeutschlandIE(InfoExtractor):
'upload_date': '20140825',
'description': 'md5:60a20536b57cee7d9a4ec005e8687504',
'timestamp': 1408976060,
+ 'duration': 2732,
'title': 'Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen: Herren Einzel, Wei Lee vs. Keun Lee',
'thumbnail': 're:^https?://.*\.jpg$',
'view_count': int,
'categories': ['Li-Ning Badminton WM 2014'],
+
}
}]
@@ -50,20 +50,19 @@ class SportDeutschlandIE(InfoExtractor):
video_id = mobj.group('id')
sport_id = mobj.group('sport')
- api_url = 'http://splink.tv/api/permalinks/%s/%s' % (
+ api_url = 'http://proxy.vidibusdynamic.net/sportdeutschland.tv/api/permalinks/%s/%s?access_token=true' % (
sport_id, video_id)
- req = compat_urllib_request.Request(api_url, headers={
+ req = sanitized_Request(api_url, headers={
'Accept': 'application/vnd.vidibus.v2.html+json',
'Referer': url,
})
data = self._download_json(req, video_id)
- categories = list(data.get('section', {}).get('tags', {}).values())
asset = data['asset']
- assets_info = self._download_json(asset['url'], video_id)
+ categories = [data['section']['title']]
formats = []
- smil_url = assets_info['video']
+ smil_url = asset['video']
if '.smil' in smil_url:
m3u8_url = smil_url.replace('.smil', '.m3u8')
formats.extend(
@@ -91,6 +90,7 @@ class SportDeutschlandIE(InfoExtractor):
'title': asset['title'],
'thumbnail': asset.get('image'),
'description': asset.get('teaser'),
+ 'duration': asset.get('duration'),
'categories': categories,
'view_count': asset.get('views'),
'rtmp_live': asset.get('live'),
diff --git a/youtube_dl/extractor/srf.py b/youtube_dl/extractor/srf.py
new file mode 100644
index 000000000..77eec0bc7
--- /dev/null
+++ b/youtube_dl/extractor/srf.py
@@ -0,0 +1,104 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ parse_iso8601,
+ xpath_text,
+)
+
+
+class SrfIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.srf\.ch/play(?:er)?/tv/[^/]+/video/(?P<display_id>[^?]+)\?id=|tp\.srgssr\.ch/p/flash\?urn=urn:srf:ais:video:)(?P<id>[0-9a-f\-]{36})'
+ _TESTS = [{
+ 'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5',
+ 'md5': '4cd93523723beff51bb4bee974ee238d',
+ 'info_dict': {
+ 'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5',
+ 'display_id': 'snowden-beantragt-asyl-in-russland',
+ 'ext': 'm4v',
+ 'upload_date': '20130701',
+ 'title': 'Snowden beantragt Asyl in Russland',
+ 'timestamp': 1372713995,
+ }
+ }, {
+ # No Speichern (Save) button
+ 'url': 'http://www.srf.ch/play/tv/top-gear/video/jaguar-xk120-shadow-und-tornado-dampflokomotive?id=677f5829-e473-4823-ac83-a1087fe97faa',
+ 'md5': 'd97e236e80d1d24729e5d0953d276a4f',
+ 'info_dict': {
+ 'id': '677f5829-e473-4823-ac83-a1087fe97faa',
+ 'display_id': 'jaguar-xk120-shadow-und-tornado-dampflokomotive',
+ 'ext': 'flv',
+ 'upload_date': '20130710',
+ 'title': 'Jaguar XK120, Shadow und Tornado-Dampflokomotive',
+ 'timestamp': 1373493600,
+ },
+ }, {
+ 'url': 'http://www.srf.ch/player/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://tp.srgssr.ch/p/flash?urn=urn:srf:ais:video:28e1a57d-5b76-4399-8ab3-9097f071e6c5',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ display_id = re.match(self._VALID_URL, url).group('display_id') or video_id
+
+ video_data = self._download_xml(
+ 'http://il.srgssr.ch/integrationlayer/1.0/ue/srf/video/play/%s.xml' % video_id,
+ display_id)
+
+ title = xpath_text(
+ video_data, './AssetMetadatas/AssetMetadata/title', fatal=True)
+ thumbnails = [{
+ 'url': s.text
+ } for s in video_data.findall('.//ImageRepresentation/url')]
+ timestamp = parse_iso8601(xpath_text(video_data, './createdDate'))
+ # The <duration> field in XML is different from the exact duration, skipping
+
+ formats = []
+ for item in video_data.findall('./Playlists/Playlist') + video_data.findall('./Downloads/Download'):
+ for url_node in item.findall('url'):
+ quality = url_node.attrib['quality']
+ full_url = url_node.text
+ original_ext = determine_ext(full_url)
+ format_id = '%s-%s' % (quality, item.attrib['protocol'])
+ if original_ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ full_url + '?hdcore=3.4.0', display_id, f4m_id=format_id))
+ elif original_ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ full_url, display_id, 'mp4', m3u8_id=format_id))
+ else:
+ formats.append({
+ 'url': full_url,
+ 'ext': original_ext,
+ 'format_id': format_id,
+ 'quality': 0 if 'HD' in quality else -1,
+ 'preference': 1,
+ })
+
+ self._sort_formats(formats)
+
+ subtitles = {}
+ subtitles_data = video_data.find('Subtitles')
+ if subtitles_data is not None:
+ subtitles_list = [{
+ 'url': sub.text,
+ 'ext': determine_ext(sub.text),
+ } for sub in subtitles_data]
+ if subtitles_list:
+ subtitles['de'] = subtitles_list
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'formats': formats,
+ 'title': title,
+ 'thumbnails': thumbnails,
+ 'timestamp': timestamp,
+ 'subtitles': subtitles,
+ }
diff --git a/youtube_dl/extractor/ssa.py b/youtube_dl/extractor/ssa.py
new file mode 100644
index 000000000..13101c714
--- /dev/null
+++ b/youtube_dl/extractor/ssa.py
@@ -0,0 +1,58 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ unescapeHTML,
+ parse_duration,
+)
+
+
+class SSAIE(InfoExtractor):
+ _VALID_URL = r'http://ssa\.nls\.uk/film/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://ssa.nls.uk/film/3561',
+ 'info_dict': {
+ 'id': '3561',
+ 'ext': 'flv',
+ 'title': 'SHETLAND WOOL',
+ 'description': 'md5:c5afca6871ad59b4271e7704fe50ab04',
+ 'duration': 900,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ streamer = self._search_regex(
+ r"'streamer'\s*,\s*'(rtmp[^']+)'", webpage, 'streamer')
+ play_path = self._search_regex(
+ r"'file'\s*,\s*'([^']+)'", webpage, 'file').rpartition('.')[0]
+
+ def search_field(field_name, fatal=False):
+ return self._search_regex(
+ r'<span\s+class="field_title">%s:</span>\s*<span\s+class="field_content">([^<]+)</span>' % field_name,
+ webpage, field_name, fatal=fatal)
+
+ title = unescapeHTML(search_field('Title', fatal=True)).strip('()[]')
+ description = unescapeHTML(search_field('Description'))
+ duration = parse_duration(search_field('Running time'))
+ thumbnail = self._search_regex(
+ r"'image'\s*,\s*'([^']+)'", webpage, 'thumbnails', fatal=False)
+
+ return {
+ 'id': video_id,
+ 'url': streamer,
+ 'play_path': play_path,
+ 'ext': 'flv',
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ 'thumbnail': thumbnail,
+ }
diff --git a/youtube_dl/extractor/stitcher.py b/youtube_dl/extractor/stitcher.py
new file mode 100644
index 000000000..d5c852f52
--- /dev/null
+++ b/youtube_dl/extractor/stitcher.py
@@ -0,0 +1,81 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ int_or_none,
+ js_to_json,
+ unescapeHTML,
+)
+
+
+class StitcherIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?stitcher\.com/podcast/(?:[^/]+/)+e/(?:(?P<display_id>[^/#?&]+?)-)?(?P<id>\d+)(?:[/#?&]|$)'
+ _TESTS = [{
+ 'url': 'http://www.stitcher.com/podcast/the-talking-machines/e/40789481?autoplay=true',
+ 'md5': '391dd4e021e6edeb7b8e68fbf2e9e940',
+ 'info_dict': {
+ 'id': '40789481',
+ 'ext': 'mp3',
+ 'title': 'Machine Learning Mastery and Cancer Clusters',
+ 'description': 'md5:55163197a44e915a14a1ac3a1de0f2d3',
+ 'duration': 1604,
+ 'thumbnail': 're:^https?://.*\.jpg',
+ },
+ }, {
+ 'url': 'http://www.stitcher.com/podcast/panoply/vulture-tv/e/the-rare-hourlong-comedy-plus-40846275?autoplay=true',
+ 'info_dict': {
+ 'id': '40846275',
+ 'display_id': 'the-rare-hourlong-comedy-plus',
+ 'ext': 'mp3',
+ 'title': "The CW's 'Crazy Ex-Girlfriend'",
+ 'description': 'md5:04f1e2f98eb3f5cbb094cea0f9e19b17',
+ 'duration': 2235,
+ 'thumbnail': 're:^https?://.*\.jpg',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # escaped title
+ 'url': 'http://www.stitcher.com/podcast/marketplace-on-stitcher/e/40910226?autoplay=true',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.stitcher.com/podcast/panoply/getting-in/e/episode-2a-how-many-extracurriculars-should-i-have-40876278?autoplay=true',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ audio_id = mobj.group('id')
+ display_id = mobj.group('display_id') or audio_id
+
+ webpage = self._download_webpage(url, display_id)
+
+ episode = self._parse_json(
+ js_to_json(self._search_regex(
+ r'(?s)var\s+stitcher\s*=\s*({.+?});\n', webpage, 'episode config')),
+ display_id)['config']['episode']
+
+ title = unescapeHTML(episode['title'])
+ formats = [{
+ 'url': episode[episode_key],
+ 'ext': determine_ext(episode[episode_key]) or 'mp3',
+ 'vcodec': 'none',
+ } for episode_key in ('episodeURL',) if episode.get(episode_key)]
+ description = self._search_regex(
+ r'Episode Info:\s*</span>([^<]+)<', webpage, 'description', fatal=False)
+ duration = int_or_none(episode.get('duration'))
+ thumbnail = episode.get('episodeImage')
+
+ return {
+ 'id': audio_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/streamcloud.py b/youtube_dl/extractor/streamcloud.py
index d4e134015..77841b946 100644
--- a/youtube_dl/extractor/streamcloud.py
+++ b/youtube_dl/extractor/streamcloud.py
@@ -4,10 +4,8 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
- compat_urllib_request,
-)
+from ..compat import compat_urllib_parse
+from ..utils import sanitized_Request
class StreamcloudIE(InfoExtractor):
@@ -43,7 +41,7 @@ class StreamcloudIE(InfoExtractor):
headers = {
b'Content-Type': b'application/x-www-form-urlencoded',
}
- req = compat_urllib_request.Request(url, post, headers)
+ req = sanitized_Request(url, post, headers)
webpage = self._download_webpage(
req, video_id, note='Downloading video page ...')
diff --git a/youtube_dl/extractor/streamcz.py b/youtube_dl/extractor/streamcz.py
index c3ceb5f76..d3d2b7eb7 100644
--- a/youtube_dl/extractor/streamcz.py
+++ b/youtube_dl/extractor/streamcz.py
@@ -1,14 +1,28 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
+import hashlib
+import time
+
from .common import InfoExtractor
from ..utils import (
int_or_none,
+ sanitized_Request,
)
+def _get_api_key(api_path):
+ if api_path.endswith('?'):
+ api_path = api_path[:-1]
+
+ api_key = 'fb5f58a820353bd7095de526253c14fd'
+ a = '{0:}{1:}{2:}'.format(api_key, api_path, int(round(time.time() / 24 / 3600)))
+ return hashlib.md5(a.encode('ascii')).hexdigest()
+
+
class StreamCZIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?stream\.cz/.+/(?P<id>[0-9]+)'
+ _API_URL = 'http://www.stream.cz/API'
_TESTS = [{
'url': 'http://www.stream.cz/peklonataliri/765767-ecka-pro-deti',
@@ -36,8 +50,11 @@ class StreamCZIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- data = self._download_json(
- 'http://www.stream.cz/API/episode/%s' % video_id, video_id)
+ api_path = '/episode/%s' % video_id
+
+ req = sanitized_Request(self._API_URL + api_path)
+ req.add_header('Api-Password', _get_api_key(api_path))
+ data = self._download_json(req, video_id)
formats = []
for quality, video in enumerate(data['video_qualities']):
diff --git a/youtube_dl/extractor/subtitles.py b/youtube_dl/extractor/subtitles.py
deleted file mode 100644
index 59a51268d..000000000
--- a/youtube_dl/extractor/subtitles.py
+++ /dev/null
@@ -1,99 +0,0 @@
-from __future__ import unicode_literals
-from .common import InfoExtractor
-
-from ..compat import compat_str
-from ..utils import (
- ExtractorError,
-)
-
-
-class SubtitlesInfoExtractor(InfoExtractor):
- @property
- def _have_to_download_any_subtitles(self):
- return any([self._downloader.params.get('writesubtitles', False),
- self._downloader.params.get('writeautomaticsub')])
-
- def _list_available_subtitles(self, video_id, webpage):
- """ outputs the available subtitles for the video """
- sub_lang_list = self._get_available_subtitles(video_id, webpage)
- auto_captions_list = self._get_available_automatic_caption(video_id, webpage)
- sub_lang = ",".join(list(sub_lang_list.keys()))
- self.to_screen('%s: Available subtitles for video: %s' %
- (video_id, sub_lang))
- auto_lang = ",".join(auto_captions_list.keys())
- self.to_screen('%s: Available automatic captions for video: %s' %
- (video_id, auto_lang))
-
- def extract_subtitles(self, video_id, webpage):
- """
- returns {sub_lang: sub} ,{} if subtitles not found or None if the
- subtitles aren't requested.
- """
- if not self._have_to_download_any_subtitles:
- return None
- available_subs_list = {}
- if self._downloader.params.get('writeautomaticsub', False):
- available_subs_list.update(self._get_available_automatic_caption(video_id, webpage))
- if self._downloader.params.get('writesubtitles', False):
- available_subs_list.update(self._get_available_subtitles(video_id, webpage))
-
- if not available_subs_list: # error, it didn't get the available subtitles
- return {}
- if self._downloader.params.get('allsubtitles', False):
- sub_lang_list = available_subs_list
- else:
- if self._downloader.params.get('subtitleslangs', False):
- requested_langs = self._downloader.params.get('subtitleslangs')
- elif 'en' in available_subs_list:
- requested_langs = ['en']
- else:
- requested_langs = [list(available_subs_list.keys())[0]]
-
- sub_lang_list = {}
- for sub_lang in requested_langs:
- if sub_lang not in available_subs_list:
- self._downloader.report_warning('no closed captions found in the specified language "%s"' % sub_lang)
- continue
- sub_lang_list[sub_lang] = available_subs_list[sub_lang]
-
- subtitles = {}
- for sub_lang, url in sub_lang_list.items():
- subtitle = self._request_subtitle_url(sub_lang, url)
- if subtitle:
- subtitles[sub_lang] = subtitle
- return subtitles
-
- def _download_subtitle_url(self, sub_lang, url):
- return self._download_webpage(url, None, note=False)
-
- def _request_subtitle_url(self, sub_lang, url):
- """ makes the http request for the subtitle """
- try:
- sub = self._download_subtitle_url(sub_lang, url)
- except ExtractorError as err:
- self._downloader.report_warning('unable to download video subtitles for %s: %s' % (sub_lang, compat_str(err)))
- return
- if not sub:
- self._downloader.report_warning('Did not fetch video subtitles')
- return
- return sub
-
- def _get_available_subtitles(self, video_id, webpage):
- """
- returns {sub_lang: url} or {} if not available
- Must be redefined by the subclasses
- """
-
- # By default, allow implementations to simply pass in the result
- assert isinstance(webpage, dict), \
- '_get_available_subtitles not implemented'
- return webpage
-
- def _get_available_automatic_caption(self, video_id, webpage):
- """
- returns {sub_lang: url} or {} if not available
- Must be redefined by the subclasses that support automatic captions,
- otherwise it will return {}
- """
- self._downloader.report_warning('Automatic Captions not supported by this server')
- return {}
diff --git a/youtube_dl/extractor/sunporno.py b/youtube_dl/extractor/sunporno.py
index 8a333f1d2..e527aa971 100644
--- a/youtube_dl/extractor/sunporno.py
+++ b/youtube_dl/extractor/sunporno.py
@@ -44,7 +44,7 @@ class SunPornoIE(InfoExtractor):
webpage, 'duration', fatal=False))
view_count = int_or_none(self._html_search_regex(
- r'class="views">\s*(\d+)\s*<',
+ r'class="views">(?:<noscript>)?\s*(\d+)\s*<',
webpage, 'view count', fatal=False))
comment_count = int_or_none(self._html_search_regex(
r'(\d+)</b> Comments?',
@@ -52,7 +52,7 @@ class SunPornoIE(InfoExtractor):
formats = []
quality = qualities(['mp4', 'flv'])
- for video_url in re.findall(r'<source src="([^"]+)"', webpage):
+ for video_url in re.findall(r'<(?:source|video) src="([^"]+)"', webpage):
video_ext = determine_ext(video_url)
formats.append({
'url': video_url,
diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py
new file mode 100644
index 000000000..fc20f664b
--- /dev/null
+++ b/youtube_dl/extractor/svt.py
@@ -0,0 +1,117 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+)
+
+
+class SVTBaseIE(InfoExtractor):
+ def _extract_video(self, url, video_id):
+ info = self._download_json(url, video_id)
+
+ title = info['context']['title']
+ thumbnail = info['context'].get('thumbnailImage')
+
+ video_info = info['video']
+ formats = []
+ for vr in video_info['videoReferences']:
+ vurl = vr['url']
+ ext = determine_ext(vurl)
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ vurl, video_id,
+ ext='mp4', entry_protocol='m3u8_native',
+ m3u8_id=vr.get('playerType')))
+ elif ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ vurl + '?hdcore=3.3.0', video_id,
+ f4m_id=vr.get('playerType')))
+ else:
+ formats.append({
+ 'format_id': vr.get('playerType'),
+ 'url': vurl,
+ })
+ self._sort_formats(formats)
+
+ duration = video_info.get('materialLength')
+ age_limit = 18 if video_info.get('inappropriateForChildren') else 0
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'age_limit': age_limit,
+ }
+
+
+class SVTIE(SVTBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?svt\.se/wd\?(?:.*?&)?widgetId=(?P<widget_id>\d+)&.*?\barticleId=(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://www.svt.se/wd?widgetId=23991&sectionId=541&articleId=2900353&type=embed&contextSectionId=123&autostart=false',
+ 'md5': '9648197555fc1b49e3dc22db4af51d46',
+ 'info_dict': {
+ 'id': '2900353',
+ 'ext': 'flv',
+ 'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)',
+ 'duration': 27,
+ 'age_limit': 0,
+ },
+ }
+
+ @staticmethod
+ def _extract_url(webpage):
+ mobj = re.search(
+ r'(?:<iframe src|href)="(?P<url>%s[^"]*)"' % SVTIE._VALID_URL, webpage)
+ if mobj:
+ return mobj.group('url')
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ widget_id = mobj.group('widget_id')
+ article_id = mobj.group('id')
+ return self._extract_video(
+ 'http://www.svt.se/wd?widgetId=%s&articleId=%s&format=json&type=embed&output=json' % (widget_id, article_id),
+ article_id)
+
+
+class SVTPlayIE(SVTBaseIE):
+ IE_DESC = 'SVT Play and Öppet arkiv'
+ _VALID_URL = r'https?://(?:www\.)?(?P<host>svtplay|oppetarkiv)\.se/video/(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'http://www.svtplay.se/video/2609989/sm-veckan/sm-veckan-rally-final-sasong-1-sm-veckan-rally-final',
+ 'md5': 'ade3def0643fa1c40587a422f98edfd9',
+ 'info_dict': {
+ 'id': '2609989',
+ 'ext': 'flv',
+ 'title': 'SM veckan vinter, Örebro - Rally, final',
+ 'duration': 4500,
+ 'thumbnail': 're:^https?://.*[\.-]jpg$',
+ 'age_limit': 0,
+ },
+ }, {
+ 'url': 'http://www.oppetarkiv.se/video/1058509/rederiet-sasong-1-avsnitt-1-av-318',
+ 'md5': 'c3101a17ce9634f4c1f9800f0746c187',
+ 'info_dict': {
+ 'id': '1058509',
+ 'ext': 'flv',
+ 'title': 'Farlig kryssning',
+ 'duration': 2566,
+ 'thumbnail': 're:^https?://.*[\.-]jpg$',
+ 'age_limit': 0,
+ },
+ 'skip': 'Only works from Sweden',
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ host = mobj.group('host')
+ return self._extract_video(
+ 'http://www.%s.se/video/%s?output=json' % (host, video_id),
+ video_id)
diff --git a/youtube_dl/extractor/svtplay.py b/youtube_dl/extractor/svtplay.py
deleted file mode 100644
index eadb9ccb4..000000000
--- a/youtube_dl/extractor/svtplay.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from ..utils import (
- determine_ext,
-)
-
-
-class SVTPlayIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?svtplay\.se/video/(?P<id>[0-9]+)'
- _TEST = {
- 'url': 'http://www.svtplay.se/video/2609989/sm-veckan/sm-veckan-rally-final-sasong-1-sm-veckan-rally-final',
- 'md5': 'f4a184968bc9c802a9b41316657aaa80',
- 'info_dict': {
- 'id': '2609989',
- 'ext': 'mp4',
- 'title': 'SM veckan vinter, Örebro - Rally, final',
- 'duration': 4500,
- 'thumbnail': 're:^https?://.*[\.-]jpg$',
- },
- }
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- info = self._download_json(
- 'http://www.svtplay.se/video/%s?output=json' % video_id, video_id)
-
- title = info['context']['title']
- thumbnail = info['context'].get('thumbnailImage')
-
- video_info = info['video']
- formats = []
- for vr in video_info['videoReferences']:
- vurl = vr['url']
- if determine_ext(vurl) == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- vurl, video_id,
- ext='mp4', entry_protocol='m3u8_native',
- m3u8_id=vr.get('playerType')))
- else:
- formats.append({
- 'format_id': vr.get('playerType'),
- 'url': vurl,
- })
- self._sort_formats(formats)
-
- duration = video_info.get('materialLength')
-
- return {
- 'id': video_id,
- 'title': title,
- 'formats': formats,
- 'thumbnail': thumbnail,
- 'duration': duration,
- }
diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py
index bfe07b024..73e7657d4 100644
--- a/youtube_dl/extractor/tagesschau.py
+++ b/youtube_dl/extractor/tagesschau.py
@@ -8,17 +8,17 @@ from ..utils import parse_filesize
class TagesschauIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?:sendung/ts|video/video)(?P<id>-?[0-9]+)\.html'
+ _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?:[^/]+/)*?[^/#?]+?(?P<id>-?[0-9]+)(?:~_[^/#?]+?)?\.html'
_TESTS = [{
- 'url': 'http://www.tagesschau.de/multimedia/video/video1399128.html',
- 'md5': 'bcdeac2194fb296d599ce7929dfa4009',
+ 'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html',
+ 'md5': '917a228bc7df7850783bc47979673a09',
'info_dict': {
- 'id': '1399128',
+ 'id': '102143',
'ext': 'mp4',
- 'title': 'Harald Range, Generalbundesanwalt, zu den Ermittlungen',
- 'description': 'md5:69da3c61275b426426d711bde96463ab',
- 'thumbnail': 're:^http:.*\.jpg$',
+ 'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt',
+ 'description': 'md5:171feccd9d9b3dd54d05d501568f6359',
+ 'thumbnail': 're:^https?:.*\.jpg$',
},
}, {
'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html',
@@ -28,8 +28,39 @@ class TagesschauIE(InfoExtractor):
'ext': 'mp4',
'description': 'md5:695c01bfd98b7e313c501386327aea59',
'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr',
- 'thumbnail': 're:^http:.*\.jpg$',
- }
+ 'thumbnail': 're:^https?:.*\.jpg$',
+ },
+ }, {
+ 'url': 'http://www.tagesschau.de/multimedia/politikimradio/audio-18407.html',
+ 'md5': 'aef45de271c4bf0a5db834aa40bf774c',
+ 'info_dict': {
+ 'id': '18407',
+ 'ext': 'mp3',
+ 'title': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich',
+ 'description': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich',
+ 'thumbnail': 're:^https?:.*\.jpg$',
+ },
+ }, {
+ 'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.tagesschau.de/multimedia/sendung/tt-3827.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.tagesschau.de/multimedia/sendung/nm-3475.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.tagesschau.de/multimedia/sendung/weltspiegel-3167.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.tagesschau.de/multimedia/tsvorzwanzig-959.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.tagesschau.de/multimedia/sendung/bab/bab-3299~_bab-sendung-209.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html',
+ 'only_matching': True,
}]
_FORMATS = {
@@ -49,19 +80,26 @@ class TagesschauIE(InfoExtractor):
playerpage = self._download_webpage(
player_url, display_id, 'Downloading player page')
- medias = re.findall(
- r'"(http://media.+?)", type:"video/(.+?)", quality:"(.+?)"',
- playerpage)
formats = []
- for url, ext, res in medias:
+ for media in re.finditer(
+ r'''(?x)
+ (?P<q_url>["\'])(?P<url>http://media.+?)(?P=q_url)
+ ,\s*type:(?P<q_type>["\'])(?P<type>video|audio)/(?P<ext>.+?)(?P=q_type)
+ (?:,\s*quality:(?P<q_quality>["\'])(?P<quality>.+?)(?P=q_quality))?
+ ''', playerpage):
+ url = media.group('url')
+ type_ = media.group('type')
+ ext = media.group('ext')
+ res = media.group('quality')
f = {
- 'format_id': res + '_' + ext,
+ 'format_id': '%s_%s' % (res, ext) if res else ext,
'url': url,
'ext': ext,
+ 'vcodec': 'none' if type_ == 'audio' else None,
}
f.update(self._FORMATS.get(res, {}))
formats.append(f)
- thumbnail_fn = re.findall(r'"(/multimedia/.+?\.jpg)"', playerpage)[-1]
+ thumbnail = self._og_search_thumbnail(playerpage)
title = self._og_search_title(webpage).strip()
description = self._og_search_description(webpage).strip()
else:
@@ -99,17 +137,14 @@ class TagesschauIE(InfoExtractor):
'filesize_approx': parse_filesize(m.group('filesize_approx')),
})
formats.append(format)
- thumbnail_fn = self._search_regex(
- r'(?s)<img alt="Sendungsbild".*?src="([^"]+)"',
- webpage, 'thumbnail', fatal=False)
+ thumbnail = self._og_search_thumbnail(webpage)
description = self._html_search_regex(
r'(?s)<p class="teasertext">(.*?)</p>',
- webpage, 'description', fatal=False)
+ webpage, 'description', default=None)
title = self._html_search_regex(
r'<span class="headline".*?>(.*?)</span>', webpage, 'title')
self._sort_formats(formats)
- thumbnail = 'http://www.tagesschau.de' + thumbnail_fn
return {
'id': display_id,
diff --git a/youtube_dl/extractor/tapely.py b/youtube_dl/extractor/tapely.py
index f1f43d0a7..ed560bd24 100644
--- a/youtube_dl/extractor/tapely.py
+++ b/youtube_dl/extractor/tapely.py
@@ -4,19 +4,17 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_request,
-)
from ..utils import (
clean_html,
ExtractorError,
float_or_none,
parse_iso8601,
+ sanitized_Request,
)
class TapelyIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?tape\.ly/(?P<id>[A-Za-z0-9\-_]+)(?:/(?P<songnr>\d+))?'
+ _VALID_URL = r'https?://(?:www\.)?(?:tape\.ly|tapely\.com)/(?P<id>[A-Za-z0-9\-_]+)(?:/(?P<songnr>\d+))?'
_API_URL = 'http://tape.ly/showtape?id={0:}'
_S3_SONG_URL = 'http://mytape.s3.amazonaws.com/{0:}'
_SOUNDCLOUD_SONG_URL = 'http://api.soundcloud.com{0:}'
@@ -42,6 +40,10 @@ class TapelyIE(InfoExtractor):
'ext': 'm4a',
},
},
+ {
+ 'url': 'https://tapely.com/my-grief-as-told-by-water',
+ 'only_matching': True,
+ },
]
def _real_extract(self, url):
@@ -49,7 +51,7 @@ class TapelyIE(InfoExtractor):
display_id = mobj.group('id')
playlist_url = self._API_URL.format(display_id)
- request = compat_urllib_request.Request(playlist_url)
+ request = sanitized_Request(playlist_url)
request.add_header('X-Requested-With', 'XMLHttpRequest')
request.add_header('Accept', 'application/json')
request.add_header('Referer', url)
diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py
index a73da1c9c..d1b7264b4 100644
--- a/youtube_dl/extractor/teamcoco.py
+++ b/youtube_dl/extractor/teamcoco.py
@@ -1,8 +1,18 @@
+# -*- coding: utf-8 -*-
from __future__ import unicode_literals
+import base64
+import binascii
import re
+import json
from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ qualities,
+ determine_ext,
+)
+from ..compat import compat_ord
class TeamcocoIE(InfoExtractor):
@@ -16,6 +26,7 @@ class TeamcocoIE(InfoExtractor):
'ext': 'mp4',
'title': 'Conan Becomes A Mary Kay Beauty Consultant',
'description': 'Mary Kay is perhaps the most trusted name in female beauty, so of course Conan is a natural choice to sell their products.',
+ 'duration': 504,
'age_limit': 0,
}
}, {
@@ -24,10 +35,33 @@ class TeamcocoIE(InfoExtractor):
'info_dict': {
'id': '19705',
'ext': 'mp4',
- "description": "Louis C.K. got starstruck by George W. Bush, so what? Part one.",
- "title": "Louis C.K. Interview Pt. 1 11/3/11",
+ 'description': 'Louis C.K. got starstruck by George W. Bush, so what? Part one.',
+ 'title': 'Louis C.K. Interview Pt. 1 11/3/11',
+ 'duration': 288,
'age_limit': 0,
}
+ }, {
+ 'url': 'http://teamcoco.com/video/timothy-olyphant-drinking-whiskey',
+ 'info_dict': {
+ 'id': '88748',
+ 'ext': 'mp4',
+ 'title': 'Timothy Olyphant Raises A Toast To “Justified”',
+ 'description': 'md5:15501f23f020e793aeca761205e42c24',
+ },
+ 'params': {
+ 'skip_download': True, # m3u8 downloads
+ }
+ }, {
+ 'url': 'http://teamcoco.com/video/full-episode-mon-6-1-joel-mchale-jake-tapper-and-musical-guest-courtney-barnett?playlist=x;eyJ0eXBlIjoidGFnIiwiaWQiOjl9',
+ 'info_dict': {
+ 'id': '89341',
+ 'ext': 'mp4',
+ 'title': 'Full Episode - Mon. 6/1 - Joel McHale, Jake Tapper, And Musical Guest Courtney Barnett',
+ 'description': 'Guests: Joel McHale, Jake Tapper, And Musical Guest Courtney Barnett',
+ },
+ 'params': {
+ 'skip_download': True, # m3u8 downloads
+ }
}
]
_VIDEO_ID_REGEXES = (
@@ -40,45 +74,87 @@ class TeamcocoIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
display_id = mobj.group('display_id')
- webpage = self._download_webpage(url, display_id)
+ webpage, urlh = self._download_webpage_handle(url, display_id)
+ if 'src=expired' in urlh.geturl():
+ raise ExtractorError('This video is expired.', expected=True)
- video_id = mobj.group("video_id")
+ video_id = mobj.group('video_id')
if not video_id:
video_id = self._html_search_regex(
self._VIDEO_ID_REGEXES, webpage, 'video id')
- data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
- data = self._download_xml(
- data_url, display_id, 'Downloading data webpage')
+ data = None
+
+ preload_codes = self._html_search_regex(
+ r'(function.+)setTimeout\(function\(\)\{playlist',
+ webpage, 'preload codes')
+ base64_fragments = re.findall(r'"([a-zA-Z0-9+/=]+)"', preload_codes)
+ base64_fragments.remove('init')
+
+ def _check_sequence(cur_fragments):
+ if not cur_fragments:
+ return
+ for i in range(len(cur_fragments)):
+ cur_sequence = (''.join(cur_fragments[i:] + cur_fragments[:i])).encode('ascii')
+ try:
+ raw_data = base64.b64decode(cur_sequence)
+ if compat_ord(raw_data[0]) == compat_ord('{'):
+ return json.loads(raw_data.decode('utf-8'))
+ except (TypeError, binascii.Error, UnicodeDecodeError, ValueError):
+ continue
+
+ def _check_data():
+ for i in range(len(base64_fragments) + 1):
+ for j in range(i, len(base64_fragments) + 1):
+ data = _check_sequence(base64_fragments[:i] + base64_fragments[j:])
+ if data:
+ return data
+
+ self.to_screen('Try to compute possible data sequence. This may take some time.')
+ data = _check_data()
+
+ if not data:
+ raise ExtractorError(
+ 'Preload information could not be extracted', expected=True)
- qualities = ['500k', '480p', '1000k', '720p', '1080p']
formats = []
- for filed in data.findall('files/file'):
- if filed.attrib.get('playmode') == 'all':
- # it just duplicates one of the entries
- break
- file_url = filed.text
- m_format = re.search(r'(\d+(k|p))\.mp4', file_url)
- if m_format is not None:
- format_id = m_format.group(1)
+ get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p'])
+ for filed in data['files']:
+ if determine_ext(filed['url']) == 'm3u8':
+ # compat_urllib_parse.urljoin does not work here
+ if filed['url'].startswith('/'):
+ m3u8_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + filed['url']
+ else:
+ m3u8_url = filed['url']
+ m3u8_formats = self._extract_m3u8_formats(
+ m3u8_url, video_id, ext='mp4')
+ for m3u8_format in m3u8_formats:
+ if m3u8_format not in formats:
+ formats.append(m3u8_format)
+ elif determine_ext(filed['url']) == 'f4m':
+ # TODO Correct f4m extraction
+ continue
else:
- format_id = filed.attrib['bitrate']
- tbr = (
- int(filed.attrib['bitrate'])
- if filed.attrib['bitrate'].isdigit()
- else None)
-
- try:
- quality = qualities.index(format_id)
- except ValueError:
- quality = -1
- formats.append({
- 'url': file_url,
- 'ext': 'mp4',
- 'tbr': tbr,
- 'format_id': format_id,
- 'quality': quality,
- })
+ if filed['url'].startswith('/mp4:protected/'):
+ # TODO Correct extraction for these files
+ continue
+ m_format = re.search(r'(\d+(k|p))\.mp4', filed['url'])
+ if m_format is not None:
+ format_id = m_format.group(1)
+ else:
+ format_id = filed['bitrate']
+ tbr = (
+ int(filed['bitrate'])
+ if filed['bitrate'].isdigit()
+ else None)
+
+ formats.append({
+ 'url': filed['url'],
+ 'ext': 'mp4',
+ 'tbr': tbr,
+ 'format_id': format_id,
+ 'quality': get_quality(format_id),
+ })
self._sort_formats(formats)
@@ -86,8 +162,9 @@ class TeamcocoIE(InfoExtractor):
'id': video_id,
'display_id': display_id,
'formats': formats,
- 'title': self._og_search_title(webpage),
- 'thumbnail': self._og_search_thumbnail(webpage),
- 'description': self._og_search_description(webpage),
+ 'title': data['title'],
+ 'thumbnail': data.get('thumb', {}).get('href'),
+ 'description': data.get('teaser'),
+ 'duration': data.get('duration'),
'age_limit': self._family_friendly_search(webpage),
}
diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py
index 10b3b706a..a48d77c30 100644
--- a/youtube_dl/extractor/ted.py
+++ b/youtube_dl/extractor/ted.py
@@ -3,14 +3,14 @@ from __future__ import unicode_literals
import json
import re
-from .subtitles import SubtitlesInfoExtractor
+from .common import InfoExtractor
-from ..compat import (
- compat_str,
-)
+from ..compat import compat_str
+from ..utils import int_or_none
-class TEDIE(SubtitlesInfoExtractor):
+class TEDIE(InfoExtractor):
+ IE_NAME = 'ted'
_VALID_URL = r'''(?x)
(?P<proto>https?://)
(?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
@@ -83,6 +83,22 @@ class TEDIE(SubtitlesInfoExtractor):
'params': {
'skip_download': True,
},
+ }, {
+ # YouTube video
+ 'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
+ 'add_ie': ['Youtube'],
+ 'info_dict': {
+ 'id': 'aFBIPO-P7LM',
+ 'ext': 'mp4',
+ 'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
+ 'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
+ 'uploader': 'TEDx Talks',
+ 'uploader_id': 'TEDxTalks',
+ 'upload_date': '20111216',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}]
_NATIVE_FORMATS = {
@@ -132,11 +148,16 @@ class TEDIE(SubtitlesInfoExtractor):
talk_info = self._extract_info(webpage)['talks'][0]
- if talk_info.get('external') is not None:
- self.to_screen('Found video from %s' % talk_info['external']['service'])
+ external = talk_info.get('external')
+ if external:
+ service = external['service']
+ self.to_screen('Found video from %s' % service)
+ ext_url = None
+ if service.lower() == 'youtube':
+ ext_url = external.get('code')
return {
'_type': 'url',
- 'url': talk_info['external']['uri'],
+ 'url': ext_url or external['uri'],
}
formats = [{
@@ -149,25 +170,54 @@ class TEDIE(SubtitlesInfoExtractor):
finfo = self._NATIVE_FORMATS.get(f['format_id'])
if finfo:
f.update(finfo)
- else:
- # Use rtmp downloads
- formats = [{
- 'format_id': f['name'],
- 'url': talk_info['streamer'],
- 'play_path': f['file'],
- 'ext': 'flv',
- 'width': f['width'],
- 'height': f['height'],
- 'tbr': f['bitrate'],
- } for f in talk_info['resources']['rtmp']]
+
+ for format_id, resources in talk_info['resources'].items():
+ if format_id == 'h264':
+ for resource in resources:
+ bitrate = int_or_none(resource.get('bitrate'))
+ formats.append({
+ 'url': resource['file'],
+ 'format_id': '%s-%sk' % (format_id, bitrate),
+ 'tbr': bitrate,
+ })
+ elif format_id == 'rtmp':
+ streamer = talk_info.get('streamer')
+ if not streamer:
+ continue
+ for resource in resources:
+ formats.append({
+ 'format_id': '%s-%s' % (format_id, resource.get('name')),
+ 'url': streamer,
+ 'play_path': resource['file'],
+ 'ext': 'flv',
+ 'width': int_or_none(resource.get('width')),
+ 'height': int_or_none(resource.get('height')),
+ 'tbr': int_or_none(resource.get('bitrate')),
+ })
+ elif format_id == 'hls':
+ hls_formats = self._extract_m3u8_formats(
+ resources.get('stream'), video_name, 'mp4', m3u8_id=format_id)
+ for f in hls_formats:
+ if f.get('format_id') == 'hls-meta':
+ continue
+ if not f.get('height'):
+ f['vcodec'] = 'none'
+ else:
+ f['acodec'] = 'none'
+ formats.extend(hls_formats)
+
+ audio_download = talk_info.get('audioDownload')
+ if audio_download:
+ formats.append({
+ 'url': audio_download,
+ 'format_id': 'audio',
+ 'vcodec': 'none',
+ 'preference': -0.5,
+ })
+
self._sort_formats(formats)
video_id = compat_str(talk_info['id'])
- # subtitles
- video_subtitles = self.extract_subtitles(video_id, talk_info)
- if self._downloader.params.get('listsubtitles', False):
- self._list_available_subtitles(video_id, talk_info)
- return
thumbnail = talk_info['thumb']
if not thumbnail.startswith('http'):
@@ -178,21 +228,25 @@ class TEDIE(SubtitlesInfoExtractor):
'uploader': talk_info['speaker'],
'thumbnail': thumbnail,
'description': self._og_search_description(webpage),
- 'subtitles': video_subtitles,
+ 'subtitles': self._get_subtitles(video_id, talk_info),
'formats': formats,
'duration': talk_info.get('duration'),
}
- def _get_available_subtitles(self, video_id, talk_info):
+ def _get_subtitles(self, video_id, talk_info):
languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
if languages:
sub_lang_list = {}
for l in languages:
- url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
- sub_lang_list[l] = url
+ sub_lang_list[l] = [
+ {
+ 'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext),
+ 'ext': ext,
+ }
+ for ext in ['ted', 'srt']
+ ]
return sub_lang_list
else:
- self._downloader.report_warning('video doesn\'t have subtitles')
return {}
def _watch_info(self, url, name):
diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py
index be3f72df7..2c8e9b941 100644
--- a/youtube_dl/extractor/telecinco.py
+++ b/youtube_dl/extractor/telecinco.py
@@ -1,19 +1,94 @@
# coding: utf-8
from __future__ import unicode_literals
-from .mitele import MiTeleIE
+import json
+from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse,
+ compat_urllib_parse_unquote,
+ compat_urlparse,
+)
+from ..utils import (
+ get_element_by_attribute,
+ parse_duration,
+ strip_jsonp,
+)
-class TelecincoIE(MiTeleIE):
- IE_NAME = 'telecinco.es'
- _VALID_URL = r'https?://www\.telecinco\.es/[^/]+/[^/]+/[^/]+/(?P<id>.*?)\.html'
- _TEST = {
+class TelecincoIE(InfoExtractor):
+ IE_DESC = 'telecinco.es, cuatro.com and mediaset.es'
+ _VALID_URL = r'https?://www\.(?:telecinco\.es|cuatro\.com|mediaset\.es)/(?:[^/]+/)+(?P<id>.+?)\.html'
+
+ _TESTS = [{
'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html',
+ 'md5': '5cbef3ad5ef17bf0d21570332d140729',
'info_dict': {
'id': 'MDSVID20141015_0058',
'ext': 'mp4',
'title': 'Con Martín Berasategui, hacer un bacalao al ...',
'duration': 662,
},
- }
+ }, {
+ 'url': 'http://www.cuatro.com/deportes/futbol/barcelona/Leo_Messi-Champions-Roma_2_2052780128.html',
+ 'md5': '0a5b9f3cc8b074f50a0578f823a12694',
+ 'info_dict': {
+ 'id': 'MDSVID20150916_0128',
+ 'ext': 'mp4',
+ 'title': '¿Quién es este ex futbolista con el que hablan ...',
+ 'duration': 79,
+ },
+ }, {
+ 'url': 'http://www.mediaset.es/12meses/campanas/doylacara/conlatratanohaytrato/Ayudame-dar-cara-trata-trato_2_1986630220.html',
+ 'md5': 'ad1bfaaba922dd4a295724b05b68f86a',
+ 'info_dict': {
+ 'id': 'MDSVID20150513_0220',
+ 'ext': 'mp4',
+ 'title': '#DOYLACARA. Con la trata no hay trato',
+ 'duration': 50,
+ },
+ }, {
+ 'url': 'http://www.telecinco.es/informativos/nacional/Pablo_Iglesias-Informativos_Telecinco-entrevista-Pedro_Piqueras_2_1945155182.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.telecinco.es/espanasinirmaslejos/Espana-gran-destino-turistico_2_1240605043.html',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ episode = self._match_id(url)
+ webpage = self._download_webpage(url, episode)
+ embed_data_json = self._search_regex(
+ r'(?s)MSV\.embedData\[.*?\]\s*=\s*({.*?});', webpage, 'embed data',
+ ).replace('\'', '"')
+ embed_data = json.loads(embed_data_json)
+
+ domain = embed_data['mediaUrl']
+ if not domain.startswith('http'):
+ # only happens in telecinco.es videos
+ domain = 'http://' + domain
+ info_url = compat_urlparse.urljoin(
+ domain,
+ compat_urllib_parse_unquote(embed_data['flashvars']['host'])
+ )
+ info_el = self._download_xml(info_url, episode).find('./video/info')
+
+ video_link = info_el.find('videoUrl/link').text
+ token_query = compat_urllib_parse.urlencode({'id': video_link})
+ token_info = self._download_json(
+ embed_data['flashvars']['ov_tk'] + '?' + token_query,
+ episode,
+ transform_source=strip_jsonp
+ )
+ formats = self._extract_m3u8_formats(
+ token_info['tokenizedUrl'], episode, ext='mp4', entry_protocol='m3u8_native')
+
+ return {
+ 'id': embed_data['videoId'],
+ 'display_id': episode,
+ 'title': info_el.find('title').text,
+ 'formats': formats,
+ 'description': get_element_by_attribute('class', 'text', webpage),
+ 'thumbnail': info_el.find('thumb').text,
+ 'duration': parse_duration(info_el.find('duration').text),
+ }
diff --git a/youtube_dl/extractor/telegraaf.py b/youtube_dl/extractor/telegraaf.py
new file mode 100644
index 000000000..6f8333cfc
--- /dev/null
+++ b/youtube_dl/extractor/telegraaf.py
@@ -0,0 +1,35 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import remove_end
+
+
+class TelegraafIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?telegraaf\.nl/tv/(?:[^/]+/)+(?P<id>\d+)/[^/]+\.html'
+ _TEST = {
+ 'url': 'http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html',
+ 'md5': '83245a9779bcc4a24454bfd53c65b6dc',
+ 'info_dict': {
+ 'id': '24353229',
+ 'ext': 'mp4',
+ 'title': 'Tikibad ontruimd wegens brand',
+ 'description': 'md5:05ca046ff47b931f9b04855015e163a4',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 33,
+ },
+ }
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, playlist_id)
+
+ playlist_url = self._search_regex(
+ r"iframe\.loadPlayer\('([^']+)'", webpage, 'player')
+
+ entries = self._extract_xspf_playlist(playlist_url, playlist_id)
+ title = remove_end(self._og_search_title(webpage), ' - VIDEO')
+ description = self._og_search_description(webpage)
+
+ return self.playlist_result(entries, playlist_id, title, description)
diff --git a/youtube_dl/extractor/tenplay.py b/youtube_dl/extractor/tenplay.py
index 466155ef8..f6694149b 100644
--- a/youtube_dl/extractor/tenplay.py
+++ b/youtube_dl/extractor/tenplay.py
@@ -2,6 +2,10 @@
from __future__ import unicode_literals
from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ float_or_none,
+)
class TenPlayIE(InfoExtractor):
@@ -49,18 +53,23 @@ class TenPlayIE(InfoExtractor):
if protocol == 'rtmp':
url = url.replace('&mp4:', '')
+ tbr = int_or_none(rendition.get('encodingRate'), 1000)
+
formats.append({
- 'format_id': '_'.join(['rtmp', rendition['videoContainer'].lower(), rendition['videoCodec'].lower()]),
- 'width': rendition['frameWidth'],
- 'height': rendition['frameHeight'],
- 'tbr': rendition['encodingRate'] / 1024,
- 'filesize': rendition['size'],
+ 'format_id': '_'.join(
+ ['rtmp', rendition['videoContainer'].lower(),
+ rendition['videoCodec'].lower(), '%sk' % tbr]),
+ 'width': int_or_none(rendition['frameWidth']),
+ 'height': int_or_none(rendition['frameHeight']),
+ 'tbr': tbr,
+ 'filesize': int_or_none(rendition['size']),
'protocol': protocol,
'ext': ext,
'vcodec': rendition['videoCodec'].lower(),
'container': rendition['videoContainer'].lower(),
'url': url,
})
+ self._sort_formats(formats)
return {
'id': video_id,
@@ -74,8 +83,8 @@ class TenPlayIE(InfoExtractor):
'url': json['thumbnailURL']
}],
'thumbnail': json['videoStillURL'],
- 'duration': json['length'] / 1000,
- 'timestamp': float(json['creationDate']) / 1000,
- 'uploader': json['customFields']['production_company_distributor'] if 'production_company_distributor' in json['customFields'] else 'TENplay',
- 'view_count': json['playsTotal']
+ 'duration': float_or_none(json.get('length'), 1000),
+ 'timestamp': float_or_none(json.get('creationDate'), 1000),
+ 'uploader': json.get('customFields', {}).get('production_company_distributor') or 'TENplay',
+ 'view_count': int_or_none(json.get('playsTotal')),
}
diff --git a/youtube_dl/extractor/testtube.py b/youtube_dl/extractor/testtube.py
index 6a7b5e49d..26655d690 100644
--- a/youtube_dl/extractor/testtube.py
+++ b/youtube_dl/extractor/testtube.py
@@ -15,19 +15,37 @@ class TestTubeIE(InfoExtractor):
'id': '60163',
'display_id': '5-weird-ways-plants-can-eat-animals',
'duration': 275,
- 'ext': 'mp4',
+ 'ext': 'webm',
'title': '5 Weird Ways Plants Can Eat Animals',
'description': 'Why have some plants evolved to eat meat?',
'thumbnail': 're:^https?://.*\.jpg$',
'uploader': 'DNews',
'uploader_id': 'dnews',
},
+ }, {
+ 'url': 'https://testtube.com/iflscience/insane-jet-ski-flipping',
+ 'info_dict': {
+ 'id': 'fAGfJ4YjVus',
+ 'ext': 'mp4',
+ 'title': 'Flipping Jet-Ski Skills | Outrageous Acts of Science',
+ 'uploader': 'Science Channel',
+ 'uploader_id': 'ScienceChannel',
+ 'upload_date': '20150203',
+ 'description': 'md5:e61374030015bae1d2e22f096d4769d6',
+ }
}]
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
+
+ youtube_url = self._html_search_regex(
+ r'<iframe[^>]+src="((?:https?:)?//www.youtube.com/embed/[^"]+)"',
+ webpage, 'youtube iframe', default=None)
+ if youtube_url:
+ return self.url_result(youtube_url, 'Youtube', video_id=display_id)
+
video_id = self._search_regex(
r"player\.loadRevision3Item\('video_id',\s*([0-9]+)\);",
webpage, 'video ID')
diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py
index 025d0877c..3a68eaa80 100644
--- a/youtube_dl/extractor/tf1.py
+++ b/youtube_dl/extractor/tf1.py
@@ -6,8 +6,8 @@ from .common import InfoExtractor
class TF1IE(InfoExtractor):
"""TF1 uses the wat.tv player."""
- _VALID_URL = r'http://(?:videos\.tf1|www\.tfou)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html'
- _TESTS = {
+ _VALID_URL = r'http://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html'
+ _TESTS = [{
'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
'info_dict': {
'id': '10635995',
@@ -32,7 +32,13 @@ class TF1IE(InfoExtractor):
# Sometimes wat serves the whole file with the --test option
'skip_download': True,
},
- }
+ }, {
+ 'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://lci.tf1.fr/sept-a-huit/videos/sept-a-huit-du-24-mai-2015-8611550.html',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
diff --git a/youtube_dl/extractor/theonion.py b/youtube_dl/extractor/theonion.py
index b65d8e03f..10239c906 100644
--- a/youtube_dl/extractor/theonion.py
+++ b/youtube_dl/extractor/theonion.py
@@ -4,11 +4,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import ExtractorError
class TheOnionIE(InfoExtractor):
- _VALID_URL = r'(?x)https?://(?:www\.)?theonion\.com/video/[^,]+,(?P<article_id>[0-9]+)/?'
+ _VALID_URL = r'https?://(?:www\.)?theonion\.com/video/[^,]+,(?P<id>[0-9]+)/?'
_TEST = {
'url': 'http://www.theonion.com/video/man-wearing-mm-jacket-gods-image,36918/',
'md5': '19eaa9a39cf9b9804d982e654dc791ee',
@@ -22,10 +21,8 @@ class TheOnionIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- article_id = mobj.group('article_id')
-
- webpage = self._download_webpage(url, article_id)
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
video_id = self._search_regex(
r'"videoId":\s(\d+),', webpage, 'video ID')
@@ -34,10 +31,6 @@ class TheOnionIE(InfoExtractor):
thumbnail = self._og_search_thumbnail(webpage)
sources = re.findall(r'<source src="([^"]+)" type="([^"]+)"', webpage)
- if not sources:
- raise ExtractorError(
- 'No sources found for video %s' % video_id, expected=True)
-
formats = []
for src, type_ in sources:
if type_ == 'video/mp4':
@@ -54,15 +47,15 @@ class TheOnionIE(InfoExtractor):
})
elif type_ == 'application/x-mpegURL':
formats.extend(
- self._extract_m3u8_formats(src, video_id, preference=-1))
+ self._extract_m3u8_formats(src, display_id, preference=-1))
else:
self.report_warning(
'Encountered unexpected format: %s' % type_)
-
self._sort_formats(formats)
return {
'id': video_id,
+ 'display_id': display_id,
'title': title,
'formats': formats,
'thumbnail': thumbnail,
diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py
index 110ed976d..1555aa77c 100644
--- a/youtube_dl/extractor/theplatform.py
+++ b/youtube_dl/extractor/theplatform.py
@@ -1,28 +1,93 @@
+# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import re
-import json
+import time
+import hmac
+import binascii
+import hashlib
-from .subtitles import SubtitlesInfoExtractor
+
+from .common import InfoExtractor
from ..compat import (
- compat_str,
+ compat_parse_qs,
+ compat_urllib_parse_urlparse,
)
from ..utils import (
determine_ext,
ExtractorError,
xpath_with_ns,
+ unsmuggle_url,
+ int_or_none,
+ url_basename,
+ float_or_none,
)
-_x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language'})
+default_ns = 'http://www.w3.org/2005/SMIL21/Language'
+_x = lambda p: xpath_with_ns(p, {'smil': default_ns})
+
+
+class ThePlatformBaseIE(InfoExtractor):
+ def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'):
+ meta = self._download_xml(smil_url, video_id, note=note)
+ try:
+ error_msg = next(
+ n.attrib['abstract']
+ for n in meta.findall(_x('.//smil:ref'))
+ if n.attrib.get('title') == 'Geographic Restriction' or n.attrib.get('title') == 'Expired')
+ except StopIteration:
+ pass
+ else:
+ raise ExtractorError(error_msg, expected=True)
+
+ formats = self._parse_smil_formats(
+ meta, smil_url, video_id, namespace=default_ns,
+ # the parameters are from syfy.com, other sites may use others,
+ # they also work for nbc.com
+ f4m_params={'g': 'UXWGVKRWHFSP', 'hdcore': '3.0.3'},
+ transform_rtmp_url=lambda streamer, src: (streamer, 'mp4:' + src))
+
+ for _format in formats:
+ ext = determine_ext(_format['url'])
+ if ext == 'once':
+ _format['ext'] = 'mp4'
+
+ self._sort_formats(formats)
+
+ subtitles = self._parse_smil_subtitles(meta, default_ns)
+
+ return formats, subtitles
+ def get_metadata(self, path, video_id):
+ info_url = 'http://link.theplatform.com/s/%s?format=preview' % path
+ info = self._download_json(info_url, video_id)
-class ThePlatformIE(SubtitlesInfoExtractor):
+ subtitles = {}
+ captions = info.get('captions')
+ if isinstance(captions, list):
+ for caption in captions:
+ lang, src, mime = caption.get('lang', 'en'), caption.get('src'), caption.get('type')
+ subtitles[lang] = [{
+ 'ext': 'srt' if mime == 'text/srt' else 'ttml',
+ 'url': src,
+ }]
+
+ return {
+ 'title': info['title'],
+ 'subtitles': subtitles,
+ 'description': info['description'],
+ 'thumbnail': info['defaultThumbnailUrl'],
+ 'duration': int_or_none(info.get('duration'), 1000),
+ }
+
+
+class ThePlatformIE(ThePlatformBaseIE):
_VALID_URL = r'''(?x)
- (?:https?://(?:link|player)\.theplatform\.com/[sp]/[^/]+/
- (?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/)?
+ (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/
+ (?:(?P<media>(?:[^/]+/)+select/media/)|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))?
|theplatform:)(?P<id>[^/\?&]+)'''
- _TEST = {
+ _TESTS = [{
# from http://www.metacafe.com/watch/cb-e9I_cZgTgIPd/blackberrys_big_bold_z30/
'url': 'http://link.theplatform.com/s/dJ5BDC/e9I_cZgTgIPd/meta.smil?format=smil&Tracking=true&mbr=true',
'info_dict': {
@@ -36,104 +101,212 @@ class ThePlatformIE(SubtitlesInfoExtractor):
# rtmp download
'skip_download': True,
},
- }
+ }, {
+ # from http://www.cnet.com/videos/tesla-model-s-a-second-step-towards-a-cleaner-motoring-future/
+ 'url': 'http://link.theplatform.com/s/kYEXFC/22d_qsQ6MIRT',
+ 'info_dict': {
+ 'id': '22d_qsQ6MIRT',
+ 'ext': 'flv',
+ 'description': 'md5:ac330c9258c04f9d7512cf26b9595409',
+ 'title': 'Tesla Model S: A second step towards a cleaner motoring future',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'https://player.theplatform.com/p/D6x-PC/pulse_preview/embed/select/media/yMBg9E8KFxZD',
+ 'info_dict': {
+ 'id': 'yMBg9E8KFxZD',
+ 'ext': 'mp4',
+ 'description': 'md5:644ad9188d655b742f942bf2e06b002d',
+ 'title': 'HIGHLIGHTS: USA bag first ever series Cup win',
+ }
+ }, {
+ 'url': 'http://player.theplatform.com/p/NnzsPC/widget/select/media/4Y0TlYUr_ZT7',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://player.theplatform.com/p/2E2eJC/nbcNewsOffsite?guid=tdy_or_siri_150701',
+ 'md5': '734f3790fb5fc4903da391beeebc4836',
+ 'info_dict': {
+ 'id': 'tdy_or_siri_150701',
+ 'ext': 'mp4',
+ 'title': 'iPhone Siri’s sassy response to a math question has people talking',
+ 'description': 'md5:a565d1deadd5086f3331d57298ec6333',
+ 'duration': 83.0,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'timestamp': 1435752600,
+ 'upload_date': '20150701',
+ 'categories': ['Today/Shows/Orange Room', 'Today/Sections/Money', 'Today/Topics/Tech', "Today/Topics/Editor's picks"],
+ },
+ }, {
+ # From http://www.nbc.com/the-blacklist/video/sir-crispin-crandall/2928790?onid=137781#vc137781=1
+ # geo-restricted (US), HLS encrypted with AES-128
+ 'url': 'http://player.theplatform.com/p/NnzsPC/onsite_universal/select/media/guid/2410887629/2928790?fwsitesection=nbc_the_blacklist_video_library&autoPlay=true&carouselID=137781',
+ 'only_matching': True,
+ }]
+
+ @staticmethod
+ def _sign_url(url, sig_key, sig_secret, life=600, include_qs=False):
+ flags = '10' if include_qs else '00'
+ expiration_date = '%x' % (int(time.time()) + life)
+
+ def str_to_hex(str):
+ return binascii.b2a_hex(str.encode('ascii')).decode('ascii')
+
+ def hex_to_str(hex):
+ return binascii.a2b_hex(hex)
+
+ relative_path = url.split('http://link.theplatform.com/s/')[1].split('?')[0]
+ clear_text = hex_to_str(flags + expiration_date + str_to_hex(relative_path))
+ checksum = hmac.new(sig_key.encode('ascii'), clear_text, hashlib.sha1).hexdigest()
+ sig = flags + expiration_date + checksum + str_to_hex(sig_secret)
+ return '%s&sig=%s' % (url, sig)
def _real_extract(self, url):
+ url, smuggled_data = unsmuggle_url(url, {})
+
mobj = re.match(self._VALID_URL, url)
+ provider_id = mobj.group('provider_id')
video_id = mobj.group('id')
- if mobj.group('config'):
+
+ if not provider_id:
+ provider_id = 'dJ5BDC'
+
+ path = provider_id
+ if mobj.group('media'):
+ path += '/media'
+ path += '/' + video_id
+
+ qs_dict = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+ if 'guid' in qs_dict:
+ webpage = self._download_webpage(url, video_id)
+ scripts = re.findall(r'<script[^>]+src="([^"]+)"', webpage)
+ feed_id = None
+ # The feed id is usually located in the last script.
+ # There seems to be no pattern in the relevant script's filename, so
+ # try the scripts one by one, starting from the last
+ for script in reversed(scripts):
+ feed_script = self._download_webpage(
+ self._proto_relative_url(script, 'http:'),
+ video_id, 'Downloading feed script')
+ feed_id = self._search_regex(
+ r'defaultFeedId\s*:\s*"([^"]+)"', feed_script,
+ 'default feed id', default=None)
+ if feed_id is not None:
+ break
+ if feed_id is None:
+ raise ExtractorError('Unable to find feed id')
+ return self.url_result('http://feed.theplatform.com/f/%s/%s?byGuid=%s' % (
+ provider_id, feed_id, qs_dict['guid'][0]))
+
+ if smuggled_data.get('force_smil_url', False):
+ smil_url = url
+ # Explicitly specified SMIL (see https://github.com/rg3/youtube-dl/issues/7385)
+ elif '/guid/' in url:
+ webpage = self._download_webpage(url, video_id)
+ smil_url = self._search_regex(
+ r'<link[^>]+href=(["\'])(?P<url>.+?)\1[^>]+type=["\']application/smil\+xml',
+ webpage, 'smil url', group='url')
+ path = self._search_regex(
+ r'link\.theplatform\.com/s/((?:[^/?#&]+/)+[^/?#&]+)', smil_url, 'path')
+ smil_url += '?' if '?' not in smil_url else '&' + 'formats=m3u,mpeg4&format=SMIL'
+ elif mobj.group('config'):
config_url = url + '&form=json'
config_url = config_url.replace('swf/', 'config/')
config_url = config_url.replace('onsite/', 'onsite/config/')
config = self._download_json(config_url, video_id, 'Downloading config')
- smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4&manifest=f4m'
+ if 'releaseUrl' in config:
+ release_url = config['releaseUrl']
+ else:
+ release_url = 'http://link.theplatform.com/s/%s?mbr=true' % path
+ smil_url = release_url + '&format=SMIL&formats=MPEG4&manifest=f4m'
else:
- smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?'
- 'format=smil&mbr=true'.format(video_id))
+ smil_url = 'http://link.theplatform.com/s/%s/meta.smil?format=smil&mbr=true' % path
- meta = self._download_xml(smil_url, video_id)
- try:
- error_msg = next(
- n.attrib['abstract']
- for n in meta.findall(_x('.//smil:ref'))
- if n.attrib.get('title') == 'Geographic Restriction')
- except StopIteration:
- pass
- else:
- raise ExtractorError(error_msg, expected=True)
+ sig = smuggled_data.get('sig')
+ if sig:
+ smil_url = self._sign_url(smil_url, sig['key'], sig['secret'])
- info_url = 'http://link.theplatform.com/s/dJ5BDC/{0}?format=preview'.format(video_id)
- info_json = self._download_webpage(info_url, video_id)
- info = json.loads(info_json)
+ formats, subtitles = self._extract_theplatform_smil(smil_url, video_id)
- subtitles = {}
- captions = info.get('captions')
- if isinstance(captions, list):
- for caption in captions:
- lang, src = caption.get('lang'), caption.get('src')
- if lang and src:
- subtitles[lang] = src
+ ret = self.get_metadata(path, video_id)
+ combined_subtitles = self._merge_subtitles(ret.get('subtitles', {}), subtitles)
+ ret.update({
+ 'id': video_id,
+ 'formats': formats,
+ 'subtitles': combined_subtitles,
+ })
- if self._downloader.params.get('listsubtitles', False):
- self._list_available_subtitles(video_id, subtitles)
- return
+ return ret
- subtitles = self.extract_subtitles(video_id, subtitles)
- head = meta.find(_x('smil:head'))
- body = meta.find(_x('smil:body'))
+class ThePlatformFeedIE(ThePlatformBaseIE):
+ _URL_TEMPLATE = '%s//feed.theplatform.com/f/%s/%s?form=json&byGuid=%s'
+ _VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*byGuid=(?P<id>[a-zA-Z0-9_]+)'
+ _TEST = {
+ # From http://player.theplatform.com/p/7wvmTC/MSNBCEmbeddedOffSite?guid=n_hardball_5biden_140207
+ 'url': 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207',
+ 'md5': '22d2b84f058d3586efcd99e57d59d314',
+ 'info_dict': {
+ 'id': 'n_hardball_5biden_140207',
+ 'ext': 'mp4',
+ 'title': 'The Biden factor: will Joe run in 2016?',
+ 'description': 'Could Vice President Joe Biden be preparing a 2016 campaign? Mark Halperin and Sam Stein weigh in.',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'upload_date': '20140208',
+ 'timestamp': 1391824260,
+ 'duration': 467.0,
+ 'categories': ['MSNBC/Issues/Democrats', 'MSNBC/Issues/Elections/Election 2016'],
+ },
+ }
- f4m_node = body.find(_x('smil:seq//smil:video'))
- if f4m_node is not None and '.f4m' in f4m_node.attrib['src']:
- f4m_url = f4m_node.attrib['src']
- if 'manifest.f4m?' not in f4m_url:
- f4m_url += '?'
- # the parameters are from syfy.com, other sites may use others,
- # they also work for nbc.com
- f4m_url += '&g=UXWGVKRWHFSP&hdcore=3.0.3'
- formats = self._extract_f4m_formats(f4m_url, video_id)
- else:
- formats = []
- switch = body.find(_x('smil:switch'))
- if switch is not None:
- base_url = head.find(_x('smil:meta')).attrib['base']
- for f in switch.findall(_x('smil:video')):
- attr = f.attrib
- width = int(attr['width'])
- height = int(attr['height'])
- vbr = int(attr['system-bitrate']) // 1000
- format_id = '%dx%d_%dk' % (width, height, vbr)
- formats.append({
- 'format_id': format_id,
- 'url': base_url,
- 'play_path': 'mp4:' + attr['src'],
- 'ext': 'flv',
- 'width': width,
- 'height': height,
- 'vbr': vbr,
- })
- else:
- switch = body.find(_x('smil:seq//smil:switch'))
- for f in switch.findall(_x('smil:video')):
- attr = f.attrib
- vbr = int(attr['system-bitrate']) // 1000
- ext = determine_ext(attr['src'])
- if ext == 'once':
- ext = 'mp4'
- formats.append({
- 'format_id': compat_str(vbr),
- 'url': attr['src'],
- 'vbr': vbr,
- 'ext': ext,
- })
- self._sort_formats(formats)
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
- return {
+ video_id = mobj.group('id')
+ provider_id = mobj.group('provider_id')
+ feed_id = mobj.group('feed_id')
+
+ real_url = self._URL_TEMPLATE % (self.http_scheme(), provider_id, feed_id, video_id)
+ feed = self._download_json(real_url, video_id)
+ entry = feed['entries'][0]
+
+ formats = []
+ subtitles = {}
+ first_video_id = None
+ duration = None
+ for item in entry['media$content']:
+ smil_url = item['plfile$url'] + '&format=SMIL&Tracking=true&Embedded=true&formats=MPEG4,F4M'
+ cur_video_id = url_basename(smil_url)
+ if first_video_id is None:
+ first_video_id = cur_video_id
+ duration = float_or_none(item.get('plfile$duration'))
+ cur_formats, cur_subtitles = self._extract_theplatform_smil(smil_url, video_id, 'Downloading SMIL data for %s' % cur_video_id)
+ formats.extend(cur_formats)
+ subtitles = self._merge_subtitles(subtitles, cur_subtitles)
+
+ self._sort_formats(formats)
+
+ thumbnails = [{
+ 'url': thumbnail['plfile$url'],
+ 'width': int_or_none(thumbnail.get('plfile$width')),
+ 'height': int_or_none(thumbnail.get('plfile$height')),
+ } for thumbnail in entry.get('media$thumbnails', [])]
+
+ timestamp = int_or_none(entry.get('media$availableDate'), scale=1000)
+ categories = [item['media$name'] for item in entry.get('media$categories', [])]
+
+ ret = self.get_metadata('%s/%s' % (provider_id, first_video_id), video_id)
+ subtitles = self._merge_subtitles(subtitles, ret['subtitles'])
+ ret.update({
'id': video_id,
- 'title': info['title'],
- 'subtitles': subtitles,
'formats': formats,
- 'description': info['description'],
- 'thumbnail': info['defaultThumbnailUrl'],
- 'duration': info['duration'] // 1000,
- }
+ 'subtitles': subtitles,
+ 'thumbnails': thumbnails,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'categories': categories,
+ })
+
+ return ret
diff --git a/youtube_dl/extractor/thesixtyone.py b/youtube_dl/extractor/thesixtyone.py
index a77c6a2fc..5d09eb9a8 100644
--- a/youtube_dl/extractor/thesixtyone.py
+++ b/youtube_dl/extractor/thesixtyone.py
@@ -1,9 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import json
-import re
-
from .common import InfoExtractor
from ..utils import unified_strdate
@@ -17,7 +14,7 @@ class TheSixtyOneIE(InfoExtractor):
song
)/(?P<id>[A-Za-z0-9]+)/?$'''
_SONG_URL_TEMPLATE = 'http://thesixtyone.com/s/{0:}'
- _SONG_FILE_URL_TEMPLATE = 'http://{audio_server:}.thesixtyone.com/thesixtyone_production/audio/{0:}_stream'
+ _SONG_FILE_URL_TEMPLATE = 'http://{audio_server:}/thesixtyone_production/audio/{0:}_stream'
_THUMBNAIL_URL_TEMPLATE = '{photo_base_url:}_desktop'
_TESTS = [
{
@@ -70,14 +67,19 @@ class TheSixtyOneIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- song_id = mobj.group('id')
+ song_id = self._match_id(url)
webpage = self._download_webpage(
self._SONG_URL_TEMPLATE.format(song_id), song_id)
- song_data = json.loads(self._search_regex(
- r'"%s":\s(\{.*?\})' % song_id, webpage, 'song_data'))
+ song_data = self._parse_json(self._search_regex(
+ r'"%s":\s(\{.*?\})' % song_id, webpage, 'song_data'), song_id)
+
+ if self._search_regex(r'(t61\.s3_audio_load\s*=\s*1\.0;)', webpage, 's3_audio_load marker', default=None):
+ song_data['audio_server'] = 's3.amazonaws.com'
+ else:
+ song_data['audio_server'] = song_data['audio_server'] + '.thesixtyone.com'
+
keys = [self._DECODE_MAP.get(s, s) for s in song_data['key']]
url = self._SONG_FILE_URL_TEMPLATE.format(
"".join(reversed(keys)), **song_data)
diff --git a/youtube_dl/extractor/thisamericanlife.py b/youtube_dl/extractor/thisamericanlife.py
new file mode 100644
index 000000000..36493a5de
--- /dev/null
+++ b/youtube_dl/extractor/thisamericanlife.py
@@ -0,0 +1,40 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class ThisAmericanLifeIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?thisamericanlife\.org/(?:radio-archives/episode/|play_full\.php\?play=)(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'http://www.thisamericanlife.org/radio-archives/episode/487/harper-high-school-part-one',
+ 'md5': '8f7d2da8926298fdfca2ee37764c11ce',
+ 'info_dict': {
+ 'id': '487',
+ 'ext': 'm4a',
+ 'title': '487: Harper High School, Part One',
+ 'description': 'md5:ee40bdf3fb96174a9027f76dbecea655',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ }, {
+ 'url': 'http://www.thisamericanlife.org/play_full.php?play=487',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(
+ 'http://www.thisamericanlife.org/radio-archives/episode/%s' % video_id, video_id)
+
+ return {
+ 'id': video_id,
+ 'url': 'http://stream.thisamericanlife.org/{0}/stream/{0}_64k.m3u8'.format(video_id),
+ 'protocol': 'm3u8_native',
+ 'ext': 'm4a',
+ 'acodec': 'aac',
+ 'vcodec': 'none',
+ 'abr': 64,
+ 'title': self._html_search_meta(r'twitter:title', webpage, 'title', fatal=True),
+ 'description': self._html_search_meta(r'description', webpage, 'description'),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ }
diff --git a/youtube_dl/extractor/tlc.py b/youtube_dl/extractor/tlc.py
index 9f9e388c5..d6d038a8d 100644
--- a/youtube_dl/extractor/tlc.py
+++ b/youtube_dl/extractor/tlc.py
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from .brightcove import BrightcoveIE
+from .brightcove import BrightcoveLegacyIE
from .discovery import DiscoveryIE
from ..compat import compat_urlparse
@@ -12,17 +12,22 @@ class TlcIE(DiscoveryIE):
IE_NAME = 'tlc.com'
_VALID_URL = r'http://www\.tlc\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9\-]*)(.htm)?'
- _TEST = {
+ # DiscoveryIE has _TESTS
+ _TESTS = [{
'url': 'http://www.tlc.com/tv-shows/cake-boss/videos/too-big-to-fly.htm',
- 'md5': 'c4038f4a9b44d0b5d74caaa64ed2a01a',
'info_dict': {
- 'id': '853232',
+ 'id': '104493',
'ext': 'mp4',
- 'title': 'Cake Boss: Too Big to Fly',
+ 'title': 'Too Big to Fly',
'description': 'Buddy has taken on a high flying task.',
'duration': 119,
+ 'timestamp': 1393365060,
+ 'upload_date': '20140225',
},
- }
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ },
+ }]
class TlcDeIE(InfoExtractor):
@@ -61,6 +66,6 @@ class TlcDeIE(InfoExtractor):
return {
'_type': 'url',
- 'url': BrightcoveIE._extract_brightcove_url(iframe),
- 'ie': BrightcoveIE.ie_key(),
+ 'url': BrightcoveLegacyIE._extract_brightcove_url(iframe),
+ 'ie': BrightcoveLegacyIE.ie_key(),
}
diff --git a/youtube_dl/extractor/tmz.py b/youtube_dl/extractor/tmz.py
index c5c6fdc51..7dbe68b5c 100644
--- a/youtube_dl/extractor/tmz.py
+++ b/youtube_dl/extractor/tmz.py
@@ -30,3 +30,31 @@ class TMZIE(InfoExtractor):
'description': self._og_search_description(webpage),
'thumbnail': self._html_search_meta('ThumbURL', webpage),
}
+
+
+class TMZArticleIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?tmz\.com/\d{4}/\d{2}/\d{2}/(?P<id>[^/]+)/?'
+ _TEST = {
+ 'url': 'http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert',
+ 'md5': 'e482a414a38db73087450e3a6ce69d00',
+ 'info_dict': {
+ 'id': '0_6snoelag',
+ 'ext': 'mp4',
+ 'title': 'Bobby Brown Tells Crowd ... Bobbi Kristina is Awake',
+ 'description': 'Bobby Brown stunned his audience during a concert Saturday night, when he told the crowd, "Bobbi is awake. She\'s watching me."',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+ embedded_video_info_str = self._html_search_regex(
+ r'tmzVideoEmbedV2\("([^)]+)"\);', webpage, 'embedded video info')
+
+ embedded_video_info = self._parse_json(
+ embedded_video_info_str, video_id,
+ transform_source=lambda s: s.replace('\\', ''))
+
+ return self.url_result(
+ 'http://www.tmz.com/videos/%s/' % embedded_video_info['id'])
diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py
index d48cbbf14..49516abca 100644
--- a/youtube_dl/extractor/tnaflix.py
+++ b/youtube_dl/extractor/tnaflix.py
@@ -3,33 +3,70 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_str
from ..utils import (
- parse_duration,
fix_xml_ampersands,
+ float_or_none,
+ int_or_none,
+ parse_duration,
+ str_to_int,
+ xpath_text,
)
-class TNAFlixIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/(?P<cat_id>[\w-]+)/(?P<display_id>[\w-]+)/video(?P<id>\d+)'
+class TNAFlixNetworkBaseIE(InfoExtractor):
+ # May be overridden in descendants if necessary
+ _CONFIG_REGEX = [
+ r'flashvars\.config\s*=\s*escape\("([^"]+)"',
+ r'<input[^>]+name="config\d?" value="([^"]+)"',
+ ]
+ _TITLE_REGEX = r'<input[^>]+name="title" value="([^"]+)"'
+ _DESCRIPTION_REGEX = r'<input[^>]+name="description" value="([^"]+)"'
+ _UPLOADER_REGEX = r'<input[^>]+name="username" value="([^"]+)"'
+ _VIEW_COUNT_REGEX = None
+ _COMMENT_COUNT_REGEX = None
+ _AVERAGE_RATING_REGEX = None
+ _CATEGORIES_REGEX = r'<li[^>]*>\s*<span[^>]+class="infoTitle"[^>]*>Categories:</span>\s*<span[^>]+class="listView"[^>]*>(.+?)</span>\s*</li>'
- _TITLE_REGEX = r'<title>(.+?) - TNAFlix Porn Videos</title>'
- _DESCRIPTION_REGEX = r'<h3 itemprop="description">([^<]+)</h3>'
- _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"'
+ def _extract_thumbnails(self, flix_xml):
- _TEST = {
- 'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878',
- 'md5': 'ecf3498417d09216374fc5907f9c6ec0',
- 'info_dict': {
- 'id': '553878',
- 'display_id': 'Carmella-Decesare-striptease',
- 'ext': 'mp4',
- 'title': 'Carmella Decesare - striptease',
- 'description': '',
- 'thumbnail': 're:https?://.*\.jpg$',
- 'duration': 91,
- 'age_limit': 18,
- }
- }
+ def get_child(elem, names):
+ for name in names:
+ child = elem.find(name)
+ if child is not None:
+ return child
+
+ timeline = get_child(flix_xml, ['timeline', 'rolloverBarImage'])
+ if timeline is None:
+ return
+
+ pattern_el = get_child(timeline, ['imagePattern', 'pattern'])
+ if pattern_el is None or not pattern_el.text:
+ return
+
+ first_el = get_child(timeline, ['imageFirst', 'first'])
+ last_el = get_child(timeline, ['imageLast', 'last'])
+ if first_el is None or last_el is None:
+ return
+
+ first_text = first_el.text
+ last_text = last_el.text
+ if not first_text.isdigit() or not last_text.isdigit():
+ return
+
+ first = int(first_text)
+ last = int(last_text)
+ if first > last:
+ return
+
+ width = int_or_none(xpath_text(timeline, './imageWidth', 'thumbnail width'))
+ height = int_or_none(xpath_text(timeline, './imageHeight', 'thumbnail height'))
+
+ return [{
+ 'url': self._proto_relative_url(pattern_el.text.replace('#', compat_str(i)), 'http:'),
+ 'width': width,
+ 'height': height,
+ } for i in range(first, last + 1)]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -38,47 +75,195 @@ class TNAFlixIE(InfoExtractor):
webpage = self._download_webpage(url, display_id)
- title = self._html_search_regex(
- self._TITLE_REGEX, webpage, 'title') if self._TITLE_REGEX else self._og_search_title(webpage)
- description = self._html_search_regex(
- self._DESCRIPTION_REGEX, webpage, 'description', fatal=False, default='')
-
- age_limit = self._rta_search(webpage)
-
- duration = self._html_search_meta('duration', webpage, 'duration', default=None)
- if duration:
- duration = parse_duration(duration[1:])
-
cfg_url = self._proto_relative_url(self._html_search_regex(
self._CONFIG_REGEX, webpage, 'flashvars.config'), 'http:')
cfg_xml = self._download_xml(
- cfg_url, display_id, note='Downloading metadata',
+ cfg_url, display_id, 'Downloading metadata',
transform_source=fix_xml_ampersands)
- thumbnail = cfg_xml.find('./startThumb').text
-
formats = []
+
+ def extract_video_url(vl):
+ return re.sub('speed=\d+', 'speed=', vl.text)
+
+ video_link = cfg_xml.find('./videoLink')
+ if video_link is not None:
+ formats.append({
+ 'url': extract_video_url(video_link),
+ 'ext': xpath_text(cfg_xml, './videoConfig/type', 'type', default='flv'),
+ })
+
for item in cfg_xml.findall('./quality/item'):
- video_url = re.sub('speed=\d+', 'speed=', item.find('videoLink').text)
- format_id = item.find('res').text
- fmt = {
- 'url': video_url,
+ video_link = item.find('./videoLink')
+ if video_link is None:
+ continue
+ res = item.find('res')
+ format_id = None if res is None else res.text
+ height = int_or_none(self._search_regex(
+ r'^(\d+)[pP]', format_id, 'height', default=None))
+ formats.append({
+ 'url': self._proto_relative_url(extract_video_url(video_link), 'http:'),
'format_id': format_id,
- }
- m = re.search(r'^(\d+)', format_id)
- if m:
- fmt['height'] = int(m.group(1))
- formats.append(fmt)
+ 'height': height,
+ })
+
self._sort_formats(formats)
+ thumbnail = self._proto_relative_url(
+ xpath_text(cfg_xml, './startThumb', 'thumbnail'), 'http:')
+ thumbnails = self._extract_thumbnails(cfg_xml)
+
+ title = self._html_search_regex(
+ self._TITLE_REGEX, webpage, 'title') if self._TITLE_REGEX else self._og_search_title(webpage)
+
+ age_limit = self._rta_search(webpage)
+
+ duration = parse_duration(self._html_search_meta(
+ 'duration', webpage, 'duration', default=None))
+
+ def extract_field(pattern, name):
+ return self._html_search_regex(pattern, webpage, name, default=None) if pattern else None
+
+ description = extract_field(self._DESCRIPTION_REGEX, 'description')
+ uploader = extract_field(self._UPLOADER_REGEX, 'uploader')
+ view_count = str_to_int(extract_field(self._VIEW_COUNT_REGEX, 'view count'))
+ comment_count = str_to_int(extract_field(self._COMMENT_COUNT_REGEX, 'comment count'))
+ average_rating = float_or_none(extract_field(self._AVERAGE_RATING_REGEX, 'average rating'))
+
+ categories_str = extract_field(self._CATEGORIES_REGEX, 'categories')
+ categories = categories_str.split(', ') if categories_str is not None else []
+
return {
'id': video_id,
'display_id': display_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
+ 'thumbnails': thumbnails,
'duration': duration,
'age_limit': age_limit,
+ 'uploader': uploader,
+ 'view_count': view_count,
+ 'comment_count': comment_count,
+ 'average_rating': average_rating,
+ 'categories': categories,
'formats': formats,
}
+
+
+class TNAFlixIE(TNAFlixNetworkBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)'
+
+ _TITLE_REGEX = r'<title>(.+?) - TNAFlix Porn Videos</title>'
+ _DESCRIPTION_REGEX = r'<h3 itemprop="description">([^<]+)</h3>'
+ _UPLOADER_REGEX = r'(?s)<span[^>]+class="infoTitle"[^>]*>Uploaded By:</span>(.+?)<div'
+
+ _TESTS = [{
+ # anonymous uploader, no categories
+ 'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878',
+ 'md5': 'ecf3498417d09216374fc5907f9c6ec0',
+ 'info_dict': {
+ 'id': '553878',
+ 'display_id': 'Carmella-Decesare-striptease',
+ 'ext': 'mp4',
+ 'title': 'Carmella Decesare - striptease',
+ 'thumbnail': 're:https?://.*\.jpg$',
+ 'duration': 91,
+ 'age_limit': 18,
+ 'uploader': 'Anonymous',
+ 'categories': [],
+ }
+ }, {
+ # non-anonymous uploader, categories
+ 'url': 'https://www.tnaflix.com/teen-porn/Educational-xxx-video/video6538',
+ 'md5': '0f5d4d490dbfd117b8607054248a07c0',
+ 'info_dict': {
+ 'id': '6538',
+ 'display_id': 'Educational-xxx-video',
+ 'ext': 'mp4',
+ 'title': 'Educational xxx video',
+ 'description': 'md5:b4fab8f88a8621c8fabd361a173fe5b8',
+ 'thumbnail': 're:https?://.*\.jpg$',
+ 'duration': 164,
+ 'age_limit': 18,
+ 'uploader': 'bobwhite39',
+ 'categories': ['Amateur Porn', 'Squirting Videos', 'Teen Girls 18+'],
+ }
+ }, {
+ 'url': 'https://www.tnaflix.com/amateur-porn/bunzHD-Ms.Donk/video358632',
+ 'only_matching': True,
+ }]
+
+
+class EMPFlixIE(TNAFlixNetworkBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?empflix\.com/videos/(?P<display_id>.+?)-(?P<id>[0-9]+)\.html'
+
+ _UPLOADER_REGEX = r'<span[^>]+class="infoTitle"[^>]*>Uploaded By:</span>(.+?)</li>'
+
+ _TESTS = [{
+ 'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html',
+ 'md5': 'b1bc15b6412d33902d6e5952035fcabc',
+ 'info_dict': {
+ 'id': '33051',
+ 'display_id': 'Amateur-Finger-Fuck',
+ 'ext': 'mp4',
+ 'title': 'Amateur Finger Fuck',
+ 'description': 'Amateur solo finger fucking.',
+ 'thumbnail': 're:https?://.*\.jpg$',
+ 'duration': 83,
+ 'age_limit': 18,
+ 'uploader': 'cwbike',
+ 'categories': ['Amateur', 'Anal', 'Fisting', 'Home made', 'Solo'],
+ }
+ }, {
+ 'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html',
+ 'only_matching': True,
+ }]
+
+
+class MovieFapIE(TNAFlixNetworkBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?moviefap\.com/videos/(?P<id>[0-9a-f]+)/(?P<display_id>[^/]+)\.html'
+
+ _VIEW_COUNT_REGEX = r'<br>Views\s*<strong>([\d,.]+)</strong>'
+ _COMMENT_COUNT_REGEX = r'<span[^>]+id="comCount"[^>]*>([\d,.]+)</span>'
+ _AVERAGE_RATING_REGEX = r'Current Rating\s*<br>\s*<strong>([\d.]+)</strong>'
+ _CATEGORIES_REGEX = r'(?s)<div[^>]+id="vid_info"[^>]*>\s*<div[^>]*>.+?</div>(.*?)<br>'
+
+ _TESTS = [{
+ # normal, multi-format video
+ 'url': 'http://www.moviefap.com/videos/be9867c9416c19f54a4a/experienced-milf-amazing-handjob.html',
+ 'md5': '26624b4e2523051b550067d547615906',
+ 'info_dict': {
+ 'id': 'be9867c9416c19f54a4a',
+ 'display_id': 'experienced-milf-amazing-handjob',
+ 'ext': 'mp4',
+ 'title': 'Experienced MILF Amazing Handjob',
+ 'description': 'Experienced MILF giving an Amazing Handjob',
+ 'thumbnail': 're:https?://.*\.jpg$',
+ 'age_limit': 18,
+ 'uploader': 'darvinfred06',
+ 'view_count': int,
+ 'comment_count': int,
+ 'average_rating': float,
+ 'categories': ['Amateur', 'Masturbation', 'Mature', 'Flashing'],
+ }
+ }, {
+ # quirky single-format case where the extension is given as fid, but the video is really an flv
+ 'url': 'http://www.moviefap.com/videos/e5da0d3edce5404418f5/jeune-couple-russe.html',
+ 'md5': 'fa56683e291fc80635907168a743c9ad',
+ 'info_dict': {
+ 'id': 'e5da0d3edce5404418f5',
+ 'display_id': 'jeune-couple-russe',
+ 'ext': 'flv',
+ 'title': 'Jeune Couple Russe',
+ 'description': 'Amateur',
+ 'thumbnail': 're:https?://.*\.jpg$',
+ 'age_limit': 18,
+ 'uploader': 'whiskeyjar',
+ 'view_count': int,
+ 'comment_count': int,
+ 'average_rating': float,
+ 'categories': ['Amateur', 'Teen'],
+ }
+ }]
diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py
index d73ad3762..46ef61ff5 100644
--- a/youtube_dl/extractor/tube8.py
+++ b/youtube_dl/extractor/tube8.py
@@ -4,12 +4,10 @@ import json
import re
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse_urlparse,
- compat_urllib_request,
-)
+from ..compat import compat_urllib_parse_urlparse
from ..utils import (
int_or_none,
+ sanitized_Request,
str_to_int,
)
from ..aes import aes_decrypt_text
@@ -42,12 +40,12 @@ class Tube8IE(InfoExtractor):
video_id = mobj.group('id')
display_id = mobj.group('display_id')
- req = compat_urllib_request.Request(url)
+ req = sanitized_Request(url)
req.add_header('Cookie', 'age_verified=1')
webpage = self._download_webpage(req, display_id)
flashvars = json.loads(self._html_search_regex(
- r'var flashvars\s*=\s*({.+?})', webpage, 'flashvars'))
+ r'flashvars\s*=\s*({.+?});\r?\n', webpage, 'flashvars'))
video_url = flashvars['video_url']
if flashvars.get('encrypted') is True:
@@ -58,19 +56,19 @@ class Tube8IE(InfoExtractor):
thumbnail = flashvars.get('image_url')
title = self._html_search_regex(
- r'videotitle\s*=\s*"([^"]+)', webpage, 'title')
+ r'videoTitle\s*=\s*"([^"]+)', webpage, 'title')
description = self._html_search_regex(
- r'>Description:</strong>(.+?)<', webpage, 'description', fatal=False)
+ r'>Description:</strong>\s*(.+?)\s*<', webpage, 'description', fatal=False)
uploader = self._html_search_regex(
- r'<strong class="video-username">(?:<a href="[^"]+">)?([^<]+)(?:</a>)?</strong>',
+ r'<span class="username">\s*(.+?)\s*<',
webpage, 'uploader', fatal=False)
like_count = int_or_none(self._html_search_regex(
- r"rupVar\s*=\s*'(\d+)'", webpage, 'like count', fatal=False))
+ r'rupVar\s*=\s*"(\d+)"', webpage, 'like count', fatal=False))
dislike_count = int_or_none(self._html_search_regex(
- r"rdownVar\s*=\s*'(\d+)'", webpage, 'dislike count', fatal=False))
+ r'rdownVar\s*=\s*"(\d+)"', webpage, 'dislike count', fatal=False))
view_count = self._html_search_regex(
- r'<strong>Views: </strong>([\d,\.]+)</li>', webpage, 'view count', fatal=False)
+ r'<strong>Views: </strong>([\d,\.]+)\s*</li>', webpage, 'view count', fatal=False)
if view_count:
view_count = str_to_int(view_count)
comment_count = self._html_search_regex(
diff --git a/youtube_dl/extractor/tubitv.py b/youtube_dl/extractor/tubitv.py
new file mode 100644
index 000000000..6d78b5dfe
--- /dev/null
+++ b/youtube_dl/extractor/tubitv.py
@@ -0,0 +1,80 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import codecs
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ sanitized_Request,
+)
+
+
+class TubiTvIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?tubitv\.com/video\?id=(?P<id>[0-9]+)'
+ _LOGIN_URL = 'http://tubitv.com/login'
+ _NETRC_MACHINE = 'tubitv'
+ _TEST = {
+ 'url': 'http://tubitv.com/video?id=54411&title=The_Kitchen_Musical_-_EP01',
+ 'info_dict': {
+ 'id': '54411',
+ 'ext': 'mp4',
+ 'title': 'The Kitchen Musical - EP01',
+ 'thumbnail': 're:^https?://.*\.png$',
+ 'description': 'md5:37532716166069b353e8866e71fefae7',
+ 'duration': 2407,
+ },
+ 'params': {
+ 'skip_download': 'HLS download',
+ },
+ }
+
+ def _login(self):
+ (username, password) = self._get_login_info()
+ if username is None:
+ return
+ self.report_login()
+ form_data = {
+ 'username': username,
+ 'password': password,
+ }
+ payload = compat_urllib_parse.urlencode(form_data).encode('utf-8')
+ request = sanitized_Request(self._LOGIN_URL, payload)
+ request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+ login_page = self._download_webpage(
+ request, None, False, 'Wrong login info')
+ if not re.search(r'id="tubi-logout"', login_page):
+ raise ExtractorError(
+ 'Login failed (invalid username/password)', expected=True)
+
+ def _real_initialize(self):
+ self._login()
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+ if re.search(r"<(?:DIV|div) class='login-required-screen'>", webpage):
+ self.raise_login_required('This video requires login')
+
+ title = self._og_search_title(webpage)
+ description = self._og_search_description(webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
+ duration = int_or_none(self._html_search_meta(
+ 'video:duration', webpage, 'duration'))
+
+ apu = self._search_regex(r"apu='([^']+)'", webpage, 'apu')
+ m3u8_url = codecs.decode(apu, 'rot_13')[::-1]
+ formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': thumbnail,
+ 'description': description,
+ 'duration': duration,
+ }
diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py
index c89de5ba4..5f7ac4b35 100644
--- a/youtube_dl/extractor/tudou.py
+++ b/youtube_dl/extractor/tudou.py
@@ -2,14 +2,12 @@
from __future__ import unicode_literals
-import re
-import json
-
from .common import InfoExtractor
+from ..compat import compat_str
class TudouIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:listplay|programs(?:/view)?|albumplay)/.*?/(?P<id>[^/?#]+?)(?:\.html)?/?(?:$|[?#])'
+ _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:listplay|programs(?:/view)?|albumplay)/([^/]+/)*(?P<id>[^/?#]+?)(?:\.html)?/?(?:$|[?#])'
_TESTS = [{
'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',
'md5': '140a49ed444bd22f93330985d8475fcb',
@@ -27,35 +25,41 @@ class TudouIE(InfoExtractor):
'title': 'La Sylphide-Bolshoi-Ekaterina Krysanova & Vyacheslav Lopatin 2012',
'thumbnail': 're:^https?://.*\.jpg$',
}
+ }, {
+ 'url': 'http://www.tudou.com/albumplay/cJAHGih4yYg.html',
+ 'only_matching': True,
}]
- def _url_for_id(self, id, quality=None):
- info_url = "http://v2.tudou.com/f?id=" + str(id)
+ _PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf'
+
+ def _url_for_id(self, video_id, quality=None):
+ info_url = 'http://v2.tudou.com/f?id=' + compat_str(video_id)
if quality:
info_url += '&hd' + quality
- webpage = self._download_webpage(info_url, id, "Opening the info webpage")
- final_url = self._html_search_regex('>(.+?)</f>', webpage, 'video url')
+ xml_data = self._download_xml(info_url, video_id, "Opening the info XML page")
+ final_url = xml_data.text
return final_url
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- m = re.search(r'vcode:\s*[\'"](.+?)[\'"]', webpage)
- if m and m.group(1):
- return {
- '_type': 'url',
- 'url': 'youku:' + m.group(1),
- 'ie_key': 'Youku'
- }
+ youku_vcode = self._search_regex(
+ r'vcode\s*:\s*[\'"]([^\'"]*)[\'"]', webpage, 'youku vcode', default=None)
+ if youku_vcode:
+ return self.url_result('youku:' + youku_vcode, ie='Youku')
title = self._search_regex(
- r",kw:\s*['\"](.+?)[\"']", webpage, 'title')
+ r',kw\s*:\s*[\'"]([^\'"]+)[\'"]', webpage, 'title')
thumbnail_url = self._search_regex(
- r",pic:\s*[\"'](.+?)[\"']", webpage, 'thumbnail URL', fatal=False)
+ r',pic\s*:\s*[\'"]([^\'"]+)[\'"]', webpage, 'thumbnail URL', fatal=False)
+
+ player_url = self._search_regex(
+ r'playerUrl\s*:\s*[\'"]([^\'"]+\.swf)[\'"]',
+ webpage, 'player URL', default=self._PLAYER_URL)
- segs_json = self._search_regex(r'segs: \'(.*)\'', webpage, 'segments')
- segments = json.loads(segs_json)
+ segments = self._parse_json(self._search_regex(
+ r'segs: \'([^\']+)\'', webpage, 'segments'), video_id)
# It looks like the keys are the arguments that have to be passed as
# the hd field in the request url, we pick the higher
# Also, filter non-number qualities (see issue #3643).
@@ -76,6 +80,9 @@ class TudouIE(InfoExtractor):
'ext': ext,
'title': title,
'thumbnail': thumbnail_url,
+ 'http_headers': {
+ 'Referer': player_url,
+ },
}
result.append(part_info)
diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py
index 2a1ae5a71..4f844706d 100644
--- a/youtube_dl/extractor/tumblr.py
+++ b/youtube_dl/extractor/tumblr.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..utils import int_or_none
class TumblrIE(InfoExtractor):
@@ -28,6 +29,44 @@ class TumblrIE(InfoExtractor):
'description': 'md5:dba62ac8639482759c8eb10ce474586a',
'thumbnail': 're:http://.*\.jpg',
}
+ }, {
+ 'url': 'http://hdvideotest.tumblr.com/post/130323439814/test-description-for-my-hd-video',
+ 'md5': '7ae503065ad150122dc3089f8cf1546c',
+ 'info_dict': {
+ 'id': '130323439814',
+ 'ext': 'mp4',
+ 'title': 'HD Video Testing \u2014 Test description for my HD video',
+ 'description': 'md5:97cc3ab5fcd27ee4af6356701541319c',
+ 'thumbnail': 're:http://.*\.jpg',
+ },
+ 'params': {
+ 'format': 'hd',
+ },
+ }, {
+ 'url': 'http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching',
+ 'md5': 'de07e5211d60d4f3a2c3df757ea9f6ab',
+ 'info_dict': {
+ 'id': 'Wmur',
+ 'ext': 'mp4',
+ 'title': 'naked smoking & stretching',
+ 'upload_date': '20150506',
+ 'timestamp': 1430931613,
+ 'age_limit': 18,
+ 'uploader_id': '1638622',
+ 'uploader': 'naked-yogi',
+ },
+ 'add_ie': ['Vidme'],
+ }, {
+ 'url': 'http://camdamage.tumblr.com/post/98846056295/',
+ 'md5': 'a9e0c8371ea1ca306d6554e3fecf50b6',
+ 'info_dict': {
+ 'id': '105463834',
+ 'ext': 'mp4',
+ 'title': 'Cam Damage-HD 720p',
+ 'uploader': 'John Moyer',
+ 'uploader_id': 'user32021558',
+ },
+ 'add_ie': ['Vimeo'],
}]
def _real_extract(self, url):
@@ -36,14 +75,46 @@ class TumblrIE(InfoExtractor):
blog = m_url.group('blog_name')
url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
- webpage = self._download_webpage(url, video_id)
+ webpage, urlh = self._download_webpage_handle(url, video_id)
iframe_url = self._search_regex(
r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'',
- webpage, 'iframe url')
- iframe = self._download_webpage(iframe_url, video_id)
- video_url = self._search_regex(r'<source src="([^"]+)"',
- iframe, 'video url')
+ webpage, 'iframe url', default=None)
+ if iframe_url is None:
+ return self.url_result(urlh.geturl(), 'Generic')
+
+ iframe = self._download_webpage(iframe_url, video_id, 'Downloading iframe page')
+
+ duration = None
+ sources = []
+
+ sd_url = self._search_regex(
+ r'<source[^>]+src=(["\'])(?P<url>.+?)\1', iframe,
+ 'sd video url', default=None, group='url')
+ if sd_url:
+ sources.append((sd_url, 'sd'))
+
+ options = self._parse_json(
+ self._search_regex(
+ r'data-crt-options=(["\'])(?P<options>.+?)\1', iframe,
+ 'hd video url', default='', group='options'),
+ video_id, fatal=False)
+ if options:
+ duration = int_or_none(options.get('duration'))
+ hd_url = options.get('hdUrl')
+ if hd_url:
+ sources.append((hd_url, 'hd'))
+
+ formats = [{
+ 'url': video_url,
+ 'ext': 'mp4',
+ 'format_id': format_id,
+ 'height': int_or_none(self._search_regex(
+ r'/(\d{3,4})$', video_url, 'height', default=None)),
+ 'quality': quality,
+ } for quality, (video_url, format_id) in enumerate(sources)]
+
+ self._sort_formats(formats)
# The only place where you can get a title, it's not complete,
# but searching in other places doesn't work for all videos
@@ -53,9 +124,9 @@ class TumblrIE(InfoExtractor):
return {
'id': video_id,
- 'url': video_url,
- 'ext': 'mp4',
'title': video_title,
- 'description': self._og_search_description(webpage),
- 'thumbnail': self._og_search_thumbnail(webpage),
+ 'description': self._og_search_description(webpage, default=None),
+ 'thumbnail': self._og_search_thumbnail(webpage, default=None),
+ 'duration': duration,
+ 'formats': formats,
}
diff --git a/youtube_dl/extractor/turbo.py b/youtube_dl/extractor/turbo.py
index 29703a8a9..7ae63a499 100644
--- a/youtube_dl/extractor/turbo.py
+++ b/youtube_dl/extractor/turbo.py
@@ -23,7 +23,7 @@ class TurboIE(InfoExtractor):
'ext': 'mp4',
'duration': 3715,
'title': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia... ',
- 'description': 'Retrouvez dans cette rubrique toutes les vidéos de l\'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia... ',
+ 'description': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia...',
'thumbnail': 're:^https?://.*\.jpg$',
}
}
@@ -42,7 +42,7 @@ class TurboIE(InfoExtractor):
title = xpath_text(item, './title', 'title')
duration = int_or_none(xpath_text(item, './durate', 'duration'))
thumbnail = xpath_text(item, './visuel_clip', 'thumbnail')
- description = self._og_search_description(webpage)
+ description = self._html_search_meta('description', webpage)
formats = []
get_quality = qualities(['3g', 'sd', 'hq'])
diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py
index 4de0aac52..822372ea1 100644
--- a/youtube_dl/extractor/tutv.py
+++ b/youtube_dl/extractor/tutv.py
@@ -10,10 +10,10 @@ class TutvIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?tu\.tv/videos/(?P<id>[^/?]+)'
_TEST = {
'url': 'http://tu.tv/videos/robots-futbolistas',
- 'md5': '627c7c124ac2a9b5ab6addb94e0e65f7',
+ 'md5': '0cd9e28ad270488911b0d2a72323395d',
'info_dict': {
'id': '2973058',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Robots futbolistas',
},
}
@@ -26,7 +26,7 @@ class TutvIE(InfoExtractor):
data_content = self._download_webpage(
'http://tu.tv/flvurl.php?codVideo=%s' % internal_id, video_id, 'Downloading video info')
- video_url = base64.b64decode(compat_parse_qs(data_content)['kpt'][0]).decode('utf-8')
+ video_url = base64.b64decode(compat_parse_qs(data_content)['kpt'][0].encode('utf-8')).decode('utf-8')
return {
'id': internal_id,
diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py
new file mode 100644
index 000000000..fa338b936
--- /dev/null
+++ b/youtube_dl/extractor/tv2.py
@@ -0,0 +1,126 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ int_or_none,
+ float_or_none,
+ parse_iso8601,
+ remove_end,
+)
+
+
+class TV2IE(InfoExtractor):
+ _VALID_URL = 'http://(?:www\.)?tv2\.no/v/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://www.tv2.no/v/916509/',
+ 'md5': '9cb9e3410b18b515d71892f27856e9b1',
+ 'info_dict': {
+ 'id': '916509',
+ 'ext': 'flv',
+ 'title': 'Se Gryttens hyllest av Steven Gerrard',
+ 'description': 'TV 2 Sportens huspoet tar avskjed med Liverpools kaptein Steven Gerrard.',
+ 'timestamp': 1431715610,
+ 'upload_date': '20150515',
+ 'duration': 156.967,
+ 'view_count': int,
+ 'categories': list,
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ formats = []
+ format_urls = []
+ for protocol in ('HDS', 'HLS'):
+ data = self._download_json(
+ 'http://sumo.tv2.no/api/web/asset/%s/play.json?protocol=%s&videoFormat=SMIL+ISMUSP' % (video_id, protocol),
+ video_id, 'Downloading play JSON')['playback']
+ for item in data['items']['item']:
+ video_url = item.get('url')
+ if not video_url or video_url in format_urls:
+ continue
+ format_id = '%s-%s' % (protocol.lower(), item.get('mediaFormat'))
+ if not self._is_valid_url(video_url, video_id, format_id):
+ continue
+ format_urls.append(video_url)
+ ext = determine_ext(video_url)
+ if ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ video_url, video_id, f4m_id=format_id))
+ elif ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ video_url, video_id, 'mp4', m3u8_id=format_id))
+ elif ext == 'ism' or video_url.endswith('.ism/Manifest'):
+ pass
+ else:
+ formats.append({
+ 'url': video_url,
+ 'format_id': format_id,
+ 'tbr': int_or_none(item.get('bitrate')),
+ 'filesize': int_or_none(item.get('fileSize')),
+ })
+ self._sort_formats(formats)
+
+ asset = self._download_json(
+ 'http://sumo.tv2.no/api/web/asset/%s.json' % video_id,
+ video_id, 'Downloading metadata JSON')['asset']
+
+ title = asset['title']
+ description = asset.get('description')
+ timestamp = parse_iso8601(asset.get('createTime'))
+ duration = float_or_none(asset.get('accurateDuration') or asset.get('duration'))
+ view_count = int_or_none(asset.get('views'))
+ categories = asset.get('keywords', '').split(',')
+
+ thumbnails = [{
+ 'id': thumbnail.get('@type'),
+ 'url': thumbnail.get('url'),
+ } for _, thumbnail in asset.get('imageVersions', {}).items()]
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': title,
+ 'description': description,
+ 'thumbnails': thumbnails,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'categories': categories,
+ 'formats': formats,
+ }
+
+
+class TV2ArticleIE(InfoExtractor):
+ _VALID_URL = 'http://(?:www\.)?tv2\.no/(?:a|\d{4}/\d{2}/\d{2}(/[^/]+)+)/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'http://www.tv2.no/2015/05/16/nyheter/alesund/krim/pingvin/6930542',
+ 'info_dict': {
+ 'id': '6930542',
+ 'title': 'Russen hetses etter pingvintyveri – innrømmer å ha åpnet luken på buret',
+ 'description': 'md5:339573779d3eea3542ffe12006190954',
+ },
+ 'playlist_count': 2,
+ }, {
+ 'url': 'http://www.tv2.no/a/6930542',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, playlist_id)
+
+ entries = [
+ self.url_result('http://www.tv2.no/v/%s' % video_id, 'TV2')
+ for video_id in re.findall(r'data-assetid="(\d+)"', webpage)]
+
+ title = remove_end(self._og_search_title(webpage), ' - TV2.no')
+ description = remove_end(self._og_search_description(webpage), ' - TV2.no')
+
+ return self.playlist_result(entries, playlist_id, title, description)
diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py
new file mode 100644
index 000000000..1c4b6d635
--- /dev/null
+++ b/youtube_dl/extractor/tv4.py
@@ -0,0 +1,100 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ parse_iso8601,
+)
+
+
+class TV4IE(InfoExtractor):
+ IE_DESC = 'tv4.se and tv4play.se'
+ _VALID_URL = r'''(?x)https?://(?:www\.)?
+ (?:
+ tv4\.se/(?:[^/]+)/klipp/(?:.*)-|
+ tv4play\.se/
+ (?:
+ (?:program|barn)/(?:[^\?]+)\?video_id=|
+ iframe/video/|
+ film/|
+ sport/|
+ )
+ )(?P<id>[0-9]+)'''
+ _TESTS = [
+ {
+ 'url': 'http://www.tv4.se/kalla-fakta/klipp/kalla-fakta-5-english-subtitles-2491650',
+ 'md5': '909d6454b87b10a25aa04c4bdd416a9b',
+ 'info_dict': {
+ 'id': '2491650',
+ 'ext': 'mp4',
+ 'title': 'Kalla Fakta 5 (english subtitles)',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'timestamp': int,
+ 'upload_date': '20131125',
+ },
+ },
+ {
+ 'url': 'http://www.tv4play.se/iframe/video/3054113',
+ 'md5': '77f851c55139ffe0ebd41b6a5552489b',
+ 'info_dict': {
+ 'id': '3054113',
+ 'ext': 'mp4',
+ 'title': 'Så här jobbar ficktjuvarna - se avslöjande bilder',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'description': 'Unika bilder avslöjar hur turisternas fickor vittjas mitt på Stockholms central. Två experter på ficktjuvarna avslöjar knepen du ska se upp för.',
+ 'timestamp': int,
+ 'upload_date': '20150130',
+ },
+ },
+ {
+ 'url': 'http://www.tv4play.se/sport/3060959',
+ 'only_matching': True,
+ },
+ {
+ 'url': 'http://www.tv4play.se/film/2378136',
+ 'only_matching': True,
+ },
+ {
+ 'url': 'http://www.tv4play.se/barn/looney-tunes?video_id=3062412',
+ 'only_matching': True,
+ },
+ ]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ info = self._download_json(
+ 'http://www.tv4play.se/player/assets/%s.json' % video_id, video_id, 'Downloading video info JSON')
+
+ # If is_geo_restricted is true, it doesn't necessarily mean we can't download it
+ if info['is_geo_restricted']:
+ self.report_warning('This content might not be available in your country due to licensing restrictions.')
+ if info['requires_subscription']:
+ raise ExtractorError('This content requires subscription.', expected=True)
+
+ sources_data = self._download_json(
+ 'https://prima.tv4play.se/api/web/asset/%s/play.json?protocol=http&videoFormat=MP4' % video_id, video_id, 'Downloading sources JSON')
+ sources = sources_data['playback']
+
+ formats = []
+ for item in sources.get('items', {}).get('item', []):
+ ext, bitrate = item['mediaFormat'], item['bitrate']
+ formats.append({
+ 'format_id': '%s_%s' % (ext, bitrate),
+ 'tbr': bitrate,
+ 'ext': ext,
+ 'url': item['url'],
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': info['title'],
+ 'formats': formats,
+ 'description': info.get('description'),
+ 'timestamp': parse_iso8601(info.get('broadcast_date_time')),
+ 'duration': info.get('duration'),
+ 'thumbnail': info.get('image'),
+ 'is_live': sources.get('live'),
+ }
diff --git a/youtube_dl/extractor/tvc.py b/youtube_dl/extractor/tvc.py
new file mode 100644
index 000000000..3a4f393fc
--- /dev/null
+++ b/youtube_dl/extractor/tvc.py
@@ -0,0 +1,109 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ int_or_none,
+)
+
+
+class TVCIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?tvc\.ru/video/iframe/id/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://www.tvc.ru/video/iframe/id/74622/isPlay/false/id_stat/channel/?acc_video_id=/channel/brand/id/17/show/episodes/episode_id/39702',
+ 'md5': 'bbc5ff531d1e90e856f60fc4b3afd708',
+ 'info_dict': {
+ 'id': '74622',
+ 'ext': 'mp4',
+ 'title': 'События. "События". Эфир от 22.05.2015 14:30',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 1122,
+ },
+ }
+
+ @classmethod
+ def _extract_url(cls, webpage):
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:http:)?//(?:www\.)?tvc\.ru/video/iframe/id/[^"]+)\1', webpage)
+ if mobj:
+ return mobj.group('url')
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ video = self._download_json(
+ 'http://www.tvc.ru/video/json/id/%s' % video_id, video_id)
+
+ formats = []
+ for info in video.get('path', {}).get('quality', []):
+ video_url = info.get('url')
+ if not video_url:
+ continue
+ format_id = self._search_regex(
+ r'cdnvideo/([^/]+?)(?:-[^/]+?)?/', video_url,
+ 'format id', default=None)
+ formats.append({
+ 'url': video_url,
+ 'format_id': format_id,
+ 'width': int_or_none(info.get('width')),
+ 'height': int_or_none(info.get('height')),
+ 'tbr': int_or_none(info.get('bitrate')),
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': video['title'],
+ 'thumbnail': video.get('picture'),
+ 'duration': int_or_none(video.get('duration')),
+ 'formats': formats,
+ }
+
+
+class TVCArticleIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?tvc\.ru/(?!video/iframe/id/)(?P<id>[^?#]+)'
+ _TESTS = [{
+ 'url': 'http://www.tvc.ru/channel/brand/id/29/show/episodes/episode_id/39702/',
+ 'info_dict': {
+ 'id': '74622',
+ 'ext': 'mp4',
+ 'title': 'События. "События". Эфир от 22.05.2015 14:30',
+ 'description': 'md5:ad7aa7db22903f983e687b8a3e98c6dd',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 1122,
+ },
+ }, {
+ 'url': 'http://www.tvc.ru/news/show/id/69944',
+ 'info_dict': {
+ 'id': '75399',
+ 'ext': 'mp4',
+ 'title': 'Эксперты: в столице встал вопрос о максимально безопасных остановках',
+ 'description': 'md5:f2098f71e21f309e89f69b525fd9846e',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 278,
+ },
+ }, {
+ 'url': 'http://www.tvc.ru/channel/brand/id/47/show/episodes#',
+ 'info_dict': {
+ 'id': '2185',
+ 'ext': 'mp4',
+ 'title': 'Ещё не поздно. Эфир от 03.08.2013',
+ 'description': 'md5:51fae9f3f8cfe67abce014e428e5b027',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 3316,
+ },
+ }]
+
+ def _real_extract(self, url):
+ webpage = self._download_webpage(url, self._match_id(url))
+ return {
+ '_type': 'url_transparent',
+ 'ie_key': 'TVC',
+ 'url': self._og_search_video_url(webpage),
+ 'title': clean_html(self._og_search_title(webpage)),
+ 'description': clean_html(self._og_search_description(webpage)),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ }
diff --git a/youtube_dl/extractor/tvigle.py b/youtube_dl/extractor/tvigle.py
index 102362b29..dc3a8334a 100644
--- a/youtube_dl/extractor/tvigle.py
+++ b/youtube_dl/extractor/tvigle.py
@@ -5,7 +5,9 @@ import re
from .common import InfoExtractor
from ..utils import (
+ ExtractorError,
float_or_none,
+ int_or_none,
parse_age_limit,
)
@@ -24,22 +26,24 @@ class TvigleIE(InfoExtractor):
'display_id': 'sokrat',
'ext': 'flv',
'title': 'Сократ',
- 'description': 'md5:a05bd01be310074d5833efc6743be95e',
+ 'description': 'md5:d6b92ffb7217b4b8ebad2e7665253c17',
'duration': 6586,
- 'age_limit': 0,
+ 'age_limit': 12,
},
+ 'skip': 'georestricted',
},
{
'url': 'http://www.tvigle.ru/video/vladimir-vysotskii/vedushchii-teleprogrammy-60-minut-ssha-o-vladimire-vysotskom/',
- 'md5': 'd9012d7c7c598fe7a11d7fb46dc1f574',
+ 'md5': 'e7efe5350dd5011d0de6550b53c3ba7b',
'info_dict': {
'id': '5142516',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 'Ведущий телепрограммы «60 минут» (США) о Владимире Высоцком',
'description': 'md5:027f7dc872948f14c96d19b4178428a4',
'duration': 186.080,
'age_limit': 0,
},
+ 'skip': 'georestricted',
}, {
'url': 'https://cloud.tvigle.ru/video/5267604/',
'only_matching': True,
@@ -54,7 +58,7 @@ class TvigleIE(InfoExtractor):
if not video_id:
webpage = self._download_webpage(url, display_id)
video_id = self._html_search_regex(
- r'<li class="video-preview current_playing" id="(\d+)">',
+ r'class="video-preview current_playing" id="(\d+)">',
webpage, 'video id')
video_data = self._download_json(
@@ -62,21 +66,34 @@ class TvigleIE(InfoExtractor):
item = video_data['playlist']['items'][0]
+ videos = item.get('videos')
+
+ error_message = item.get('errorMessage')
+ if not videos and error_message:
+ raise ExtractorError(
+ '%s returned error: %s' % (self.IE_NAME, error_message), expected=True)
+
title = item['title']
- description = item['description']
- thumbnail = item['thumbnail']
+ description = item.get('description')
+ thumbnail = item.get('thumbnail')
duration = float_or_none(item.get('durationMilliseconds'), 1000)
age_limit = parse_age_limit(item.get('ageRestrictions'))
formats = []
for vcodec, fmts in item['videos'].items():
- for quality, video_url in fmts.items():
+ for format_id, video_url in fmts.items():
+ if format_id == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ video_url, video_id, 'mp4', m3u8_id=vcodec))
+ continue
+ height = self._search_regex(
+ r'^(\d+)[pP]$', format_id, 'height', default=None)
formats.append({
'url': video_url,
- 'format_id': '%s-%s' % (vcodec, quality),
+ 'format_id': '%s-%s' % (vcodec, format_id),
'vcodec': vcodec,
- 'height': int(quality[:-1]),
- 'filesize': item['video_files_size'][vcodec][quality],
+ 'height': int_or_none(height),
+ 'filesize': int_or_none(item.get('video_files_size', {}).get(vcodec, {}).get(format_id)),
})
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py
index 9a53a3c74..b4683de54 100644
--- a/youtube_dl/extractor/tvplay.py
+++ b/youtube_dl/extractor/tvplay.py
@@ -16,6 +16,7 @@ class TVPlayIE(InfoExtractor):
_VALID_URL = r'''(?x)http://(?:www\.)?
(?:tvplay\.lv/parraides|
tv3play\.lt/programos|
+ play\.tv3\.lt/programos|
tv3play\.ee/sisu|
tv3play\.se/program|
tv6play\.se/program|
@@ -25,6 +26,7 @@ class TVPlayIE(InfoExtractor):
viasat4play\.no/programmer|
tv6play\.no/programmer|
tv3play\.dk/programmer|
+ play\.novatv\.bg/programi
)/[^/]+/(?P<id>\d+)
'''
_TESTS = [
@@ -45,7 +47,7 @@ class TVPlayIE(InfoExtractor):
},
},
{
- 'url': 'http://www.tv3play.lt/programos/moterys-meluoja-geriau/409229?autostart=true',
+ 'url': 'http://play.tv3.lt/programos/moterys-meluoja-geriau/409229?autostart=true',
'info_dict': {
'id': '409229',
'ext': 'flv',
@@ -102,6 +104,7 @@ class TVPlayIE(InfoExtractor):
'duration': 1492,
'timestamp': 1330522854,
'upload_date': '20120229',
+ 'age_limit': 18,
},
'params': {
# rtmp download
@@ -172,6 +175,22 @@ class TVPlayIE(InfoExtractor):
'skip_download': True,
},
},
+ {
+ 'url': 'http://play.novatv.bg/programi/zdravei-bulgariya/624952?autostart=true',
+ 'info_dict': {
+ 'id': '624952',
+ 'ext': 'flv',
+ 'title': 'Здравей, България (12.06.2015 г.) ',
+ 'description': 'md5:99f3700451ac5bb71a260268b8daefd7',
+ 'duration': 8838,
+ 'timestamp': 1434100372,
+ 'upload_date': '20150612',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/tweakers.py b/youtube_dl/extractor/tweakers.py
index c80ec15cf..f3198fb85 100644
--- a/youtube_dl/extractor/tweakers.py
+++ b/youtube_dl/extractor/tweakers.py
@@ -1,19 +1,13 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import (
- xpath_text,
- xpath_with_ns,
- int_or_none,
- float_or_none,
-)
class TweakersIE(InfoExtractor):
_VALID_URL = r'https?://tweakers\.net/video/(?P<id>\d+)'
_TEST = {
'url': 'https://tweakers.net/video/9926/new-nintendo-3ds-xl-op-alle-fronten-beter.html',
- 'md5': '1b5afa817403bb5baa08359dca31e6df',
+ 'md5': '3147e4ddad366f97476a93863e4557c8',
'info_dict': {
'id': '9926',
'ext': 'mp4',
@@ -25,41 +19,7 @@ class TweakersIE(InfoExtractor):
}
def _real_extract(self, url):
- video_id = self._match_id(url)
-
- playlist = self._download_xml(
- 'https://tweakers.net/video/s1playlist/%s/playlist.xspf' % video_id,
- video_id)
-
- NS_MAP = {
- 'xspf': 'http://xspf.org/ns/0/',
- 's1': 'http://static.streamone.nl/player/ns/0',
- }
-
- track = playlist.find(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP))
-
- title = xpath_text(
- track, xpath_with_ns('./xspf:title', NS_MAP), 'title')
- description = xpath_text(
- track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
- thumbnail = xpath_text(
- track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
- duration = float_or_none(
- xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'),
- 1000)
-
- formats = [{
- 'url': location.text,
- 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
- 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
- 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
- } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
-
- return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'duration': duration,
- 'formats': formats,
- }
+ playlist_id = self._match_id(url)
+ entries = self._extract_xspf_playlist(
+ 'https://tweakers.net/video/s1playlist/%s/playlist.xspf' % playlist_id, playlist_id)
+ return self.playlist_result(entries, playlist_id)
diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py
index 67e8bfea0..c1ee1decc 100644
--- a/youtube_dl/extractor/twentyfourvideo.py
+++ b/youtube_dl/extractor/twentyfourvideo.py
@@ -15,7 +15,7 @@ class TwentyFourVideoIE(InfoExtractor):
_TESTS = [
{
'url': 'http://www.24video.net/video/view/1044982',
- 'md5': '48dd7646775690a80447a8dca6a2df76',
+ 'md5': 'd041af8b5b4246ea466226a0d6693345',
'info_dict': {
'id': '1044982',
'ext': 'mp4',
@@ -54,7 +54,7 @@ class TwentyFourVideoIE(InfoExtractor):
webpage, 'upload date'))
uploader = self._html_search_regex(
- r'Загрузил\s*<a href="/jsecUser/movies/[^"]+" class="link">([^<]+)</a>',
+ r'class="video-uploaded"[^>]*>\s*<a href="/jsecUser/movies/[^"]+"[^>]*>([^<]+)</a>',
webpage, 'uploader', fatal=False)
view_count = int_or_none(self._html_search_regex(
diff --git a/youtube_dl/extractor/twentytwotracks.py b/youtube_dl/extractor/twentytwotracks.py
new file mode 100644
index 000000000..d6c0ab184
--- /dev/null
+++ b/youtube_dl/extractor/twentytwotracks.py
@@ -0,0 +1,86 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+# 22Tracks regularly replaces the audio tracks that can be streamed on their
+# site. The tracks usually expire after 1 month, so we can't add tests.
+
+
+class TwentyTwoTracksIE(InfoExtractor):
+ _VALID_URL = r'https?://22tracks\.com/(?P<city>[a-z]+)/(?P<genre>[\da-z]+)/(?P<id>\d+)'
+ IE_NAME = '22tracks:track'
+
+ _API_BASE = 'http://22tracks.com/api'
+
+ def _extract_info(self, city, genre_name, track_id=None):
+ item_id = track_id if track_id else genre_name
+
+ cities = self._download_json(
+ '%s/cities' % self._API_BASE, item_id,
+ 'Downloading cities info',
+ 'Unable to download cities info')
+ city_id = [x['id'] for x in cities if x['slug'] == city][0]
+
+ genres = self._download_json(
+ '%s/genres/%s' % (self._API_BASE, city_id), item_id,
+ 'Downloading %s genres info' % city,
+ 'Unable to download %s genres info' % city)
+ genre = [x for x in genres if x['slug'] == genre_name][0]
+ genre_id = genre['id']
+
+ tracks = self._download_json(
+ '%s/tracks/%s' % (self._API_BASE, genre_id), item_id,
+ 'Downloading %s genre tracks info' % genre_name,
+ 'Unable to download track info')
+
+ return [x for x in tracks if x['id'] == item_id][0] if track_id else [genre['title'], tracks]
+
+ def _get_track_url(self, filename, track_id):
+ token = self._download_json(
+ 'http://22tracks.com/token.php?desktop=true&u=/128/%s' % filename,
+ track_id, 'Downloading token', 'Unable to download token')
+ return 'http://audio.22tracks.com%s?st=%s&e=%d' % (token['filename'], token['st'], token['e'])
+
+ def _extract_track_info(self, track_info, track_id):
+ download_url = self._get_track_url(track_info['filename'], track_id)
+ title = '%s - %s' % (track_info['artist'].strip(), track_info['title'].strip())
+ return {
+ 'id': track_id,
+ 'url': download_url,
+ 'ext': 'mp3',
+ 'title': title,
+ 'duration': int_or_none(track_info.get('duration')),
+ 'timestamp': int_or_none(track_info.get('published_at') or track_info.get('created'))
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+
+ city = mobj.group('city')
+ genre = mobj.group('genre')
+ track_id = mobj.group('id')
+
+ track_info = self._extract_info(city, genre, track_id)
+ return self._extract_track_info(track_info, track_id)
+
+
+class TwentyTwoTracksGenreIE(TwentyTwoTracksIE):
+ _VALID_URL = r'https?://22tracks\.com/(?P<city>[a-z]+)/(?P<genre>[\da-z]+)/?$'
+ IE_NAME = '22tracks:genre'
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+
+ city = mobj.group('city')
+ genre = mobj.group('genre')
+
+ genre_title, tracks = self._extract_info(city, genre)
+
+ entries = [
+ self._extract_track_info(track_info, track_info['id'])
+ for track_info in tracks]
+
+ return self.playlist_result(entries, genre, genre_title)
diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py
index 87290d002..69882da63 100644
--- a/youtube_dl/extractor/twitch.py
+++ b/youtube_dl/extractor/twitch.py
@@ -7,13 +7,19 @@ import random
from .common import InfoExtractor
from ..compat import (
+ compat_parse_qs,
compat_str,
compat_urllib_parse,
- compat_urllib_request,
+ compat_urllib_parse_urlparse,
+ compat_urlparse,
)
from ..utils import (
+ encode_dict,
ExtractorError,
+ int_or_none,
+ parse_duration,
parse_iso8601,
+ sanitized_Request,
)
@@ -22,7 +28,8 @@ class TwitchBaseIE(InfoExtractor):
_API_BASE = 'https://api.twitch.tv'
_USHER_BASE = 'http://usher.twitch.tv'
- _LOGIN_URL = 'https://secure.twitch.tv/user/login'
+ _LOGIN_URL = 'http://www.twitch.tv/login'
+ _NETRC_MACHINE = 'twitch'
def _handle_error(self, response):
if not isinstance(response, dict):
@@ -34,7 +41,15 @@ class TwitchBaseIE(InfoExtractor):
expected=True)
def _download_json(self, url, video_id, note='Downloading JSON metadata'):
- response = super(TwitchBaseIE, self)._download_json(url, video_id, note)
+ headers = {
+ 'Referer': 'http://api.twitch.tv/crossdomain/receiver.html?v=2',
+ 'X-Requested-With': 'XMLHttpRequest',
+ }
+ for cookie in self._downloader.cookiejar:
+ if cookie.name == 'api_token':
+ headers['Twitch-Api-Token'] = cookie.value
+ request = sanitized_Request(url, headers=headers)
+ response = super(TwitchBaseIE, self)._download_json(request, video_id, note)
self._handle_error(response)
return response
@@ -46,35 +61,48 @@ class TwitchBaseIE(InfoExtractor):
if username is None:
return
- login_page = self._download_webpage(
+ login_page, handle = self._download_webpage_handle(
self._LOGIN_URL, None, 'Downloading login page')
- authenticity_token = self._search_regex(
- r'<input name="authenticity_token" type="hidden" value="([^"]+)"',
- login_page, 'authenticity token')
-
- login_form = {
- 'utf8': '✓'.encode('utf-8'),
- 'authenticity_token': authenticity_token,
- 'redirect_on_login': '',
- 'embed_form': 'false',
- 'mp_source_action': '',
- 'follow': '',
- 'user[login]': username,
- 'user[password]': password,
- }
+ login_form = self._hidden_inputs(login_page)
+
+ login_form.update({
+ 'username': username,
+ 'password': password,
+ })
+
+ redirect_url = handle.geturl()
+
+ post_url = self._search_regex(
+ r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
+ 'post url', default=redirect_url, group='url')
- request = compat_urllib_request.Request(
- self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
- request.add_header('Referer', self._LOGIN_URL)
+ if not post_url.startswith('http'):
+ post_url = compat_urlparse.urljoin(redirect_url, post_url)
+
+ request = sanitized_Request(
+ post_url, compat_urllib_parse.urlencode(encode_dict(login_form)).encode('utf-8'))
+ request.add_header('Referer', redirect_url)
response = self._download_webpage(
request, None, 'Logging in as %s' % username)
- m = re.search(
- r"id=([\"'])login_error_message\1[^>]*>(?P<msg>[^<]+)", response)
- if m:
+ error_message = self._search_regex(
+ r'<div[^>]+class="subwindow_notice"[^>]*>([^<]+)</div>',
+ response, 'error message', default=None)
+ if error_message:
raise ExtractorError(
- 'Unable to login: %s' % m.group('msg').strip(), expected=True)
+ 'Unable to login. Twitch said: %s' % error_message, expected=True)
+
+ if '>Reset your password<' in response:
+ self.report_warning('Twitch asks you to reset your password, go to https://secure.twitch.tv/reset/submit')
+
+ def _prefer_source(self, formats):
+ try:
+ source = next(f for f in formats if f['format_id'] == 'Source')
+ source['preference'] = 10
+ except StopIteration:
+ pass # No Source stream present
+ self._sort_formats(formats)
class TwitchItemBaseIE(TwitchBaseIE):
@@ -115,14 +143,14 @@ class TwitchItemBaseIE(TwitchBaseIE):
def _extract_info(self, info):
return {
'id': info['_id'],
- 'title': info['title'],
- 'description': info['description'],
- 'duration': info['length'],
- 'thumbnail': info['preview'],
- 'uploader': info['channel']['display_name'],
- 'uploader_id': info['channel']['name'],
- 'timestamp': parse_iso8601(info['recorded_at']),
- 'view_count': info['views'],
+ 'title': info.get('title') or 'Untitled Broadcast',
+ 'description': info.get('description'),
+ 'duration': int_or_none(info.get('length')),
+ 'thumbnail': info.get('preview'),
+ 'uploader': info.get('channel', {}).get('display_name'),
+ 'uploader_id': info.get('channel', {}).get('name'),
+ 'timestamp': parse_iso8601(info.get('recorded_at')),
+ 'view_count': int_or_none(info.get('views')),
}
def _real_extract(self, url):
@@ -131,7 +159,7 @@ class TwitchItemBaseIE(TwitchBaseIE):
class TwitchVideoIE(TwitchItemBaseIE):
IE_NAME = 'twitch:video'
- _VALID_URL = r'%s/[^/]+/b/(?P<id>[^/]+)' % TwitchBaseIE._VALID_URL_BASE
+ _VALID_URL = r'%s/[^/]+/b/(?P<id>\d+)' % TwitchBaseIE._VALID_URL_BASE
_ITEM_TYPE = 'video'
_ITEM_SHORTCUT = 'a'
@@ -147,7 +175,7 @@ class TwitchVideoIE(TwitchItemBaseIE):
class TwitchChapterIE(TwitchItemBaseIE):
IE_NAME = 'twitch:chapter'
- _VALID_URL = r'%s/[^/]+/c/(?P<id>[^/]+)' % TwitchBaseIE._VALID_URL_BASE
+ _VALID_URL = r'%s/[^/]+/c/(?P<id>\d+)' % TwitchBaseIE._VALID_URL_BASE
_ITEM_TYPE = 'chapter'
_ITEM_SHORTCUT = 'c'
@@ -166,41 +194,78 @@ class TwitchChapterIE(TwitchItemBaseIE):
class TwitchVodIE(TwitchItemBaseIE):
IE_NAME = 'twitch:vod'
- _VALID_URL = r'%s/[^/]+/v/(?P<id>[^/]+)' % TwitchBaseIE._VALID_URL_BASE
+ _VALID_URL = r'%s/[^/]+/v/(?P<id>\d+)' % TwitchBaseIE._VALID_URL_BASE
_ITEM_TYPE = 'vod'
_ITEM_SHORTCUT = 'v'
- _TEST = {
- 'url': 'http://www.twitch.tv/ksptv/v/3622000',
+ _TESTS = [{
+ 'url': 'http://www.twitch.tv/riotgames/v/6528877?t=5m10s',
'info_dict': {
- 'id': 'v3622000',
+ 'id': 'v6528877',
'ext': 'mp4',
- 'title': '''KSPTV: Squadcast: "Everyone's on vacation so here's Dahud" Edition!''',
+ 'title': 'LCK Summer Split - Week 6 Day 1',
'thumbnail': 're:^https?://.*\.jpg$',
- 'duration': 6951,
- 'timestamp': 1419028564,
- 'upload_date': '20141219',
- 'uploader': 'KSPTV',
- 'uploader_id': 'ksptv',
+ 'duration': 17208,
+ 'timestamp': 1435131709,
+ 'upload_date': '20150624',
+ 'uploader': 'Riot Games',
+ 'uploader_id': 'riotgames',
'view_count': int,
+ 'start_time': 310,
},
'params': {
# m3u8 download
'skip_download': True,
},
- }
+ }, {
+ # Untitled broadcast (title is None)
+ 'url': 'http://www.twitch.tv/belkao_o/v/11230755',
+ 'info_dict': {
+ 'id': 'v11230755',
+ 'ext': 'mp4',
+ 'title': 'Untitled Broadcast',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 1638,
+ 'timestamp': 1439746708,
+ 'upload_date': '20150816',
+ 'uploader': 'BelkAO_o',
+ 'uploader_id': 'belkao_o',
+ 'view_count': int,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }]
def _real_extract(self, url):
item_id = self._match_id(url)
+
info = self._download_info(self._ITEM_SHORTCUT, item_id)
access_token = self._download_json(
'%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id,
'Downloading %s access token' % self._ITEM_TYPE)
+
formats = self._extract_m3u8_formats(
- '%s/vod/%s?nauth=%s&nauthsig=%s'
- % (self._USHER_BASE, item_id, access_token['token'], access_token['sig']),
+ '%s/vod/%s?%s' % (
+ self._USHER_BASE, item_id,
+ compat_urllib_parse.urlencode({
+ 'allow_source': 'true',
+ 'allow_spectre': 'true',
+ 'player': 'twitchweb',
+ 'nauth': access_token['token'],
+ 'nauthsig': access_token['sig'],
+ })),
item_id, 'mp4')
+
+ self._prefer_source(formats)
info['formats'] = formats
+
+ parsed_url = compat_urllib_parse_urlparse(url)
+ query = compat_parse_qs(parsed_url.query)
+ if 't' in query:
+ info['start_time'] = parse_duration(query['t'][0])
+
return info
@@ -295,9 +360,9 @@ class TwitchBookmarksIE(TwitchPlaylistBaseIE):
class TwitchStreamIE(TwitchBaseIE):
IE_NAME = 'twitch:stream'
- _VALID_URL = r'%s/(?P<id>[^/]+)/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE
+ _VALID_URL = r'%s/(?P<id>[^/#?]+)/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE
- _TEST = {
+ _TESTS = [{
'url': 'http://www.twitch.tv/shroomztv',
'info_dict': {
'id': '12772022048',
@@ -316,7 +381,10 @@ class TwitchStreamIE(TwitchBaseIE):
# m3u8 download
'skip_download': True,
},
- }
+ }, {
+ 'url': 'http://www.twitch.tv/miracle_doto#profile-0',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
channel_id = self._match_id(url)
@@ -331,6 +399,12 @@ class TwitchStreamIE(TwitchBaseIE):
'http://www.twitch.tv/%s/profile' % channel_id,
'TwitchProfile', channel_id)
+ # Channel name may be typed in a different case than the original channel name
+ # (e.g. http://www.twitch.tv/TWITCHPLAYSPOKEMON), which would lead to constructing
+ # an invalid m3u8 URL. Work around this by using the original channel name from the
+ # stream JSON, falling back to lowercase if it's not available.
+ channel_id = stream.get('channel', {}).get('name') or channel_id.lower()
+
access_token = self._download_json(
'%s/api/channels/%s/access_token' % (self._API_BASE, channel_id), channel_id,
'Downloading channel access token')
@@ -340,14 +414,14 @@ class TwitchStreamIE(TwitchBaseIE):
'p': random.randint(1000000, 10000000),
'player': 'twitchweb',
'segment_preference': '4',
- 'sig': access_token['sig'],
- 'token': access_token['token'],
+ 'sig': access_token['sig'].encode('utf-8'),
+ 'token': access_token['token'].encode('utf-8'),
}
-
formats = self._extract_m3u8_formats(
'%s/api/channel/hls/%s.m3u8?%s'
- % (self._USHER_BASE, channel_id, compat_urllib_parse.urlencode(query).encode('utf-8')),
+ % (self._USHER_BASE, channel_id, compat_urllib_parse.urlencode(query)),
channel_id, 'mp4')
+ self._prefer_source(formats)
view_count = stream.get('viewers')
timestamp = parse_iso8601(stream.get('created_at'))
diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py
new file mode 100644
index 000000000..a161f046b
--- /dev/null
+++ b/youtube_dl/extractor/twitter.py
@@ -0,0 +1,228 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ float_or_none,
+ xpath_text,
+ remove_end,
+ int_or_none,
+ ExtractorError,
+ sanitized_Request,
+)
+
+
+class TwitterCardIE(InfoExtractor):
+ IE_NAME = 'twitter:card'
+ _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/cards/tfw/v1/(?P<id>\d+)'
+ _TESTS = [
+ {
+ 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889',
+ 'md5': '4fa26a35f9d1bf4b646590ba8e84be19',
+ 'info_dict': {
+ 'id': '560070183650213889',
+ 'ext': 'mp4',
+ 'title': 'TwitterCard',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 30.033,
+ }
+ },
+ {
+ 'url': 'https://twitter.com/i/cards/tfw/v1/623160978427936768',
+ 'md5': '7ee2a553b63d1bccba97fbed97d9e1c8',
+ 'info_dict': {
+ 'id': '623160978427936768',
+ 'ext': 'mp4',
+ 'title': 'TwitterCard',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 80.155,
+ },
+ },
+ {
+ 'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977',
+ 'md5': 'b6f35e8b08a0bec6c8af77a2f4b3a814',
+ 'info_dict': {
+ 'id': 'dq4Oj5quskI',
+ 'ext': 'mp4',
+ 'title': 'Ubuntu 11.10 Overview',
+ 'description': 'Take a quick peek at what\'s new and improved in Ubuntu 11.10.\n\nOnce installed take a look at 10 Things to Do After Installing: http://www.omgubuntu.co.uk/2011/10/10-things-to-do-after-installing-ubuntu-11-10/',
+ 'upload_date': '20111013',
+ 'uploader': 'OMG! Ubuntu!',
+ 'uploader_id': 'omgubuntu',
+ },
+ 'add_ie': ['Youtube'],
+ },
+ {
+ 'url': 'https://twitter.com/i/cards/tfw/v1/665289828897005568',
+ 'md5': 'ab2745d0b0ce53319a534fccaa986439',
+ 'info_dict': {
+ 'id': 'iBb2x00UVlv',
+ 'ext': 'mp4',
+ 'upload_date': '20151113',
+ 'uploader_id': '1189339351084113920',
+ 'uploader': '@ArsenalTerje',
+ 'title': 'Vine by @ArsenalTerje',
+ },
+ 'add_ie': ['Vine'],
+ }
+ ]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ # Different formats served for different User-Agents
+ USER_AGENTS = [
+ 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)', # mp4
+ 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0', # webm
+ ]
+
+ config = None
+ formats = []
+ for user_agent in USER_AGENTS:
+ request = sanitized_Request(url)
+ request.add_header('User-Agent', user_agent)
+ webpage = self._download_webpage(request, video_id)
+
+ iframe_url = self._html_search_regex(
+ r'<iframe[^>]+src="((?:https?:)?//(?:www.youtube.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"',
+ webpage, 'video iframe', default=None)
+ if iframe_url:
+ return self.url_result(iframe_url)
+
+ config = self._parse_json(self._html_search_regex(
+ r'data-player-config="([^"]+)"', webpage, 'data player config'),
+ video_id)
+ if 'playlist' not in config:
+ if 'vmapUrl' in config:
+ vmap_data = self._download_xml(config['vmapUrl'], video_id)
+ video_url = xpath_text(vmap_data, './/MediaFile').strip()
+ formats.append({
+ 'url': video_url,
+ })
+ break # same video regardless of UA
+ continue
+
+ video_url = config['playlist'][0]['source']
+
+ f = {
+ 'url': video_url,
+ }
+
+ m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url)
+ if m:
+ f.update({
+ 'width': int(m.group('width')),
+ 'height': int(m.group('height')),
+ })
+ formats.append(f)
+ self._sort_formats(formats)
+
+ thumbnail = config.get('posterImageUrl')
+ duration = float_or_none(config.get('duration'))
+
+ return {
+ 'id': video_id,
+ 'title': 'TwitterCard',
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'formats': formats,
+ }
+
+
+class TwitterIE(InfoExtractor):
+ IE_NAME = 'twitter'
+ _VALID_URL = r'https?://(?:www\.|m\.|mobile\.)?twitter\.com/(?P<user_id>[^/]+)/status/(?P<id>\d+)'
+ _TEMPLATE_URL = 'https://twitter.com/%s/status/%s'
+
+ _TESTS = [{
+ 'url': 'https://twitter.com/freethenipple/status/643211948184596480',
+ 'md5': 'db6612ec5d03355953c3ca9250c97e5e',
+ 'info_dict': {
+ 'id': '643211948184596480',
+ 'ext': 'mp4',
+ 'title': 'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 12.922,
+ 'description': 'FREE THE NIPPLE on Twitter: "FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ"',
+ 'uploader': 'FREE THE NIPPLE',
+ 'uploader_id': 'freethenipple',
+ },
+ }, {
+ 'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1',
+ 'md5': 'f36dcd5fb92bf7057f155e7d927eeb42',
+ 'info_dict': {
+ 'id': '657991469417025536',
+ 'ext': 'mp4',
+ 'title': 'Gifs - tu vai cai tu vai cai tu nao eh capaz disso tu vai cai',
+ 'description': 'Gifs on Twitter: "tu vai cai tu vai cai tu nao eh capaz disso tu vai cai https://t.co/tM46VHFlO5"',
+ 'thumbnail': 're:^https?://.*\.png',
+ 'uploader': 'Gifs',
+ 'uploader_id': 'giphz',
+ },
+ }, {
+ 'url': 'https://twitter.com/starwars/status/665052190608723968',
+ 'md5': '39b7199856dee6cd4432e72c74bc69d4',
+ 'info_dict': {
+ 'id': '665052190608723968',
+ 'ext': 'mp4',
+ 'title': 'Star Wars - A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens.',
+ 'description': 'Star Wars on Twitter: "A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens."',
+ 'uploader_id': 'starwars',
+ 'uploader': 'Star Wars',
+ },
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ user_id = mobj.group('user_id')
+ twid = mobj.group('id')
+
+ webpage = self._download_webpage(self._TEMPLATE_URL % (user_id, twid), twid)
+
+ username = remove_end(self._og_search_title(webpage), ' on Twitter')
+
+ title = description = self._og_search_description(webpage).strip('').replace('\n', ' ').strip('“”')
+
+ # strip 'https -_t.co_BJYgOjSeGA' junk from filenames
+ title = re.sub(r'\s+(https?://[^ ]+)', '', title)
+
+ info = {
+ 'uploader_id': user_id,
+ 'uploader': username,
+ 'webpage_url': url,
+ 'description': '%s on Twitter: "%s"' % (username, description),
+ 'title': username + ' - ' + title,
+ }
+
+ card_id = self._search_regex(
+ r'["\']/i/cards/tfw/v1/(\d+)', webpage, 'twitter card url', default=None)
+ if card_id:
+ card_url = 'https://twitter.com/i/cards/tfw/v1/' + card_id
+ info.update({
+ '_type': 'url_transparent',
+ 'ie_key': 'TwitterCard',
+ 'url': card_url,
+ })
+ return info
+
+ mobj = re.search(r'''(?x)
+ <video[^>]+class="animated-gif"[^>]+
+ (?:data-height="(?P<height>\d+)")?[^>]+
+ (?:data-width="(?P<width>\d+)")?[^>]+
+ (?:poster="(?P<poster>[^"]+)")?[^>]*>\s*
+ <source[^>]+video-src="(?P<url>[^"]+)"
+ ''', webpage)
+
+ if mobj:
+ info.update({
+ 'id': twid,
+ 'url': mobj.group('url'),
+ 'height': int_or_none(mobj.group('height')),
+ 'width': int_or_none(mobj.group('width')),
+ 'thumbnail': mobj.group('poster'),
+ })
+ return info
+
+ raise ExtractorError('There\'s not video in this tweet.')
diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py
index 4667ed83b..825172806 100644
--- a/youtube_dl/extractor/udemy.py
+++ b/youtube_dl/extractor/udemy.py
@@ -9,13 +9,15 @@ from ..compat import (
)
from ..utils import (
ExtractorError,
+ sanitized_Request,
)
class UdemyIE(InfoExtractor):
IE_NAME = 'udemy'
_VALID_URL = r'https?://www\.udemy\.com/(?:[^#]+#/lecture/|lecture/view/?\?lectureId=)(?P<id>\d+)'
- _LOGIN_URL = 'https://www.udemy.com/join/login-submit/'
+ _LOGIN_URL = 'https://www.udemy.com/join/login-popup/?displayType=ajax&showSkipButton=1'
+ _ORIGIN_URL = 'https://www.udemy.com'
_NETRC_MACHINE = 'udemy'
_TESTS = [{
@@ -57,7 +59,7 @@ class UdemyIE(InfoExtractor):
for header, value in headers.items():
url_or_request.add_header(header, value)
else:
- url_or_request = compat_urllib_request.Request(url_or_request, headers=headers)
+ url_or_request = sanitized_Request(url_or_request, headers=headers)
response = super(UdemyIE, self)._download_json(url_or_request, video_id, note)
self._handle_error(response)
@@ -69,34 +71,39 @@ class UdemyIE(InfoExtractor):
def _login(self):
(username, password) = self._get_login_info()
if username is None:
- raise ExtractorError(
- 'Udemy account is required, use --username and --password options to provide account credentials.',
- expected=True)
+ self.raise_login_required('Udemy account is required')
login_popup = self._download_webpage(
- 'https://www.udemy.com/join/login-popup?displayType=ajax&showSkipButton=1', None,
- 'Downloading login popup')
+ self._LOGIN_URL, None, 'Downloading login popup')
+
+ def is_logged(webpage):
+ return any(p in webpage for p in ['href="https://www.udemy.com/user/logout/', '>Logout<'])
- if login_popup == '<div class="run-command close-popup redirect" data-url="https://www.udemy.com/"></div>':
+ # already logged in
+ if is_logged(login_popup):
return
- csrf = self._html_search_regex(
- r'<input type="hidden" name="csrf" value="(.+?)"',
- login_popup, 'csrf token')
+ login_form = self._form_hidden_inputs('login-form', login_popup)
- login_form = {
- 'email': username,
- 'password': password,
- 'csrf': csrf,
- 'displayType': 'json',
- 'isSubmitted': '1',
- }
- request = compat_urllib_request.Request(
+ login_form.update({
+ 'email': username.encode('utf-8'),
+ 'password': password.encode('utf-8'),
+ })
+
+ request = sanitized_Request(
self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
- response = self._download_json(
+ request.add_header('Referer', self._ORIGIN_URL)
+ request.add_header('Origin', self._ORIGIN_URL)
+
+ response = self._download_webpage(
request, None, 'Logging in as %s' % username)
- if 'returnUrl' not in response:
+ if not is_logged(response):
+ error = self._html_search_regex(
+ r'(?s)<div[^>]+class="form-errors[^"]*">(.+?)</div>',
+ response, 'error message', default=None)
+ if error:
+ raise ExtractorError('Unable to login: %s' % error, expected=True)
raise ExtractorError('Unable to log in')
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/udn.py b/youtube_dl/extractor/udn.py
new file mode 100644
index 000000000..ee35b7227
--- /dev/null
+++ b/youtube_dl/extractor/udn.py
@@ -0,0 +1,76 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+from .common import InfoExtractor
+from ..utils import (
+ js_to_json,
+ ExtractorError,
+)
+from ..compat import compat_urlparse
+
+
+class UDNEmbedIE(InfoExtractor):
+ IE_DESC = '聯合影音'
+ _PROTOCOL_RELATIVE_VALID_URL = r'//video\.udn\.com/(?:embed|play)/news/(?P<id>\d+)'
+ _VALID_URL = r'https?:' + _PROTOCOL_RELATIVE_VALID_URL
+ _TESTS = [{
+ 'url': 'http://video.udn.com/embed/news/300040',
+ 'md5': 'de06b4c90b042c128395a88f0384817e',
+ 'info_dict': {
+ 'id': '300040',
+ 'ext': 'mp4',
+ 'title': '生物老師男變女 全校挺"做自己"',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ }
+ }, {
+ 'url': 'https://video.udn.com/embed/news/300040',
+ 'only_matching': True,
+ }, {
+ # From https://video.udn.com/news/303776
+ 'url': 'https://video.udn.com/play/news/303776',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ page = self._download_webpage(url, video_id)
+
+ options = json.loads(js_to_json(self._html_search_regex(
+ r'var options\s*=\s*([^;]+);', page, 'video urls dictionary')))
+
+ video_urls = options['video']
+
+ if video_urls.get('youtube'):
+ return self.url_result(video_urls.get('youtube'), 'Youtube')
+
+ try:
+ del video_urls['youtube']
+ except KeyError:
+ pass
+
+ formats = [{
+ 'url': self._download_webpage(
+ compat_urlparse.urljoin(url, api_url), video_id,
+ 'retrieve url for %s video' % video_type),
+ 'format_id': video_type,
+ 'preference': 0 if video_type == 'mp4' else -1,
+ } for video_type, api_url in video_urls.items() if api_url]
+
+ if not formats:
+ raise ExtractorError('No videos found', expected=True)
+
+ self._sort_formats(formats)
+
+ thumbnail = None
+
+ if options.get('gallery') and len(options['gallery']):
+ thumbnail = options['gallery'][0].get('original')
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'title': options['title'],
+ 'thumbnail': thumbnail
+ }
diff --git a/youtube_dl/extractor/ultimedia.py b/youtube_dl/extractor/ultimedia.py
new file mode 100644
index 000000000..c4751050e
--- /dev/null
+++ b/youtube_dl/extractor/ultimedia.py
@@ -0,0 +1,105 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_urlparse
+from ..utils import (
+ ExtractorError,
+ qualities,
+ unified_strdate,
+ clean_html,
+)
+
+
+class UltimediaIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?ultimedia\.com/default/index/video[^/]+/id/(?P<id>[\d+a-z]+)'
+ _TESTS = [{
+ # news
+ 'url': 'https://www.ultimedia.com/default/index/videogeneric/id/s8uk0r',
+ 'md5': '276a0e49de58c7e85d32b057837952a2',
+ 'info_dict': {
+ 'id': 's8uk0r',
+ 'ext': 'mp4',
+ 'title': 'Loi sur la fin de vie: le texte prévoit un renforcement des directives anticipées',
+ 'description': 'md5:3e5c8fd65791487333dda5db8aed32af',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'upload_date': '20150317',
+ },
+ }, {
+ # music
+ 'url': 'https://www.ultimedia.com/default/index/videomusic/id/xvpfp8',
+ 'md5': '2ea3513813cf230605c7e2ffe7eca61c',
+ 'info_dict': {
+ 'id': 'xvpfp8',
+ 'ext': 'mp4',
+ 'title': "Two - C'est la vie (Clip)",
+ 'description': 'Two',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'upload_date': '20150224',
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ deliver_url = self._proto_relative_url(self._search_regex(
+ r'<iframe[^>]+src="((?:https?:)?//(?:www\.)?ultimedia\.com/deliver/[^"]+)"',
+ webpage, 'deliver URL'), compat_urllib_parse_urlparse(url).scheme + ':')
+
+ deliver_page = self._download_webpage(
+ deliver_url, video_id, 'Downloading iframe page')
+
+ if '>This video is currently not available' in deliver_page:
+ raise ExtractorError(
+ 'Video %s is currently not available' % video_id, expected=True)
+
+ player = self._parse_json(
+ self._search_regex(
+ r"jwplayer\('player(?:_temp)?'\)\.setup\(({.+?})\)\.on",
+ deliver_page, 'player'),
+ video_id)
+
+ quality = qualities(['flash', 'html5'])
+ formats = []
+ for mode in player['modes']:
+ video_url = mode.get('config', {}).get('file')
+ if not video_url:
+ continue
+ if re.match(r'https?://www\.youtube\.com/.+?', video_url):
+ return self.url_result(video_url, 'Youtube')
+ formats.append({
+ 'url': video_url,
+ 'format_id': mode.get('type'),
+ 'quality': quality(mode.get('type')),
+ })
+ self._sort_formats(formats)
+
+ thumbnail = player.get('image')
+
+ title = clean_html((
+ self._html_search_regex(
+ r'(?s)<div\s+id="catArticle">.+?</div>(.+?)</h1>',
+ webpage, 'title', default=None) or
+ self._search_regex(
+ r"var\s+nameVideo\s*=\s*'([^']+)'",
+ deliver_page, 'title')))
+
+ description = clean_html(self._html_search_regex(
+ r'(?s)<span>Description</span>(.+?)</p>', webpage,
+ 'description', fatal=False))
+
+ upload_date = unified_strdate(self._search_regex(
+ r'Ajouté le\s*<span>([^<]+)', webpage,
+ 'upload date', fatal=False))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py
index 68d03b999..73b05ecab 100644
--- a/youtube_dl/extractor/ustream.py
+++ b/youtube_dl/extractor/ustream.py
@@ -6,60 +6,105 @@ from .common import InfoExtractor
from ..compat import (
compat_urlparse,
)
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ float_or_none,
+)
class UstreamIE(InfoExtractor):
- _VALID_URL = r'https?://www\.ustream\.tv/(?P<type>recorded|embed|embed/recorded)/(?P<videoID>\d+)'
+ _VALID_URL = r'https?://www\.ustream\.tv/(?P<type>recorded|embed|embed/recorded)/(?P<id>\d+)'
IE_NAME = 'ustream'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.ustream.tv/recorded/20274954',
'md5': '088f151799e8f572f84eb62f17d73e5c',
'info_dict': {
'id': '20274954',
'ext': 'flv',
- 'uploader': 'Young Americans for Liberty',
'title': 'Young Americans for Liberty February 7, 2012 2:28 AM',
+ 'description': 'Young Americans for Liberty February 7, 2012 2:28 AM',
+ 'timestamp': 1328577035,
+ 'upload_date': '20120207',
+ 'uploader': 'yaliberty',
+ 'uploader_id': '6780869',
},
- }
+ }, {
+ # From http://sportscanada.tv/canadagames/index.php/week2/figure-skating/444
+ # Title and uploader available only from params JSON
+ 'url': 'http://www.ustream.tv/embed/recorded/59307601?ub=ff0000&lc=ff0000&oc=ffffff&uc=ffffff&v=3&wmode=direct',
+ 'md5': '5a2abf40babeac9812ed20ae12d34e10',
+ 'info_dict': {
+ 'id': '59307601',
+ 'ext': 'flv',
+ 'title': '-CG11- Canada Games Figure Skating',
+ 'uploader': 'sportscanadatv',
+ },
+ 'skip': 'This Pro Broadcaster has chosen to remove this video from the ustream.tv site.',
+ }]
def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
- video_id = m.group('videoID')
+ video_id = m.group('id')
# some sites use this embed format (see: http://github.com/rg3/youtube-dl/issues/2990)
if m.group('type') == 'embed/recorded':
- video_id = m.group('videoID')
+ video_id = m.group('id')
desktop_url = 'http://www.ustream.tv/recorded/' + video_id
return self.url_result(desktop_url, 'Ustream')
if m.group('type') == 'embed':
- video_id = m.group('videoID')
+ video_id = m.group('id')
webpage = self._download_webpage(url, video_id)
desktop_video_id = self._html_search_regex(
r'ContentVideoIds=\["([^"]*?)"\]', webpage, 'desktop_video_id')
desktop_url = 'http://www.ustream.tv/recorded/' + desktop_video_id
return self.url_result(desktop_url, 'Ustream')
- video_url = 'http://tcdn.ustream.tv/video/%s' % video_id
- webpage = self._download_webpage(url, video_id)
+ params = self._download_json(
+ 'https://api.ustream.tv/videos/%s.json' % video_id, video_id)
- self.report_extraction(video_id)
+ error = params.get('error')
+ if error:
+ raise ExtractorError(
+ '%s returned error: %s' % (self.IE_NAME, error), expected=True)
- video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
- webpage, 'title')
+ video = params['video']
- uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
- webpage, 'uploader', fatal=False, flags=re.DOTALL)
+ title = video['title']
+ filesize = float_or_none(video.get('file_size'))
- thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
- webpage, 'thumbnail', fatal=False)
+ formats = [{
+ 'id': video_id,
+ 'url': video_url,
+ 'ext': format_id,
+ 'filesize': filesize,
+ } for format_id, video_url in video['media_urls'].items()]
+ self._sort_formats(formats)
+
+ description = video.get('description')
+ timestamp = int_or_none(video.get('created_at'))
+ duration = float_or_none(video.get('length'))
+ view_count = int_or_none(video.get('views'))
+
+ uploader = video.get('owner', {}).get('username')
+ uploader_id = video.get('owner', {}).get('id')
+
+ thumbnails = [{
+ 'id': thumbnail_id,
+ 'url': thumbnail_url,
+ } for thumbnail_id, thumbnail_url in video.get('thumbnail', {}).items()]
return {
'id': video_id,
- 'url': video_url,
- 'ext': 'flv',
- 'title': video_title,
+ 'title': title,
+ 'description': description,
+ 'thumbnails': thumbnails,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ 'view_count': view_count,
'uploader': uploader,
- 'thumbnail': thumbnail,
+ 'uploader_id': uploader_id,
+ 'formats': formats,
}
diff --git a/youtube_dl/extractor/varzesh3.py b/youtube_dl/extractor/varzesh3.py
new file mode 100644
index 000000000..9369abaf8
--- /dev/null
+++ b/youtube_dl/extractor/varzesh3.py
@@ -0,0 +1,45 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class Varzesh3IE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?video\.varzesh3\.com/(?:[^/]+/)+(?P<id>[^/]+)/?'
+ _TEST = {
+ 'url': 'http://video.varzesh3.com/germany/bundesliga/5-%D9%88%D8%A7%DA%A9%D9%86%D8%B4-%D8%A8%D8%B1%D8%AA%D8%B1-%D8%AF%D8%B1%D9%88%D8%A7%D8%B2%D9%87%E2%80%8C%D8%A8%D8%A7%D9%86%D8%A7%D9%86%D8%9B%D9%87%D9%81%D8%AA%D9%87-26-%D8%A8%D9%88%D9%86%D8%AF%D8%B3/',
+ 'md5': '2a933874cb7dce4366075281eb49e855',
+ 'info_dict': {
+ 'id': '76337',
+ 'ext': 'mp4',
+ 'title': '۵ واکنش برتر دروازه‌بانان؛هفته ۲۶ بوندسلیگا',
+ 'description': 'فصل ۲۰۱۵-۲۰۱۴',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ }
+ }
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+
+ video_url = self._search_regex(
+ r'<source[^>]+src="([^"]+)"', webpage, 'video url')
+
+ title = self._og_search_title(webpage)
+ description = self._html_search_regex(
+ r'(?s)<div class="matn">(.+?)</div>',
+ webpage, 'description', fatal=False)
+ thumbnail = self._og_search_thumbnail(webpage)
+
+ video_id = self._search_regex(
+ r"<link[^>]+rel='(?:canonical|shortlink)'[^>]+href='/\?p=([^']+)'",
+ webpage, display_id, default=display_id)
+
+ return {
+ 'url': video_url,
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ }
diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py
index dd026748d..1e740fbe6 100644
--- a/youtube_dl/extractor/vbox7.py
+++ b/youtube_dl/extractor/vbox7.py
@@ -4,10 +4,11 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import (
compat_urllib_parse,
- compat_urllib_request,
+ compat_urlparse,
)
from ..utils import (
ExtractorError,
+ sanitized_Request,
)
@@ -26,11 +27,21 @@ class Vbox7IE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- redirect_page, urlh = self._download_webpage_handle(url, video_id)
- new_location = self._search_regex(r'window\.location = \'(.*)\';',
- redirect_page, 'redirect location')
- redirect_url = urlh.geturl() + new_location
- webpage = self._download_webpage(redirect_url, video_id,
+ # need to get the page 3 times for the correct jsSecretToken cookie
+ # which is necessary for the correct title
+ def get_session_id():
+ redirect_page = self._download_webpage(url, video_id)
+ session_id_url = self._search_regex(
+ r'var\s*url\s*=\s*\'([^\']+)\';', redirect_page,
+ 'session id url')
+ self._download_webpage(
+ compat_urlparse.urljoin(url, session_id_url), video_id,
+ 'Getting session id')
+
+ get_session_id()
+ get_session_id()
+
+ webpage = self._download_webpage(url, video_id,
'Downloading redirect page')
title = self._html_search_regex(r'<title>(.*)</title>',
@@ -38,7 +49,7 @@ class Vbox7IE(InfoExtractor):
info_url = "http://vbox7.com/play/magare.do"
data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
- info_request = compat_urllib_request.Request(info_url, data)
+ info_request = sanitized_Request(info_url, data)
info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
info_response = self._download_webpage(info_request, video_id, 'Downloading info webpage')
if info_response is None:
diff --git a/youtube_dl/extractor/veehd.py b/youtube_dl/extractor/veehd.py
index 96353f525..0d8d832cc 100644
--- a/youtube_dl/extractor/veehd.py
+++ b/youtube_dl/extractor/veehd.py
@@ -5,6 +5,7 @@ import json
from .common import InfoExtractor
from ..compat import (
+ compat_urllib_parse_unquote,
compat_urlparse,
)
from ..utils import (
@@ -17,7 +18,9 @@ from ..utils import (
class VeeHDIE(InfoExtractor):
_VALID_URL = r'https?://veehd\.com/video/(?P<id>\d+)'
- _TEST = {
+ # Seems VeeHD videos have multiple copies on several servers, all of
+ # whom have different MD5 checksums, so omit md5 field in all tests
+ _TESTS = [{
'url': 'http://veehd.com/video/4639434_Solar-Sinter',
'info_dict': {
'id': '4639434',
@@ -26,7 +29,26 @@ class VeeHDIE(InfoExtractor):
'uploader_id': 'VideoEyes',
'description': 'md5:46a840e8692ddbaffb5f81d9885cb457',
},
- }
+ 'skip': 'Video deleted',
+ }, {
+ 'url': 'http://veehd.com/video/4905758_Elysian-Fields-Channeling',
+ 'info_dict': {
+ 'id': '4905758',
+ 'ext': 'mp4',
+ 'title': 'Elysian Fields - Channeling',
+ 'description': 'md5:360e4e95fdab58aefbea0f2a19e5604b',
+ 'uploader_id': 'spotted',
+ }
+ }, {
+ 'url': 'http://veehd.com/video/2046729_2012-2009-DivX-Trailer',
+ 'info_dict': {
+ 'id': '2046729',
+ 'ext': 'avi',
+ 'title': '2012 (2009) DivX Trailer',
+ 'description': 'md5:75435ee95255e6a9838ac6f6f3a2396b',
+ 'uploader_id': 'Movie_Trailers',
+ }
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -48,13 +70,21 @@ class VeeHDIE(InfoExtractor):
player_page = self._download_webpage(
player_url, video_id, 'Downloading player page')
+ video_url = None
+
config_json = self._search_regex(
r'value=\'config=({.+?})\'', player_page, 'config json', default=None)
if config_json:
config = json.loads(config_json)
- video_url = compat_urlparse.unquote(config['clip']['url'])
- else:
+ video_url = compat_urllib_parse_unquote(config['clip']['url'])
+
+ if not video_url:
+ video_url = self._html_search_regex(
+ r'<embed[^>]+type="video/divx"[^>]+src="([^"]+)"',
+ player_page, 'video url', default=None)
+
+ if not video_url:
iframe_src = self._search_regex(
r'<iframe[^>]+src="/?([^"]+)"', player_page, 'iframe url')
iframe_url = 'http://veehd.com/%s' % iframe_src
@@ -82,7 +112,6 @@ class VeeHDIE(InfoExtractor):
'id': video_id,
'title': title,
'url': video_url,
- 'ext': 'mp4',
'uploader_id': uploader_id,
'thumbnail': thumbnail,
'description': description,
diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py
index 01e258e32..9633f7ffe 100644
--- a/youtube_dl/extractor/veoh.py
+++ b/youtube_dl/extractor/veoh.py
@@ -4,12 +4,10 @@ import re
import json
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_request,
-)
from ..utils import (
int_or_none,
ExtractorError,
+ sanitized_Request,
)
@@ -110,7 +108,7 @@ class VeohIE(InfoExtractor):
if 'class="adultwarning-container"' in webpage:
self.report_age_confirmation()
age_limit = 18
- request = compat_urllib_request.Request(url)
+ request = sanitized_Request(url)
request.add_header('Cookie', 'confirmedAdult=true')
webpage = self._download_webpage(request, video_id)
diff --git a/youtube_dl/extractor/vessel.py b/youtube_dl/extractor/vessel.py
new file mode 100644
index 000000000..1a0ff3395
--- /dev/null
+++ b/youtube_dl/extractor/vessel.py
@@ -0,0 +1,133 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ parse_iso8601,
+ sanitized_Request,
+)
+
+
+class VesselIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?vessel\.com/videos/(?P<id>[0-9a-zA-Z]+)'
+ _API_URL_TEMPLATE = 'https://www.vessel.com/api/view/items/%s'
+ _LOGIN_URL = 'https://www.vessel.com/api/account/login'
+ _NETRC_MACHINE = 'vessel'
+ _TEST = {
+ 'url': 'https://www.vessel.com/videos/HDN7G5UMs',
+ 'md5': '455cdf8beb71c6dd797fd2f3818d05c4',
+ 'info_dict': {
+ 'id': 'HDN7G5UMs',
+ 'ext': 'mp4',
+ 'title': 'Nvidia GeForce GTX Titan X - The Best Video Card on the Market?',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'upload_date': '20150317',
+ 'description': 'Did Nvidia pull out all the stops on the Titan X, or does its performance leave something to be desired?',
+ 'timestamp': int,
+ },
+ }
+
+ @staticmethod
+ def make_json_request(url, data):
+ payload = json.dumps(data).encode('utf-8')
+ req = sanitized_Request(url, payload)
+ req.add_header('Content-Type', 'application/json; charset=utf-8')
+ return req
+
+ @staticmethod
+ def find_assets(data, asset_type, asset_id=None):
+ for asset in data.get('assets', []):
+ if not asset.get('type') == asset_type:
+ continue
+ elif asset_id is not None and not asset.get('id') == asset_id:
+ continue
+ else:
+ yield asset
+
+ def _check_access_rights(self, data):
+ access_info = data.get('__view', {})
+ if not access_info.get('allow_access', True):
+ err_code = access_info.get('error_code') or ''
+ if err_code == 'ITEM_PAID_ONLY':
+ raise ExtractorError(
+ 'This video requires subscription.', expected=True)
+ else:
+ raise ExtractorError(
+ 'Access to this content is restricted. (%s said: %s)' % (self.IE_NAME, err_code), expected=True)
+
+ def _login(self):
+ (username, password) = self._get_login_info()
+ if username is None:
+ return
+ self.report_login()
+ data = {
+ 'client_id': 'web',
+ 'type': 'password',
+ 'user_key': username,
+ 'password': password,
+ }
+ login_request = VesselIE.make_json_request(self._LOGIN_URL, data)
+ self._download_webpage(login_request, None, False, 'Wrong login info')
+
+ def _real_initialize(self):
+ self._login()
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+ data = self._parse_json(self._search_regex(
+ r'App\.bootstrapData\((.*?)\);', webpage, 'data'), video_id)
+ asset_id = data['model']['data']['id']
+
+ req = VesselIE.make_json_request(
+ self._API_URL_TEMPLATE % asset_id, {'client': 'web'})
+ data = self._download_json(req, video_id)
+ video_asset_id = data.get('main_video_asset')
+
+ self._check_access_rights(data)
+
+ try:
+ video_asset = next(
+ VesselIE.find_assets(data, 'video', asset_id=video_asset_id))
+ except StopIteration:
+ raise ExtractorError('No video assets found')
+
+ formats = []
+ for f in video_asset.get('sources', []):
+ if f['name'] == 'hls-index':
+ formats.extend(self._extract_m3u8_formats(
+ f['location'], video_id, ext='mp4', m3u8_id='m3u8'))
+ else:
+ formats.append({
+ 'format_id': f['name'],
+ 'tbr': f.get('bitrate'),
+ 'height': f.get('height'),
+ 'width': f.get('width'),
+ 'url': f['location'],
+ })
+ self._sort_formats(formats)
+
+ thumbnails = []
+ for im_asset in VesselIE.find_assets(data, 'image'):
+ thumbnails.append({
+ 'url': im_asset['location'],
+ 'width': im_asset.get('width', 0),
+ 'height': im_asset.get('height', 0),
+ })
+
+ return {
+ 'id': video_id,
+ 'title': data['title'],
+ 'formats': formats,
+ 'thumbnails': thumbnails,
+ 'description': data.get('short_description'),
+ 'duration': data.get('duration'),
+ 'comment_count': data.get('comment_count'),
+ 'like_count': data.get('like_count'),
+ 'view_count': data.get('view_count'),
+ 'timestamp': parse_iso8601(data.get('released_at')),
+ }
diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py
index c17094f81..571289421 100644
--- a/youtube_dl/extractor/vevo.py
+++ b/youtube_dl/extractor/vevo.py
@@ -1,15 +1,13 @@
from __future__ import unicode_literals
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_request,
-)
+from ..compat import compat_etree_fromstring
from ..utils import (
ExtractorError,
int_or_none,
+ sanitized_Request,
)
@@ -73,7 +71,7 @@ class VevoIE(InfoExtractor):
_SMIL_BASE_URL = 'http://smil.lvl3.vevo.com/'
def _real_initialize(self):
- req = compat_urllib_request.Request(
+ req = sanitized_Request(
'http://www.vevo.com/auth', data=b'')
webpage = self._download_webpage(
req, None,
@@ -97,7 +95,7 @@ class VevoIE(InfoExtractor):
if last_version['version'] == -1:
raise ExtractorError('Unable to extract last version of the video')
- renditions = xml.etree.ElementTree.fromstring(last_version['data'])
+ renditions = compat_etree_fromstring(last_version['data'])
formats = []
# Already sorted from worst to best quality
for rend in renditions.findall('rendition'):
@@ -114,7 +112,7 @@ class VevoIE(InfoExtractor):
def _formats_from_smil(self, smil_xml):
formats = []
- smil_doc = xml.etree.ElementTree.fromstring(smil_xml.encode('utf-8'))
+ smil_doc = compat_etree_fromstring(smil_xml.encode('utf-8'))
els = smil_doc.findall('.//{http://www.w3.org/2001/SMIL20/Language}video')
for el in els:
src = el.attrib['src']
diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py
index 2f111bf7e..f38a72fde 100644
--- a/youtube_dl/extractor/vgtv.py
+++ b/youtube_dl/extractor/vgtv.py
@@ -4,11 +4,26 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import float_or_none
+from ..utils import (
+ ExtractorError,
+ float_or_none,
+)
class VGTVIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?vgtv\.no/#!/(?:.*)/(?P<id>[0-9]+)'
+ IE_DESC = 'VGTV and BTTV'
+ _VALID_URL = r'''(?x)
+ (?:
+ vgtv:|
+ http://(?:www\.)?
+ )
+ (?P<host>vgtv|bt)
+ (?:
+ :|
+ \.no/(?:tv/)?\#!/(?:video|live)/
+ )
+ (?P<id>[0-9]+)
+ '''
_TESTS = [
{
# streamType: vod
@@ -47,16 +62,16 @@ class VGTVIE(InfoExtractor):
},
{
# streamType: live
- 'url': 'http://www.vgtv.no/#!/live/100015/direkte-her-kan-du-se-laksen-live-fra-suldalslaagen',
+ 'url': 'http://www.vgtv.no/#!/live/113063/direkte-v75-fra-solvalla',
'info_dict': {
- 'id': '100015',
+ 'id': '113063',
'ext': 'flv',
- 'title': 'DIREKTE: Her kan du se laksen live fra Suldalslågen!',
- 'description': 'md5:9a60cc23fa349f761628924e56eeec2d',
+ 'title': 're:^DIREKTE: V75 fra Solvalla [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'description': 'md5:b3743425765355855f88e096acc93231',
'thumbnail': 're:^https?://.*\.jpg',
'duration': 0,
- 'timestamp': 1407423348,
- 'upload_date': '20140807',
+ 'timestamp': 1432975582,
+ 'upload_date': '20150530',
'view_count': int,
},
'params': {
@@ -64,25 +79,47 @@ class VGTVIE(InfoExtractor):
'skip_download': True,
},
},
+ {
+ 'url': 'http://www.bt.no/tv/#!/video/100250/norling-dette-er-forskjellen-paa-1-divisjon-og-eliteserien',
+ 'only_matching': True,
+ },
]
def _real_extract(self, url):
- video_id = self._match_id(url)
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ host = mobj.group('host')
+
+ HOST_WEBSITES = {
+ 'vgtv': 'vgtv',
+ 'bt': 'bttv',
+ }
+
data = self._download_json(
- 'http://svp.vg.no/svp/api/v1/vgtv/assets/%s?appName=vgtv-website' % video_id,
+ 'http://svp.vg.no/svp/api/v1/%s/assets/%s?appName=%s-website'
+ % (host, video_id, HOST_WEBSITES[host]),
video_id, 'Downloading media JSON')
+ if data.get('status') == 'inactive':
+ raise ExtractorError(
+ 'Video %s is no longer available' % video_id, expected=True)
+
streams = data['streamUrls']
+ stream_type = data.get('streamType')
formats = []
hls_url = streams.get('hls')
if hls_url:
- formats.extend(self._extract_m3u8_formats(hls_url, video_id, 'mp4'))
+ formats.extend(self._extract_m3u8_formats(
+ hls_url, video_id, 'mp4', m3u8_id='hls'))
hds_url = streams.get('hds')
- if hds_url:
- formats.extend(self._extract_f4m_formats(hds_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', video_id))
+ # wasLive hds are always 404
+ if hds_url and stream_type != 'wasLive':
+ formats.extend(self._extract_f4m_formats(
+ hds_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
+ video_id, f4m_id='hds'))
mp4_url = streams.get('mp4')
if mp4_url:
@@ -107,11 +144,60 @@ class VGTVIE(InfoExtractor):
return {
'id': video_id,
- 'title': data['title'],
+ 'title': self._live_title(data['title']),
'description': data['description'],
'thumbnail': data['images']['main'] + '?t[]=900x506q80',
'timestamp': data['published'],
'duration': float_or_none(data['duration'], 1000),
'view_count': data['displays'],
'formats': formats,
+ 'is_live': True if stream_type == 'live' else False,
}
+
+
+class BTArticleIE(InfoExtractor):
+ IE_NAME = 'bt:article'
+ IE_DESC = 'Bergens Tidende Articles'
+ _VALID_URL = 'http://(?:www\.)?bt\.no/(?:[^/]+/)+(?P<id>[^/]+)-\d+\.html'
+ _TEST = {
+ 'url': 'http://www.bt.no/nyheter/lokalt/Kjemper-for-internatet-1788214.html',
+ 'md5': 'd055e8ee918ef2844745fcfd1a4175fb',
+ 'info_dict': {
+ 'id': '23199',
+ 'ext': 'mp4',
+ 'title': 'Alrekstad internat',
+ 'description': 'md5:dc81a9056c874fedb62fc48a300dac58',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 191,
+ 'timestamp': 1289991323,
+ 'upload_date': '20101117',
+ 'view_count': int,
+ },
+ }
+
+ def _real_extract(self, url):
+ webpage = self._download_webpage(url, self._match_id(url))
+ video_id = self._search_regex(
+ r'SVP\.Player\.load\(\s*(\d+)', webpage, 'video id')
+ return self.url_result('vgtv:bt:%s' % video_id, 'VGTV')
+
+
+class BTVestlendingenIE(InfoExtractor):
+ IE_NAME = 'bt:vestlendingen'
+ IE_DESC = 'Bergens Tidende - Vestlendingen'
+ _VALID_URL = 'http://(?:www\.)?bt\.no/spesial/vestlendingen/#!/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://www.bt.no/spesial/vestlendingen/#!/86588',
+ 'md5': 'd7d17e3337dc80de6d3a540aefbe441b',
+ 'info_dict': {
+ 'id': '86588',
+ 'ext': 'mov',
+ 'title': 'Otto Wollertsen',
+ 'description': 'Vestlendingen Otto Fredrik Wollertsen',
+ 'timestamp': 1430473209,
+ 'upload_date': '20150501',
+ },
+ }
+
+ def _real_extract(self, url):
+ return self.url_result('xstream:btno:%s' % self._match_id(url), 'Xstream')
diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py
index 71f520fb5..01af7a995 100644
--- a/youtube_dl/extractor/vice.py
+++ b/youtube_dl/extractor/vice.py
@@ -1,5 +1,4 @@
from __future__ import unicode_literals
-import re
from .common import InfoExtractor
from .ooyala import OoyalaIE
@@ -7,31 +6,34 @@ from ..utils import ExtractorError
class ViceIE(InfoExtractor):
- _VALID_URL = r'http://www\.vice\.com/.*?/(?P<name>.+)'
+ _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)+(?P<id>.+)'
- _TEST = {
- 'url': 'http://www.vice.com/Fringes/cowboy-capitalists-part-1',
- 'info_dict': {
- 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp',
- 'ext': 'mp4',
- 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
- },
- 'params': {
- # Requires ffmpeg (m3u8 manifest)
- 'skip_download': True,
- },
- }
+ _TESTS = [
+ {
+ 'url': 'http://www.vice.com/Fringes/cowboy-capitalists-part-1',
+ 'info_dict': {
+ 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp',
+ 'ext': 'mp4',
+ 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
+ },
+ 'params': {
+ # Requires ffmpeg (m3u8 manifest)
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab',
+ 'only_matching': True,
+ }
+ ]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- name = mobj.group('name')
- webpage = self._download_webpage(url, name)
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
try:
embed_code = self._search_regex(
r'embedCode=([^&\'"]+)', webpage,
'ooyala embed code')
ooyala_url = OoyalaIE._url_for_embed_code(embed_code)
- print(ooyala_url)
except ExtractorError:
raise ExtractorError('The page doesn\'t contain a video', expected=True)
return self.url_result(ooyala_url, ie='Ooyala')
diff --git a/youtube_dl/extractor/viddler.py b/youtube_dl/extractor/viddler.py
index 8516a2940..40ffbad2a 100644
--- a/youtube_dl/extractor/viddler.py
+++ b/youtube_dl/extractor/viddler.py
@@ -4,9 +4,7 @@ from .common import InfoExtractor
from ..utils import (
float_or_none,
int_or_none,
-)
-from ..compat import (
- compat_urllib_request
+ sanitized_Request,
)
@@ -65,7 +63,7 @@ class ViddlerIE(InfoExtractor):
'http://api.viddler.com/api/v2/viddler.videos.getPlaybackDetails.json?video_id=%s&key=v0vhrt7bg2xq1vyxhkct' %
video_id)
headers = {'Referer': 'http://static.cdn-ec.viddler.com/js/arpeggio/v2/embed.html'}
- request = compat_urllib_request.Request(json_url, None, headers)
+ request = sanitized_Request(json_url, None, headers)
data = self._download_json(request, video_id)['video']
formats = []
diff --git a/youtube_dl/extractor/videobam.py b/youtube_dl/extractor/videobam.py
deleted file mode 100644
index 0eb3d9414..000000000
--- a/youtube_dl/extractor/videobam.py
+++ /dev/null
@@ -1,81 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-import json
-
-from .common import InfoExtractor
-from ..utils import int_or_none
-
-
-class VideoBamIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?videobam\.com/(?:videos/download/)?(?P<id>[a-zA-Z]+)'
-
- _TESTS = [
- {
- 'url': 'http://videobam.com/OiJQM',
- 'md5': 'db471f27763a531f10416a0c58b5a1e0',
- 'info_dict': {
- 'id': 'OiJQM',
- 'ext': 'mp4',
- 'title': 'Is Alcohol Worse Than Ecstasy?',
- 'description': 'md5:d25b96151515c91debc42bfbb3eb2683',
- 'uploader': 'frihetsvinge',
- },
- },
- {
- 'url': 'http://videobam.com/pqLvq',
- 'md5': 'd9a565b5379a99126ef94e1d7f9a383e',
- 'note': 'HD video',
- 'info_dict': {
- 'id': 'pqLvq',
- 'ext': 'mp4',
- 'title': '_',
- }
- },
- ]
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
- page = self._download_webpage('http://videobam.com/%s' % video_id, video_id, 'Downloading page')
-
- formats = []
-
- for preference, format_id in enumerate(['low', 'high']):
- mobj = re.search(r"%s: '(?P<url>[^']+)'" % format_id, page)
- if not mobj:
- continue
- formats.append({
- 'url': mobj.group('url'),
- 'ext': 'mp4',
- 'format_id': format_id,
- 'preference': preference,
- })
-
- if not formats:
- player_config = json.loads(self._html_search_regex(r'var player_config = ({.+?});', page, 'player config'))
- formats = [{
- 'url': item['url'],
- 'ext': 'mp4',
- } for item in player_config['playlist'] if 'autoPlay' in item]
-
- self._sort_formats(formats)
-
- title = self._og_search_title(page, default='_', fatal=False)
- description = self._og_search_description(page, default=None)
- thumbnail = self._og_search_thumbnail(page)
- uploader = self._html_search_regex(r'Upload by ([^<]+)</a>', page, 'uploader', fatal=False, default=None)
- view_count = int_or_none(
- self._html_search_regex(r'<strong>Views:</strong> (\d+) ', page, 'view count', fatal=False))
-
- return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'uploader': uploader,
- 'view_count': view_count,
- 'formats': formats,
- 'age_limit': 18,
- }
diff --git a/youtube_dl/extractor/videofyme.py b/youtube_dl/extractor/videofyme.py
index 94f9e9be9..cd3f50a63 100644
--- a/youtube_dl/extractor/videofyme.py
+++ b/youtube_dl/extractor/videofyme.py
@@ -2,8 +2,8 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
- find_xpath_attr,
int_or_none,
+ parse_iso8601,
)
@@ -18,33 +18,35 @@ class VideofyMeIE(InfoExtractor):
'id': '1100701',
'ext': 'mp4',
'title': 'This is VideofyMe',
- 'description': None,
+ 'description': '',
+ 'upload_date': '20130326',
+ 'timestamp': 1364288959,
'uploader': 'VideofyMe',
'uploader_id': 'thisisvideofyme',
'view_count': int,
+ 'likes': int,
+ 'comment_count': int,
},
-
}
def _real_extract(self, url):
video_id = self._match_id(url)
- config = self._download_xml('http://sunshine.videofy.me/?videoId=%s' % video_id,
- video_id)
- video = config.find('video')
- sources = video.find('sources')
- url_node = next(node for node in [find_xpath_attr(sources, 'source', 'id', 'HQ %s' % key)
- for key in ['on', 'av', 'off']] if node is not None)
- video_url = url_node.find('url').text
- view_count = int_or_none(self._search_regex(
- r'([0-9]+)', video.find('views').text, 'view count', fatal=False))
+
+ config = self._download_json('http://vf-player-info-loader.herokuapp.com/%s.json' % video_id, video_id)['videoinfo']
+
+ video = config.get('video')
+ blog = config.get('blog', {})
return {
'id': video_id,
- 'title': video.find('title').text,
- 'url': video_url,
- 'thumbnail': video.find('thumb').text,
- 'description': video.find('description').text,
- 'uploader': config.find('blog/name').text,
- 'uploader_id': video.find('identifier').text,
- 'view_count': view_count,
+ 'title': video['title'],
+ 'url': video['sources']['source']['url'],
+ 'thumbnail': video.get('thumb'),
+ 'description': video.get('description'),
+ 'timestamp': parse_iso8601(video.get('date')),
+ 'uploader': blog.get('name'),
+ 'uploader_id': blog.get('identifier'),
+ 'view_count': int_or_none(self._search_regex(r'([0-9]+)', video.get('views'), 'view count', fatal=False)),
+ 'likes': int_or_none(video.get('likes')),
+ 'comment_count': int_or_none(video.get('nrOfComments')),
}
diff --git a/youtube_dl/extractor/videolecturesnet.py b/youtube_dl/extractor/videolecturesnet.py
deleted file mode 100644
index ebd2a3dca..000000000
--- a/youtube_dl/extractor/videolecturesnet.py
+++ /dev/null
@@ -1,70 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import (
- find_xpath_attr,
- int_or_none,
- parse_duration,
- unified_strdate,
-)
-
-
-class VideoLecturesNetIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/#?]+)/'
- IE_NAME = 'videolectures.net'
-
- _TEST = {
- 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/',
- 'info_dict': {
- 'id': 'promogram_igor_mekjavic_eng',
- 'ext': 'mp4',
- 'title': 'Automatics, robotics and biocybernetics',
- 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482',
- 'upload_date': '20130627',
- 'duration': 565,
- 'thumbnail': 're:http://.*\.jpg',
- },
- }
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
- smil_url = 'http://videolectures.net/%s/video/1/smil.xml' % video_id
- smil = self._download_xml(smil_url, video_id)
-
- title = find_xpath_attr(smil, './/meta', 'name', 'title').attrib['content']
- description_el = find_xpath_attr(smil, './/meta', 'name', 'abstract')
- description = (
- None if description_el is None
- else description_el.attrib['content'])
- upload_date = unified_strdate(
- find_xpath_attr(smil, './/meta', 'name', 'date').attrib['content'])
-
- switch = smil.find('.//switch')
- duration = parse_duration(switch.attrib.get('dur'))
- thumbnail_el = find_xpath_attr(switch, './image', 'type', 'thumbnail')
- thumbnail = (
- None if thumbnail_el is None else thumbnail_el.attrib.get('src'))
-
- formats = [{
- 'url': v.attrib['src'],
- 'width': int_or_none(v.attrib.get('width')),
- 'height': int_or_none(v.attrib.get('height')),
- 'filesize': int_or_none(v.attrib.get('size')),
- 'tbr': int_or_none(v.attrib.get('systemBitrate')) / 1000.0,
- 'ext': v.attrib.get('ext'),
- } for v in switch.findall('./video')
- if v.attrib.get('proto') == 'http']
-
- return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'upload_date': upload_date,
- 'duration': duration,
- 'thumbnail': thumbnail,
- 'formats': formats,
- }
diff --git a/youtube_dl/extractor/videomega.py b/youtube_dl/extractor/videomega.py
index 273030316..87aca327b 100644
--- a/youtube_dl/extractor/videomega.py
+++ b/youtube_dl/extractor/videomega.py
@@ -4,63 +4,50 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
- compat_urllib_request,
-)
-from ..utils import (
- ExtractorError,
- remove_start,
-)
+from ..utils import sanitized_Request
class VideoMegaIE(InfoExtractor):
- _VALID_URL = r'''(?x)https?://
- (?:www\.)?videomega\.tv/
- (?:iframe\.php)?\?ref=(?P<id>[A-Za-z0-9]+)
- '''
- _TEST = {
- 'url': 'http://videomega.tv/?ref=QR0HCUHI1661IHUCH0RQ',
- 'md5': 'bf5c2f95c4c917536e80936af7bc51e1',
+ _VALID_URL = r'(?:videomega:|https?://(?:www\.)?videomega\.tv/(?:(?:view|iframe|cdn)\.php)?\?ref=)(?P<id>[A-Za-z0-9]+)'
+ _TESTS = [{
+ 'url': 'http://videomega.tv/cdn.php?ref=AOSQBJYKIDDIKYJBQSOA',
+ 'md5': 'cc1920a58add3f05c6a93285b84fb3aa',
'info_dict': {
- 'id': 'QR0HCUHI1661IHUCH0RQ',
+ 'id': 'AOSQBJYKIDDIKYJBQSOA',
'ext': 'mp4',
- 'title': 'Big Buck Bunny',
+ 'title': '1254207',
'thumbnail': 're:^https?://.*\.jpg$',
}
- }
+ }, {
+ 'url': 'http://videomega.tv/cdn.php?ref=AOSQBJYKIDDIKYJBQSOA&width=1070&height=600',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://videomega.tv/view.php?ref=090051111052065112106089103052052103089106112065052111051090',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
- iframe_url = 'http://videomega.tv/iframe.php?ref={0:}'.format(video_id)
- req = compat_urllib_request.Request(iframe_url)
+ iframe_url = 'http://videomega.tv/cdn.php?ref=%s' % video_id
+ req = sanitized_Request(iframe_url)
req.add_header('Referer', url)
+ req.add_header('Cookie', 'noadvtday=0')
webpage = self._download_webpage(req, video_id)
- try:
- escaped_data = re.findall(r'unescape\("([^"]+)"\)', webpage)[-1]
- except IndexError:
- raise ExtractorError('Unable to extract escaped data')
-
- playlist = compat_urllib_parse.unquote(escaped_data)
-
+ title = self._html_search_regex(
+ r'<title>(.+?)</title>', webpage, 'title')
+ title = re.sub(
+ r'(?:^[Vv]ideo[Mm]ega\.tv\s-\s*|\s*-\svideomega\.tv$)', '', title)
thumbnail = self._search_regex(
- r'image:\s*"([^"]+)"', playlist, 'thumbnail', fatal=False)
- video_url = self._search_regex(r'file:\s*"([^"]+)"', playlist, 'URL')
- title = remove_start(self._html_search_regex(
- r'<title>(.*?)</title>', webpage, 'title'), 'VideoMega.tv - ')
-
- formats = [{
- 'format_id': 'sd',
- 'url': video_url,
- }]
- self._sort_formats(formats)
+ r'<video[^>]+?poster="([^"]+)"', webpage, 'thumbnail', fatal=False)
+ video_url = self._search_regex(
+ r'<source[^>]+?src="([^"]+)"', webpage, 'video URL')
return {
'id': video_id,
'title': title,
- 'formats': formats,
+ 'url': video_url,
'thumbnail': thumbnail,
'http_headers': {
'Referer': iframe_url,
diff --git a/youtube_dl/extractor/videott.py b/youtube_dl/extractor/videott.py
index ececc7ee0..591024ead 100644
--- a/youtube_dl/extractor/videott.py
+++ b/youtube_dl/extractor/videott.py
@@ -43,7 +43,7 @@ class VideoTtIE(InfoExtractor):
formats = [
{
- 'url': base64.b64decode(res['u']).decode('utf-8'),
+ 'url': base64.b64decode(res['u'].encode('utf-8')).decode('utf-8'),
'ext': 'flv',
'format_id': res['l'],
} for res in settings['res'] if res['u']
diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py
index 5c89824c1..3d63ed4f0 100644
--- a/youtube_dl/extractor/vidme.py
+++ b/youtube_dl/extractor/vidme.py
@@ -1,18 +1,18 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
+from ..compat import compat_HTTPError
from ..utils import (
+ ExtractorError,
int_or_none,
float_or_none,
- str_to_int,
+ parse_iso8601,
)
class VidmeIE(InfoExtractor):
_VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]+)'
- _TEST = {
+ _TESTS = [{
'url': 'https://vid.me/QNB',
'md5': 'f42d05e7149aeaec5c037b17e5d3dc82',
'info_dict': {
@@ -20,49 +20,185 @@ class VidmeIE(InfoExtractor):
'ext': 'mp4',
'title': 'Fishing for piranha - the easy way',
'description': 'source: https://www.facebook.com/photo.php?v=312276045600871',
- 'duration': 119.92,
+ 'thumbnail': 're:^https?://.*\.jpg',
'timestamp': 1406313244,
'upload_date': '20140725',
+ 'age_limit': 0,
+ 'duration': 119.92,
+ 'view_count': int,
+ 'like_count': int,
+ 'comment_count': int,
+ },
+ }, {
+ 'url': 'https://vid.me/Gc6M',
+ 'md5': 'f42d05e7149aeaec5c037b17e5d3dc82',
+ 'info_dict': {
+ 'id': 'Gc6M',
+ 'ext': 'mp4',
+ 'title': 'O Mere Dil ke chain - Arnav and Khushi VM',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'timestamp': 1441211642,
+ 'upload_date': '20150902',
+ 'uploader': 'SunshineM',
+ 'uploader_id': '3552827',
+ 'age_limit': 0,
+ 'duration': 223.72,
+ 'view_count': int,
+ 'like_count': int,
+ 'comment_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # tests uploader field
+ 'url': 'https://vid.me/4Iib',
+ 'info_dict': {
+ 'id': '4Iib',
+ 'ext': 'mp4',
+ 'title': 'The Carver',
+ 'description': 'md5:e9c24870018ae8113be936645b93ba3c',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'timestamp': 1433203629,
+ 'upload_date': '20150602',
+ 'uploader': 'Thomas',
+ 'uploader_id': '109747',
+ 'age_limit': 0,
+ 'duration': 97.859999999999999,
+ 'view_count': int,
+ 'like_count': int,
+ 'comment_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # nsfw test from http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching
+ 'url': 'https://vid.me/e/Wmur',
+ 'info_dict': {
+ 'id': 'Wmur',
+ 'ext': 'mp4',
+ 'title': 'naked smoking & stretching',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'timestamp': 1430931613,
+ 'upload_date': '20150506',
+ 'uploader': 'naked-yogi',
+ 'uploader_id': '1638622',
+ 'age_limit': 18,
+ 'duration': 653.26999999999998,
+ 'view_count': int,
+ 'like_count': int,
+ 'comment_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # nsfw, user-disabled
+ 'url': 'https://vid.me/dzGJ',
+ 'only_matching': True,
+ }, {
+ # suspended
+ 'url': 'https://vid.me/Ox3G',
+ 'only_matching': True,
+ }, {
+ # deleted
+ 'url': 'https://vid.me/KTPm',
+ 'only_matching': True,
+ }, {
+ # no formats in the API response
+ 'url': 'https://vid.me/e5g',
+ 'info_dict': {
+ 'id': 'e5g',
+ 'ext': 'mp4',
+ 'title': 'Video upload (e5g)',
'thumbnail': 're:^https?://.*\.jpg',
+ 'timestamp': 1401480195,
+ 'upload_date': '20140530',
+ 'uploader': None,
+ 'uploader_id': None,
+ 'age_limit': 0,
+ 'duration': 483,
+ 'view_count': int,
+ 'like_count': int,
+ 'comment_count': int,
},
- }
+ 'params': {
+ 'skip_download': True,
+ },
+ }]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
+
+ try:
+ response = self._download_json(
+ 'https://api.vid.me/videoByUrl/%s' % video_id, video_id)
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
+ response = self._parse_json(e.cause.read(), video_id)
+ else:
+ raise
+
+ error = response.get('error')
+ if error:
+ raise ExtractorError(
+ '%s returned error: %s' % (self.IE_NAME, error), expected=True)
+
+ video = response['video']
+
+ if video.get('state') == 'deleted':
+ raise ExtractorError(
+ 'Vidme said: Sorry, this video has been deleted.',
+ expected=True)
+
+ if video.get('state') in ('user-disabled', 'suspended'):
+ raise ExtractorError(
+ 'Vidme said: This video has been suspended either due to a copyright claim, '
+ 'or for violating the terms of use.',
+ expected=True)
+
+ formats = [{
+ 'format_id': f.get('type'),
+ 'url': f['uri'],
+ 'width': int_or_none(f.get('width')),
+ 'height': int_or_none(f.get('height')),
+ 'preference': 0 if f.get('type', '').endswith('clip') else 1,
+ } for f in video.get('formats', []) if f.get('uri')]
- webpage = self._download_webpage(url, video_id)
+ if not formats and video.get('complete_url'):
+ formats.append({
+ 'url': video.get('complete_url'),
+ 'width': int_or_none(video.get('width')),
+ 'height': int_or_none(video.get('height')),
+ })
- video_url = self._html_search_regex(r'<source src="([^"]+)"', webpage, 'video URL')
+ self._sort_formats(formats)
- title = self._og_search_title(webpage)
- description = self._og_search_description(webpage, default='')
- thumbnail = self._og_search_thumbnail(webpage)
- timestamp = int_or_none(self._og_search_property('updated_time', webpage, fatal=False))
- width = int_or_none(self._og_search_property('video:width', webpage, fatal=False))
- height = int_or_none(self._og_search_property('video:height', webpage, fatal=False))
- duration = float_or_none(self._html_search_regex(
- r'data-duration="([^"]+)"', webpage, 'duration', fatal=False))
- view_count = str_to_int(self._html_search_regex(
- r'<span class="video_views">\s*([\d,\.]+)\s*plays?', webpage, 'view count', fatal=False))
- like_count = str_to_int(self._html_search_regex(
- r'class="score js-video-vote-score"[^>]+data-score="([\d,\.\s]+)">',
- webpage, 'like count', fatal=False))
- comment_count = str_to_int(self._html_search_regex(
- r'class="js-comment-count"[^>]+data-count="([\d,\.\s]+)">',
- webpage, 'comment count', fatal=False))
+ title = video['title']
+ description = video.get('description')
+ thumbnail = video.get('thumbnail_url')
+ timestamp = parse_iso8601(video.get('date_created'), ' ')
+ uploader = video.get('user', {}).get('username')
+ uploader_id = video.get('user', {}).get('user_id')
+ age_limit = 18 if video.get('nsfw') is True else 0
+ duration = float_or_none(video.get('duration'))
+ view_count = int_or_none(video.get('view_count'))
+ like_count = int_or_none(video.get('likes_count'))
+ comment_count = int_or_none(video.get('comment_count'))
return {
'id': video_id,
- 'url': video_url,
- 'title': title,
+ 'title': title or 'Video upload (%s)' % video_id,
'description': description,
'thumbnail': thumbnail,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'age_limit': age_limit,
'timestamp': timestamp,
- 'width': width,
- 'height': height,
'duration': duration,
'view_count': view_count,
'like_count': like_count,
'comment_count': comment_count,
+ 'formats': formats,
}
diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py
index 08a5a7b8d..2ba9f31df 100644
--- a/youtube_dl/extractor/vidzi.py
+++ b/youtube_dl/extractor/vidzi.py
@@ -20,8 +20,14 @@ class VidziIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- video_url = self._html_search_regex(
- r'{\s*file\s*:\s*"([^"]+)"\s*}', webpage, 'video url')
+ video_host = self._html_search_regex(
+ r'id=\'vplayer\'><img src="http://(.*?)/i', webpage,
+ 'video host')
+ video_hash = self._html_search_regex(
+ r'\|([a-z0-9]+)\|hls\|type', webpage, 'video_hash')
+ ext = self._html_search_regex(
+ r'\|tracks\|([a-z0-9]+)\|', webpage, 'video ext')
+ video_url = 'http://' + video_host + '/' + video_hash + '/v.' + ext
title = self._html_search_regex(
r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title')
diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py
index 619039e51..c76c20614 100644
--- a/youtube_dl/extractor/vier.py
+++ b/youtube_dl/extractor/vier.py
@@ -2,6 +2,7 @@
from __future__ import unicode_literals
import re
+import itertools
from .common import InfoExtractor
@@ -38,11 +39,14 @@ class VierIE(InfoExtractor):
webpage = self._download_webpage(url, display_id)
video_id = self._search_regex(
- r'"nid"\s*:\s*"(\d+)"', webpage, 'video id')
+ [r'data-nid="(\d+)"', r'"nid"\s*:\s*"(\d+)"'],
+ webpage, 'video id')
application = self._search_regex(
- r'"application"\s*:\s*"([^"]+)"', webpage, 'application', default='vier_vod')
+ [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'],
+ webpage, 'application', default='vier_vod')
filename = self._search_regex(
- r'"filename"\s*:\s*"([^"]+)"', webpage, 'filename')
+ [r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'],
+ webpage, 'filename')
playlist_url = 'http://vod.streamcloud.be/%s/mp4:_definst_/%s.mp4/playlist.m3u8' % (application, filename)
formats = self._extract_m3u8_formats(playlist_url, display_id, 'mp4')
@@ -88,31 +92,27 @@ class VierVideosIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
program = mobj.group('program')
- webpage = self._download_webpage(url, program)
-
page_id = mobj.group('page')
if page_id:
page_id = int(page_id)
start_page = page_id
- last_page = start_page + 1
playlist_id = '%s-page%d' % (program, page_id)
else:
start_page = 0
- last_page = int(self._search_regex(
- r'videos\?page=(\d+)">laatste</a>',
- webpage, 'last page', default=0)) + 1
playlist_id = program
entries = []
- for current_page_id in range(start_page, last_page):
+ for current_page_id in itertools.count(start_page):
current_page = self._download_webpage(
'http://www.vier.be/%s/videos?page=%d' % (program, current_page_id),
program,
- 'Downloading page %d' % (current_page_id + 1)) if current_page_id != page_id else webpage
+ 'Downloading page %d' % (current_page_id + 1))
page_entries = [
self.url_result('http://www.vier.be' + video_url, 'Vier')
for video_url in re.findall(
r'<h3><a href="(/[^/]+/videos/[^/]+(?:/\d+)?)">', current_page)]
entries.extend(page_entries)
+ if page_id or '>Meer<' not in current_page:
+ break
return self.playlist_result(entries, playlist_id)
diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py
new file mode 100644
index 000000000..185b1c119
--- /dev/null
+++ b/youtube_dl/extractor/viewster.py
@@ -0,0 +1,172 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_HTTPError,
+ compat_urllib_parse,
+ compat_urllib_parse_unquote,
+)
+from ..utils import (
+ determine_ext,
+ ExtractorError,
+ int_or_none,
+ parse_iso8601,
+ sanitized_Request,
+ HEADRequest,
+)
+
+
+class ViewsterIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?viewster\.com/(?:serie|movie)/(?P<id>\d+-\d+-\d+)'
+ _TESTS = [{
+ # movie, Type=Movie
+ 'url': 'http://www.viewster.com/movie/1140-11855-000/the-listening-project/',
+ 'md5': 'e642d1b27fcf3a4ffa79f194f5adde36',
+ 'info_dict': {
+ 'id': '1140-11855-000',
+ 'ext': 'mp4',
+ 'title': 'The listening Project',
+ 'description': 'md5:bac720244afd1a8ea279864e67baa071',
+ 'timestamp': 1214870400,
+ 'upload_date': '20080701',
+ 'duration': 4680,
+ },
+ }, {
+ # series episode, Type=Episode
+ 'url': 'http://www.viewster.com/serie/1284-19427-001/the-world-and-a-wall/',
+ 'md5': '9243079a8531809efe1b089db102c069',
+ 'info_dict': {
+ 'id': '1284-19427-001',
+ 'ext': 'mp4',
+ 'title': 'The World and a Wall',
+ 'description': 'md5:24814cf74d3453fdf5bfef9716d073e3',
+ 'timestamp': 1428192000,
+ 'upload_date': '20150405',
+ 'duration': 1500,
+ },
+ }, {
+ # serie, Type=Serie
+ 'url': 'http://www.viewster.com/serie/1303-19426-000/',
+ 'info_dict': {
+ 'id': '1303-19426-000',
+ 'title': 'Is It Wrong to Try to Pick up Girls in a Dungeon?',
+ 'description': 'md5:eeda9bef25b0d524b3a29a97804c2f11',
+ },
+ 'playlist_count': 13,
+ }, {
+ # unfinished serie, no Type
+ 'url': 'http://www.viewster.com/serie/1284-19427-000/baby-steps-season-2/',
+ 'info_dict': {
+ 'id': '1284-19427-000',
+ 'title': 'Baby Steps—Season 2',
+ 'description': 'md5:e7097a8fc97151e25f085c9eb7a1cdb1',
+ },
+ 'playlist_mincount': 16,
+ }, {
+ # geo restricted series
+ 'url': 'https://www.viewster.com/serie/1280-18794-002/',
+ 'only_matching': True,
+ }, {
+ # geo restricted video
+ 'url': 'https://www.viewster.com/serie/1280-18794-002/what-is-extraterritoriality-lawo/',
+ 'only_matching': True,
+ }]
+
+ _ACCEPT_HEADER = 'application/json, text/javascript, */*; q=0.01'
+
+ def _download_json(self, url, video_id, note='Downloading JSON metadata', fatal=True):
+ request = sanitized_Request(url)
+ request.add_header('Accept', self._ACCEPT_HEADER)
+ request.add_header('Auth-token', self._AUTH_TOKEN)
+ return super(ViewsterIE, self)._download_json(request, video_id, note, fatal=fatal)
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ # Get 'api_token' cookie
+ self._request_webpage(HEADRequest('http://www.viewster.com/'), video_id)
+ cookies = self._get_cookies('http://www.viewster.com/')
+ self._AUTH_TOKEN = compat_urllib_parse_unquote(cookies['api_token'].value)
+
+ info = self._download_json(
+ 'https://public-api.viewster.com/search/%s' % video_id,
+ video_id, 'Downloading entry JSON')
+
+ entry_id = info.get('Id') or info['id']
+
+ # unfinished serie has no Type
+ if info.get('Type') in ('Serie', None):
+ try:
+ episodes = self._download_json(
+ 'https://public-api.viewster.com/series/%s/episodes' % entry_id,
+ video_id, 'Downloading series JSON')
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
+ self.raise_geo_restricted()
+ else:
+ raise
+ entries = [
+ self.url_result(
+ 'http://www.viewster.com/movie/%s' % episode['OriginId'], 'Viewster')
+ for episode in episodes]
+ title = (info.get('Title') or info['Synopsis']['Title']).strip()
+ description = info.get('Synopsis', {}).get('Detailed')
+ return self.playlist_result(entries, video_id, title, description)
+
+ formats = []
+ for media_type in ('application/f4m+xml', 'application/x-mpegURL', 'video/mp4'):
+ media = self._download_json(
+ 'https://public-api.viewster.com/movies/%s/video?mediaType=%s'
+ % (entry_id, compat_urllib_parse.quote(media_type)),
+ video_id, 'Downloading %s JSON' % media_type, fatal=False)
+ if not media:
+ continue
+ video_url = media.get('Uri')
+ if not video_url:
+ continue
+ ext = determine_ext(video_url)
+ if ext == 'f4m':
+ video_url += '&' if '?' in video_url else '?'
+ video_url += 'hdcore=3.2.0&plugin=flowplayer-3.2.0.1'
+ formats.extend(self._extract_f4m_formats(
+ video_url, video_id, f4m_id='hds'))
+ elif ext == 'm3u8':
+ m3u8_formats = self._extract_m3u8_formats(
+ video_url, video_id, 'mp4', m3u8_id='hls',
+ fatal=False) # m3u8 sometimes fail
+ if m3u8_formats:
+ formats.extend(m3u8_formats)
+ else:
+ format_id = media.get('Bitrate')
+ f = {
+ 'url': video_url,
+ 'format_id': 'mp4-%s' % format_id,
+ 'height': int_or_none(media.get('Height')),
+ 'width': int_or_none(media.get('Width')),
+ 'preference': 1,
+ }
+ if format_id and not f['height']:
+ f['height'] = int_or_none(self._search_regex(
+ r'^(\d+)[pP]$', format_id, 'height', default=None))
+ formats.append(f)
+
+ if not formats and not info.get('LanguageSets') and not info.get('VODSettings'):
+ self.raise_geo_restricted()
+
+ self._sort_formats(formats)
+
+ synopsis = info.get('Synopsis', {})
+ # Prefer title outside synopsis since it's less messy
+ title = (info.get('Title') or synopsis['Title']).strip()
+ description = synopsis.get('Detailed') or info.get('Synopsis', {}).get('Short')
+ duration = int_or_none(info.get('Duration'))
+ timestamp = parse_iso8601(info.get('ReleaseDate'))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/viidea.py b/youtube_dl/extractor/viidea.py
new file mode 100644
index 000000000..525e303d4
--- /dev/null
+++ b/youtube_dl/extractor/viidea.py
@@ -0,0 +1,188 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_urlparse,
+ compat_str,
+)
+from ..utils import (
+ parse_duration,
+ js_to_json,
+ parse_iso8601,
+)
+
+
+class ViideaIE(InfoExtractor):
+ _VALID_URL = r'''(?x)http://(?:www\.)?(?:
+ videolectures\.net|
+ flexilearn\.viidea\.net|
+ presentations\.ocwconsortium\.org|
+ video\.travel-zoom\.si|
+ video\.pomp-forum\.si|
+ tv\.nil\.si|
+ video\.hekovnik.com|
+ video\.szko\.si|
+ kpk\.viidea\.com|
+ inside\.viidea\.net|
+ video\.kiberpipa\.org|
+ bvvideo\.si|
+ kongres\.viidea\.net|
+ edemokracija\.viidea\.com
+ )(?:/lecture)?/(?P<id>[^/]+)(?:/video/(?P<part>\d+))?/*(?:[#?].*)?$'''
+
+ _TESTS = [{
+ 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/',
+ 'info_dict': {
+ 'id': '20171',
+ 'display_id': 'promogram_igor_mekjavic_eng',
+ 'ext': 'mp4',
+ 'title': 'Automatics, robotics and biocybernetics',
+ 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482',
+ 'thumbnail': 're:http://.*\.jpg',
+ 'timestamp': 1372349289,
+ 'upload_date': '20130627',
+ 'duration': 565,
+ },
+ }, {
+ # video with invalid direct format links (HTTP 403)
+ 'url': 'http://videolectures.net/russir2010_filippova_nlp/',
+ 'info_dict': {
+ 'id': '14891',
+ 'display_id': 'russir2010_filippova_nlp',
+ 'ext': 'flv',
+ 'title': 'NLP at Google',
+ 'description': 'md5:fc7a6d9bf0302d7cc0e53f7ca23747b3',
+ 'thumbnail': 're:http://.*\.jpg',
+ 'timestamp': 1284375600,
+ 'upload_date': '20100913',
+ 'duration': 5352,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }, {
+ # event playlist
+ 'url': 'http://videolectures.net/deeplearning2015_montreal/',
+ 'info_dict': {
+ 'id': '23181',
+ 'title': 'Deep Learning Summer School, Montreal 2015',
+ 'description': 'md5:0533a85e4bd918df52a01f0e1ebe87b7',
+ 'thumbnail': 're:http://.*\.jpg',
+ 'timestamp': 1438560000,
+ },
+ 'playlist_count': 30,
+ }, {
+ # multi part lecture
+ 'url': 'http://videolectures.net/mlss09uk_bishop_ibi/',
+ 'info_dict': {
+ 'id': '9737',
+ 'display_id': 'mlss09uk_bishop_ibi',
+ 'title': 'Introduction To Bayesian Inference',
+ 'thumbnail': 're:http://.*\.jpg',
+ 'timestamp': 1251622800,
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': '9737_part1',
+ 'display_id': 'mlss09uk_bishop_ibi_part1',
+ 'ext': 'wmv',
+ 'title': 'Introduction To Bayesian Inference (Part 1)',
+ 'thumbnail': 're:http://.*\.jpg',
+ 'duration': 4622,
+ 'timestamp': 1251622800,
+ 'upload_date': '20090830',
+ },
+ }, {
+ 'info_dict': {
+ 'id': '9737_part2',
+ 'display_id': 'mlss09uk_bishop_ibi_part2',
+ 'ext': 'wmv',
+ 'title': 'Introduction To Bayesian Inference (Part 2)',
+ 'thumbnail': 're:http://.*\.jpg',
+ 'duration': 5641,
+ 'timestamp': 1251622800,
+ 'upload_date': '20090830',
+ },
+ }],
+ 'playlist_count': 2,
+ }]
+
+ def _real_extract(self, url):
+ lecture_slug, explicit_part_id = re.match(self._VALID_URL, url).groups()
+
+ webpage = self._download_webpage(url, lecture_slug)
+
+ cfg = self._parse_json(self._search_regex(
+ [r'cfg\s*:\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*:\s*\(?\s*function',
+ r'cfg\s*:\s*({[^}]+})'],
+ webpage, 'cfg'), lecture_slug, js_to_json)
+
+ lecture_id = compat_str(cfg['obj_id'])
+
+ base_url = self._proto_relative_url(cfg['livepipe'], 'http:')
+
+ lecture_data = self._download_json(
+ '%s/site/api/lecture/%s?format=json' % (base_url, lecture_id),
+ lecture_id)['lecture'][0]
+
+ lecture_info = {
+ 'id': lecture_id,
+ 'display_id': lecture_slug,
+ 'title': lecture_data['title'],
+ 'timestamp': parse_iso8601(lecture_data.get('time')),
+ 'description': lecture_data.get('description_wiki'),
+ 'thumbnail': lecture_data.get('thumb'),
+ }
+
+ playlist_entries = []
+ lecture_type = lecture_data.get('type')
+ parts = [compat_str(video) for video in cfg.get('videos', [])]
+ if parts:
+ multipart = len(parts) > 1
+
+ def extract_part(part_id):
+ smil_url = '%s/%s/video/%s/smil.xml' % (base_url, lecture_slug, part_id)
+ smil = self._download_smil(smil_url, lecture_id)
+ info = self._parse_smil(smil, smil_url, lecture_id)
+ info['id'] = lecture_id if not multipart else '%s_part%s' % (lecture_id, part_id)
+ info['display_id'] = lecture_slug if not multipart else '%s_part%s' % (lecture_slug, part_id)
+ if multipart:
+ info['title'] += ' (Part %s)' % part_id
+ switch = smil.find('.//switch')
+ if switch is not None:
+ info['duration'] = parse_duration(switch.attrib.get('dur'))
+ item_info = lecture_info.copy()
+ item_info.update(info)
+ return item_info
+
+ if explicit_part_id or not multipart:
+ result = extract_part(explicit_part_id or parts[0])
+ else:
+ result = {
+ '_type': 'multi_video',
+ 'entries': [extract_part(part) for part in parts],
+ }
+ result.update(lecture_info)
+
+ # Immediately return explicitly requested part or non event item
+ if explicit_part_id or lecture_type != 'evt':
+ return result
+
+ playlist_entries.append(result)
+
+ # It's probably a playlist
+ if not parts or lecture_type == 'evt':
+ playlist_webpage = self._download_webpage(
+ '%s/site/ajax/drilldown/?id=%s' % (base_url, lecture_id), lecture_id)
+ entries = [
+ self.url_result(compat_urlparse.urljoin(url, video_url), 'Viidea')
+ for _, video_url in re.findall(
+ r'<a[^>]+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', playlist_webpage)]
+ playlist_entries.extend(entries)
+
+ playlist = self.playlist_result(playlist_entries, lecture_id)
+ playlist.update(lecture_info)
+ return playlist
diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py
index 944901e14..a63c23617 100644
--- a/youtube_dl/extractor/viki.py
+++ b/youtube_dl/extractor/viki.py
@@ -1,21 +1,106 @@
+# coding: utf-8
from __future__ import unicode_literals
-import re
+import json
+import time
+import hmac
+import hashlib
+import itertools
+from .common import InfoExtractor
from ..utils import (
ExtractorError,
- unescapeHTML,
- unified_strdate,
- US_RATINGS,
+ int_or_none,
+ parse_age_limit,
+ parse_iso8601,
+ sanitized_Request,
)
-from .subtitles import SubtitlesInfoExtractor
-class VikiIE(SubtitlesInfoExtractor):
- IE_NAME = 'viki'
+class VikiBaseIE(InfoExtractor):
+ # Common base of the Viki extractors in this file: builds signed API
+ # calls (_prepare_call / _call_api) and performs the optional login.
+ # URL prefix the subclasses interpolate into their own _VALID_URL.
+ _VALID_URL_BASE = r'https?://(?:www\.)?viki\.(?:com|net|mx|jp|fr)/'
+ # Query part of an API call; filled with (path, app id, timestamp).
+ _API_QUERY_TEMPLATE = '/v4/%sapp=%s&t=%s&site=www.viki.com'
+ # Complete API URL; filled with (query, signature).
+ _API_URL_TEMPLATE = 'http://api.viki.io%s&sig=%s'
+
+ _APP = '65535a'
+ # NOTE(review): not referenced by any code visible in this patch.
+ _APP_VERSION = '2.2.5.1428709186'
+ # Key of the HMAC-SHA1 signature computed in _prepare_call.
+ _APP_SECRET = '-$iJ}@p7!G@SyU/je1bEyWg}upLu-6V6-Lg9VD(]siH,r.,m-r|ulZ,U4LC/SeR)'
+
+ _NETRC_MACHINE = 'viki'
+
+ # Session token stored by _login; appended to the signed query when set.
+ _token = None
+
+    def _prepare_call(self, path, timestamp=None, post_data=None):
+        """Return a signed API URL, or a POST request when post_data is given.
+
+        The query is built from _API_QUERY_TEMPLATE, extended with the
+        session token when one is set, and signed with HMAC-SHA1 keyed by
+        _APP_SECRET.  A falsy timestamp is replaced by the current time.
+        """
+        path += '&' if '?' in path else '?'
+        timestamp = timestamp or int(time.time())
+        query = self._API_QUERY_TEMPLATE % (path, self._APP, timestamp)
+        if self._token:
+            query = '%s&token=%s' % (query, self._token)
+        secret = self._APP_SECRET.encode('ascii')
+        signature = hmac.new(secret, query.encode('ascii'), hashlib.sha1).hexdigest()
+        signed_url = self._API_URL_TEMPLATE % (query, signature)
+        if not post_data:
+            return signed_url
+        # POST payloads are sent as UTF-8 encoded JSON.
+        payload = json.dumps(post_data).encode('utf-8')
+        return sanitized_Request(signed_url, payload)
+
+    def _call_api(self, path, video_id, note, timestamp=None, post_data=None):
+        """Call the API and return the decoded JSON response.
+
+        An 'invalid timestamp' error is retried once using the
+        'current_timestamp' value from the response; any error that is
+        still present afterwards is raised through _raise_error.
+        """
+        resp = self._download_json(
+            self._prepare_call(path, timestamp, post_data), video_id, note)
+
+        error = resp.get('error')
+        if not error:
+            return resp
+
+        if error == 'invalid timestamp':
+            server_timestamp = int(resp['current_timestamp'])
+            retry_call = self._prepare_call(path, server_timestamp, post_data)
+            resp = self._download_json(
+                retry_call, video_id, '%s (retry)' % note)
+            error = resp.get('error')
+
+        if error:
+            self._raise_error(resp['error'])
+
+        return resp
+
+    def _raise_error(self, error):
+        """Raise an expected ExtractorError reporting the given API error."""
+        message = '%s returned error: %s' % (self.IE_NAME, error)
+        raise ExtractorError(message, expected=True)
+
+    def _real_initialize(self):
+        """Run the login step as the extractor's initialization."""
+        self._login()
+
+    def _login(self):
+        """Log in through 'sessions.json' and store the session token.
+
+        Does nothing when no username is available.  A response without a
+        token only produces a warning.
+        """
+        username, password = self._get_login_info()
+        if username is None:
+            return
+
+        credentials = {
+            'login_id': username,
+            'password': password,
+        }
+        session = self._call_api(
+            'sessions.json', None,
+            'Logging in as %s' % username, post_data=credentials)
+
+        self._token = session.get('token')
+        if self._token:
+            return
+        self.report_warning('Unable to get session token, login has probably failed')
- _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)'
- _TEST = {
+    @staticmethod
+    def dict_selection(dict_obj, preferred_key):
+        """Return dict_obj[preferred_key] when that key is present.
+
+        Otherwise return the first truthy value of the dict, or None when
+        there is none.
+        """
+        if preferred_key in dict_obj:
+            return dict_obj[preferred_key]
+
+        for candidate in dict_obj.values():
+            if candidate:
+                return candidate
+        return None
+
+
+class VikiIE(VikiBaseIE):
+ IE_NAME = 'viki'
+ _VALID_URL = r'%s(?:videos|player)/(?P<id>[0-9]+v)' % VikiBaseIE._VALID_URL_BASE
+ _TESTS = [{
'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14',
'info_dict': {
'id': '1023585v',
@@ -27,70 +112,225 @@ class VikiIE(SubtitlesInfoExtractor):
'age_limit': 13,
},
'skip': 'Blocked in the US',
- }
+ }, {
+ # clip
+ 'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference',
+ 'md5': '86c0b5dbd4d83a6611a79987cc7a1989',
+ 'info_dict': {
+ 'id': '1067139v',
+ 'ext': 'mp4',
+ 'title': "'The Avengers: Age of Ultron' Press Conference",
+ 'description': 'md5:d70b2f9428f5488321bfe1db10d612ea',
+ 'duration': 352,
+ 'timestamp': 1430380829,
+ 'upload_date': '20150430',
+ 'uploader': 'Arirang TV',
+ 'like_count': int,
+ 'age_limit': 0,
+ }
+ }, {
+ 'url': 'http://www.viki.com/videos/1048879v-ankhon-dekhi',
+ 'info_dict': {
+ 'id': '1048879v',
+ 'ext': 'mp4',
+ 'title': 'Ankhon Dekhi',
+ 'duration': 6512,
+ 'timestamp': 1408532356,
+ 'upload_date': '20140820',
+ 'uploader': 'Spuul',
+ 'like_count': int,
+ 'age_limit': 13,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ }
+ }, {
+ # episode
+ 'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1',
+ 'md5': '190f3ef426005ba3a080a63325955bc3',
+ 'info_dict': {
+ 'id': '44699v',
+ 'ext': 'mp4',
+ 'title': 'Boys Over Flowers - Episode 1',
+ 'description': 'md5:52617e4f729c7d03bfd4bcbbb6e946f2',
+ 'duration': 4155,
+ 'timestamp': 1270496524,
+ 'upload_date': '20100405',
+ 'uploader': 'group8',
+ 'like_count': int,
+ 'age_limit': 13,
+ }
+ }, {
+ # youtube external
+ 'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1',
+ 'md5': '216d1afdc0c64d1febc1e9f2bd4b864b',
+ 'info_dict': {
+ 'id': '50562v',
+ 'ext': 'mp4',
+ 'title': 'Poor Nastya [COMPLETE] - Episode 1',
+ 'description': '',
+ 'duration': 607,
+ 'timestamp': 1274949505,
+ 'upload_date': '20101213',
+ 'uploader': 'ad14065n',
+ 'uploader_id': 'ad14065n',
+ 'like_count': int,
+ 'age_limit': 13,
+ }
+ }, {
+ 'url': 'http://www.viki.com/player/44699v',
+ 'only_matching': True,
+ }, {
+ # non-English description
+ 'url': 'http://www.viki.com/videos/158036v-love-in-magic',
+ 'md5': '1713ae35df5a521b31f6dc40730e7c9c',
+ 'info_dict': {
+ 'id': '158036v',
+ 'ext': 'mp4',
+ 'uploader': 'I Planet Entertainment',
+ 'upload_date': '20111122',
+ 'timestamp': 1321985454,
+ 'description': 'md5:44b1e46619df3a072294645c770cef36',
+ 'title': 'Love In Magic',
+ },
+ }]
def _real_extract(self, url):
+        """Extract a single Viki video through the signed JSON API.
+
+        Returns an info dict with metadata, thumbnails and subtitles.  When
+        the streams response contains an 'external' entry the result is a
+        'url_transparent' redirect to that URL; otherwise 'formats' is
+        filled from the per-quality / per-protocol stream entries.
+        """
video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
- title = self._og_search_title(webpage)
- description = self._og_search_description(webpage)
- thumbnail = self._og_search_thumbnail(webpage)
-
- uploader_m = re.search(
- r'<strong>Broadcast Network: </strong>\s*([^<]*)<', webpage)
- if uploader_m is None:
- uploader = None
- else:
- uploader = uploader_m.group(1).strip()
-
- rating_str = self._html_search_regex(
- r'<strong>Rating: </strong>\s*([^<]*)<', webpage,
- 'rating information', default='').strip()
- age_limit = US_RATINGS.get(rating_str)
-
- info_url = 'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id
- info_webpage = self._download_webpage(
- info_url, video_id, note='Downloading info page')
- if re.match(r'\s*<div\s+class="video-error', info_webpage):
- raise ExtractorError(
- 'Video %s is blocked from your location.' % video_id,
- expected=True)
- video_url = self._html_search_regex(
- r'<source[^>]+src="([^"]+)"', info_webpage, 'video URL')
-
- upload_date_str = self._html_search_regex(
- r'"created_at":"([^"]+)"', info_webpage, 'upload date')
- upload_date = (
- unified_strdate(upload_date_str)
- if upload_date_str is not None
- else None
- )
-
- # subtitles
- video_subtitles = self.extract_subtitles(video_id, info_webpage)
- if self._downloader.params.get('listsubtitles', False):
- self._list_available_subtitles(video_id, info_webpage)
- return
+        video = self._call_api(
+            'videos/%s.json' % video_id, video_id, 'Downloading video JSON')
- return {
+        title = self.dict_selection(video.get('titles', {}), 'en')
+        if not title:
+            # No usable title: synthesize one and prefix the container title.
+            # The episode number is validated first, because formatting a
+            # missing number with %d raises TypeError.
+            episode_number = int_or_none(video.get('number'))
+            if video.get('type') == 'episode' and episode_number is not None:
+                title = 'Episode %d' % episode_number
+            else:
+                title = video.get('id') or video_id
+            container_titles = video.get('container', {}).get('titles', {})
+            container_title = self.dict_selection(container_titles, 'en')
+            title = '%s - %s' % (container_title, title)
+
+        description = self.dict_selection(video.get('descriptions', {}), 'en')
+
+        duration = int_or_none(video.get('duration'))
+        timestamp = parse_iso8601(video.get('created_at'))
+        uploader = video.get('author')
+        like_count = int_or_none(video.get('likes', {}).get('count'))
+        age_limit = parse_age_limit(video.get('rating'))
+
+        thumbnails = [{
+            'id': thumbnail_id,
+            'url': thumbnail.get('url'),
+        } for thumbnail_id, thumbnail in video.get('images', {}).items()]
+
+        # Every language gets one signed subtitle URL per supported format.
+        subtitles = {}
+        for subtitle_lang in video.get('subtitle_completions', {}):
+            subtitles[subtitle_lang] = [{
+                'ext': subtitles_format,
+                'url': self._prepare_call(
+                    'videos/%s/subtitles/%s.%s' % (video_id, subtitle_lang, subtitles_format)),
+            } for subtitles_format in ('srt', 'vtt')]
+
+        result = {
'id': video_id,
'title': title,
- 'url': video_url,
'description': description,
- 'thumbnail': thumbnail,
- 'age_limit': age_limit,
+            'duration': duration,
+            'timestamp': timestamp,
'uploader': uploader,
- 'subtitles': video_subtitles,
- 'upload_date': upload_date,
+            'like_count': like_count,
+            'age_limit': age_limit,
+            'thumbnails': thumbnails,
+            'subtitles': subtitles,
}
- def _get_available_subtitles(self, video_id, info_webpage):
- res = {}
- for sturl_html in re.findall(r'<track src="([^"]+)"/>', info_webpage):
- sturl = unescapeHTML(sturl_html)
- m = re.search(r'/(?P<lang>[a-z]+)\.vtt', sturl)
- if not m:
- continue
- res[m.group('lang')] = sturl
- return res
+        streams = self._call_api(
+            'videos/%s/streams.json' % video_id, video_id,
+            'Downloading video streams JSON')
+
+        if 'external' in streams:
+            # Externally hosted video: delegate to the extractor of that URL
+            # while keeping the metadata gathered above.
+            result.update({
+                '_type': 'url_transparent',
+                'url': streams['external']['url'],
+            })
+            return result
+
+        formats = []
+        for format_id, stream_dict in streams.items():
+            # Format ids such as '480p' carry the height; others yield None.
+            height = int_or_none(self._search_regex(
+                r'^(\d+)[pP]$', format_id, 'height', default=None))
+            for protocol, format_dict in stream_dict.items():
+                if format_id == 'm3u8':
+                    # Extend instead of rebinding: assigning the m3u8 list to
+                    # `formats` discarded every format collected so far.
+                    formats.extend(self._extract_m3u8_formats(
+                        format_dict['url'], video_id, 'mp4', m3u8_id='m3u8-%s' % protocol))
+                else:
+                    formats.append({
+                        'url': format_dict['url'],
+                        'format_id': '%s-%s' % (format_id, protocol),
+                        'height': height,
+                    })
+        self._sort_formats(formats)
+
+        result['formats'] = formats
+        return result
+
+
+class VikiChannelIE(VikiBaseIE):
+    """Playlist extractor for Viki containers (tv, news, movies, artists)."""
+
+    IE_NAME = 'viki:channel'
+    _VALID_URL = r'%s(?:tv|news|movies|artists)/(?P<id>[0-9]+c)' % VikiBaseIE._VALID_URL_BASE
+    _TESTS = [{
+        'url': 'http://www.viki.com/tv/50c-boys-over-flowers',
+        'info_dict': {
+            'id': '50c',
+            'title': 'Boys Over Flowers',
+            'description': 'md5:ecd3cff47967fe193cff37c0bec52790',
+        },
+        'playlist_count': 70,
+    }, {
+        'url': 'http://www.viki.com/tv/1354c-poor-nastya-complete',
+        'info_dict': {
+            'id': '1354c',
+            'title': 'Poor Nastya [COMPLETE]',
+            'description': 'md5:05bf5471385aa8b21c18ad450e350525',
+        },
+        'playlist_count': 127,
+    }, {
+        'url': 'http://www.viki.com/news/24569c-showbiz-korea',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.viki.com/movies/22047c-pride-and-prejudice-2005',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.viki.com/artists/2141c-shinee',
+        'only_matching': True,
+    }]
+
+    # Page size requested from the paged container listings.
+    _PER_PAGE = 25
+
+    def _real_extract(self, url):
+        """Return a playlist of every episode, clip and movie of the channel."""
+        channel_id = self._match_id(url)
+
+        channel = self._call_api(
+            'containers/%s.json' % channel_id, channel_id,
+            'Downloading channel JSON')
+
+        title = self.dict_selection(channel['titles'], 'en')
+        description = self.dict_selection(channel['descriptions'], 'en')
+
+        entries = []
+        for video_type in ('episodes', 'clips', 'movies'):
+            # Walk the pages of this video type until the API reports that
+            # there is no next page.
+            page_num = 0
+            has_next = True
+            while has_next:
+                page_num += 1
+                page = self._call_api(
+                    'containers/%s/%s.json?per_page=%d&sort=number&direction=asc&with_paging=true&page=%d'
+                    % (channel_id, video_type, self._PER_PAGE, page_num), channel_id,
+                    'Downloading %s JSON page #%d' % (video_type, page_num))
+                for video in page['response']:
+                    video_url = 'http://www.viki.com/videos/%s' % video['id']
+                    entries.append(self.url_result(video_url, 'Viki'))
+                has_next = bool(page['pagination']['next'])
+
+        return self.playlist_result(entries, channel_id, title, description)
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index 303e81447..f392ccf1c 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -6,27 +6,30 @@ import re
import itertools
from .common import InfoExtractor
-from .subtitles import SubtitlesInfoExtractor
from ..compat import (
compat_HTTPError,
- compat_urllib_parse,
- compat_urllib_request,
compat_urlparse,
)
from ..utils import (
+ encode_dict,
ExtractorError,
InAdvancePagedList,
int_or_none,
RegexNotFoundError,
+ sanitized_Request,
+ smuggle_url,
std_headers,
+ unified_strdate,
unsmuggle_url,
urlencode_postdata,
+ unescapeHTML,
)
class VimeoBaseInfoExtractor(InfoExtractor):
_NETRC_MACHINE = 'vimeo'
_LOGIN_REQUIRED = False
+ _LOGIN_URL = 'https://vimeo.com/log_in'
def _login(self):
(username, password) = self._get_login_info()
@@ -35,23 +38,35 @@ class VimeoBaseInfoExtractor(InfoExtractor):
raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
return
self.report_login()
- login_url = 'https://vimeo.com/log_in'
- webpage = self._download_webpage(login_url, None, False)
- token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token')
- data = urlencode_postdata({
+ webpage = self._download_webpage(self._LOGIN_URL, None, False)
+ token, vuid = self._extract_xsrft_and_vuid(webpage)
+ data = urlencode_postdata(encode_dict({
+ 'action': 'login',
'email': username,
'password': password,
- 'action': 'login',
'service': 'vimeo',
'token': token,
- })
- login_request = compat_urllib_request.Request(login_url, data)
+ }))
+ login_request = sanitized_Request(self._LOGIN_URL, data)
login_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
- login_request.add_header('Cookie', 'xsrft=%s' % token)
+ login_request.add_header('Referer', self._LOGIN_URL)
+ self._set_vimeo_cookie('vuid', vuid)
self._download_webpage(login_request, None, False, 'Wrong login info')
+    def _extract_xsrft_and_vuid(self, webpage):
+        """Return the (xsrft, vuid) values found in webpage.
+
+        Both lookups go through _search_regex without a default.
+        """
+        xsrft_re = r'xsrft\s*[=:]\s*(?P<q>["\'])(?P<xsrft>.+?)(?P=q)'
+        vuid_re = r'["\']vuid["\']\s*:\s*(["\'])(?P<vuid>.+?)\1'
+        xsrft_value = self._search_regex(
+            xsrft_re, webpage, 'login token', group='xsrft')
+        vuid_value = self._search_regex(
+            vuid_re, webpage, 'vuid', group='vuid')
+        return xsrft_value, vuid_value
+
+    def _set_vimeo_cookie(self, name, value):
+        """Set cookie `name` to `value` for the 'vimeo.com' domain."""
+        domain = 'vimeo.com'
+        self._set_cookie(domain, name, value)
-class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
+
+class VimeoIE(VimeoBaseInfoExtractor):
"""Information extractor for vimeo.com."""
# _VALID_URL matches Vimeo URLs
@@ -73,12 +88,12 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
'info_dict': {
'id': '56015672',
'ext': 'mp4',
- "upload_date": "20121220",
- "description": "This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
- "uploader_id": "user7108434",
- "uploader": "Filippo Valsorda",
- "title": "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
- "duration": 10,
+ 'title': "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
+ 'description': 'md5:2d3305bad981a06ff79f027f19865021',
+ 'upload_date': '20121220',
+ 'uploader_id': 'user7108434',
+ 'uploader': 'Filippo Valsorda',
+ 'duration': 10,
},
},
{
@@ -91,7 +106,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
'uploader_id': 'openstreetmapus',
'uploader': 'OpenStreetMap US',
'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography',
- 'description': 'md5:380943ec71b89736ff4bf27183233d09',
+ 'description': 'md5:fd69a7b8d8c34a4e1d2ec2e4afd6ec30',
'duration': 1595,
},
},
@@ -121,7 +136,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
'uploader_id': 'user18948128',
'uploader': 'Jaime Marquínez Ferrándiz',
'duration': 10,
- 'description': 'This is "youtube-dl password protected test video" by Jaime Marquínez Ferrándiz on Vimeo, the home for high quality videos and the people who love them.',
+ 'description': 'This is "youtube-dl password protected test video" by Jaime Marquínez Ferrándiz on Vimeo, the home for high quality videos and the people\u2026',
},
'params': {
'videopassword': 'youtube-dl',
@@ -139,12 +154,12 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
'description': 'md5:8678b246399b070816b12313e8b4eb5c',
'uploader_id': 'atencio',
'uploader': 'Peter Atencio',
+ 'upload_date': '20130927',
'duration': 187,
},
},
{
'url': 'http://vimeo.com/76979871',
- 'md5': '3363dd6ffebe3784d56f4132317fd446',
'note': 'Video with subtitles',
'info_dict': {
'id': '76979871',
@@ -169,25 +184,48 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
'uploader_id': 'user28849593',
},
},
+ {
+ 'url': 'https://vimeo.com/109815029',
+ 'note': 'Video not completely processed, "failed" seed status',
+ 'only_matching': True,
+ },
+ {
+ 'url': 'https://vimeo.com/groups/travelhd/videos/22439234',
+ 'only_matching': True,
+ },
]
+    @staticmethod
+    def _extract_vimeo_url(url, webpage):
+        """Return the URL of a Vimeo player embedded in webpage, or None.
+
+        An iframe player URL is HTML-unescaped and smuggled together with
+        `url` as its Referer; a moogaloop.swf embed URL is returned as is.
+        """
+        # Embedded (iframe) Vimeo player
+        iframe_match = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
+        if iframe_match is not None:
+            return smuggle_url(
+                unescapeHTML(iframe_match.group('url')), {'Referer': url})
+        # Embedded (swf embed) Vimeo player
+        swf_match = re.search(
+            r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
+        return swf_match.group(1) if swf_match is not None else None
+
def _verify_video_password(self, url, video_id, webpage):
password = self._downloader.params.get('videopassword', None)
if password is None:
- raise ExtractorError('This video is protected by a password, use the --video-password option')
- token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token')
- data = compat_urllib_parse.urlencode({
+ raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True)
+ token, vuid = self._extract_xsrft_and_vuid(webpage)
+ data = urlencode_postdata(encode_dict({
'password': password,
'token': token,
- })
- # I didn't manage to use the password with https
- if url.startswith('https'):
- pass_url = url.replace('https', 'http')
- else:
- pass_url = url
- password_request = compat_urllib_request.Request(pass_url + '/password', data)
+ }))
+ if url.startswith('http://'):
+ # vimeo only supports https now, but the user can give an http url
+ url = url.replace('http://', 'https://')
+ password_request = sanitized_Request(url + '/password', data)
password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
- password_request.add_header('Cookie', 'xsrft=%s' % token)
+ password_request.add_header('Referer', url)
+ self._set_vimeo_cookie('vuid', vuid)
return self._download_webpage(
password_request, video_id,
'Verifying the password', 'Wrong password')
@@ -196,9 +234,9 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
password = self._downloader.params.get('videopassword', None)
if password is None:
raise ExtractorError('This video is protected by a password, use the --video-password option')
- data = compat_urllib_parse.urlencode({'password': password})
+ data = urlencode_postdata(encode_dict({'password': password}))
pass_url = url + '/check-password'
- password_request = compat_urllib_request.Request(pass_url, data)
+ password_request = sanitized_Request(pass_url, data)
password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
return self._download_json(
password_request, video_id,
@@ -222,10 +260,12 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
video_id = mobj.group('id')
orig_url = url
if mobj.group('pro') or mobj.group('player'):
- url = 'http://player.vimeo.com/video/' + video_id
+ url = 'https://player.vimeo.com/video/' + video_id
+ else:
+ url = 'https://vimeo.com/' + video_id
# Retrieve video webpage to extract further information
- request = compat_urllib_request.Request(url, None, headers)
+ request = sanitized_Request(url, None, headers)
try:
webpage = self._download_webpage(request, video_id)
except ExtractorError as ee:
@@ -244,11 +284,31 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
# and latter we extract those that are Vimeo specific.
self.report_extraction(video_id)
+ vimeo_config = self._search_regex(
+ r'vimeo\.config\s*=\s*(?:({.+?})|_extend\([^,]+,\s+({.+?})\));', webpage,
+ 'vimeo config', default=None)
+ if vimeo_config:
+ seed_status = self._parse_json(vimeo_config, video_id).get('seed_status', {})
+ if seed_status.get('state') == 'failed':
+ raise ExtractorError(
+ '%s said: %s' % (self.IE_NAME, seed_status['title']),
+ expected=True)
+
# Extract the config JSON
try:
try:
config_url = self._html_search_regex(
- r' data-config-url="(.+?)"', webpage, 'config URL')
+ r' data-config-url="(.+?)"', webpage,
+ 'config URL', default=None)
+ if not config_url:
+ # Sometimes new react-based page is served instead of old one that require
+ # different config URL extraction approach (see
+ # https://github.com/rg3/youtube-dl/pull/7209)
+ vimeo_clip_page_config = self._search_regex(
+ r'vimeo\.clip_page_config\s*=\s*({.+?});', webpage,
+ 'vimeo clip page config')
+ config_url = self._parse_json(
+ vimeo_clip_page_config, video_id)['player']['config_url']
config_json = self._download_webpage(config_url, video_id)
config = json.loads(config_json)
except RegexNotFoundError:
@@ -267,8 +327,11 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
raise ExtractorError('The author has restricted the access to this video, try with the "--referer" option')
if re.search(r'<form[^>]+?id="pw_form"', webpage) is not None:
+ if data and '_video_password_verified' in data:
+ raise ExtractorError('video password verification failed!')
self._verify_video_password(url, video_id, webpage)
- return self._real_extract(url)
+ return self._real_extract(
+ smuggle_url(url, {'_video_password_verified': 'verified'}))
else:
raise ExtractorError('Unable to extract info section',
cause=e)
@@ -314,9 +377,9 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
# Extract upload date
video_upload_date = None
- mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
+ mobj = re.search(r'<time[^>]+datetime="([^"]+)"', webpage)
if mobj is not None:
- video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
+ video_upload_date = unified_strdate(mobj.group(1))
try:
view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count'))
@@ -328,52 +391,38 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
like_count = None
comment_count = None
- # Vimeo specific: extract request signature and timestamp
- sig = config['request']['signature']
- timestamp = config['request']['timestamp']
-
- # Vimeo specific: extract video codec and quality information
- # First consider quality, then codecs, then take everything
- codecs = [('vp6', 'flv'), ('vp8', 'flv'), ('h264', 'mp4')]
- files = {'hd': [], 'sd': [], 'other': []}
- config_files = config["video"].get("files") or config["request"].get("files")
- for codec_name, codec_extension in codecs:
- for quality in config_files.get(codec_name, []):
- format_id = '-'.join((codec_name, quality)).lower()
- key = quality if quality in files else 'other'
- video_url = None
- if isinstance(config_files[codec_name], dict):
- file_info = config_files[codec_name][quality]
- video_url = file_info.get('url')
- else:
- file_info = {}
- if video_url is None:
- video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
- % (video_id, sig, timestamp, quality, codec_name.upper())
-
- files[key].append({
- 'ext': codec_extension,
- 'url': video_url,
- 'format_id': format_id,
- 'width': file_info.get('width'),
- 'height': file_info.get('height'),
- })
formats = []
- for key in ('other', 'sd', 'hd'):
- formats += files[key]
- if len(formats) == 0:
- raise ExtractorError('No known codec found')
+ config_files = config['video'].get('files') or config['request'].get('files', {})
+ for f in config_files.get('progressive', []):
+ video_url = f.get('url')
+ if not video_url:
+ continue
+ formats.append({
+ 'url': video_url,
+ 'format_id': 'http-%s' % f.get('quality'),
+ 'width': int_or_none(f.get('width')),
+ 'height': int_or_none(f.get('height')),
+ 'fps': int_or_none(f.get('fps')),
+ 'tbr': int_or_none(f.get('bitrate')),
+ })
+ m3u8_url = config_files.get('hls', {}).get('url')
+ if m3u8_url:
+ m3u8_formats = self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4', 'm3u8_native', 0, 'hls', fatal=False)
+ if m3u8_formats:
+ formats.extend(m3u8_formats)
+ # Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps
+ # at the same time without actual units specified. This lead to wrong sorting.
+ self._sort_formats(formats, field_preference=('height', 'width', 'fps', 'format_id'))
subtitles = {}
text_tracks = config['request'].get('text_tracks')
if text_tracks:
for tt in text_tracks:
- subtitles[tt['lang']] = 'http://vimeo.com' + tt['url']
-
- video_subtitles = self.extract_subtitles(video_id, subtitles)
- if self._downloader.params.get('listsubtitles', False):
- self._list_available_subtitles(video_id, subtitles)
- return
+ subtitles[tt['lang']] = [{
+ 'ext': 'vtt',
+ 'url': 'https://vimeo.com' + tt['url'],
+ }]
return {
'id': video_id,
@@ -389,18 +438,20 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
'view_count': view_count,
'like_count': like_count,
'comment_count': comment_count,
- 'subtitles': video_subtitles,
+ 'subtitles': subtitles,
}
-class VimeoChannelIE(InfoExtractor):
+class VimeoChannelIE(VimeoBaseInfoExtractor):
IE_NAME = 'vimeo:channel'
- _VALID_URL = r'https?://vimeo\.com/channels/(?P<id>[^/?#]+)/?(?:$|[?#])'
+ _VALID_URL = r'https://vimeo\.com/channels/(?P<id>[^/?#]+)/?(?:$|[?#])'
_MORE_PAGES_INDICATOR = r'<a.+?rel="next"'
+ _TITLE = None
_TITLE_RE = r'<link rel="alternate"[^>]+?title="(.*?)"'
_TESTS = [{
- 'url': 'http://vimeo.com/channels/tributes',
+ 'url': 'https://vimeo.com/channels/tributes',
'info_dict': {
+ 'id': 'tributes',
'title': 'Vimeo Tributes',
},
'playlist_mincount': 25,
@@ -410,7 +461,7 @@ class VimeoChannelIE(InfoExtractor):
return '%s/videos/page:%d/' % (base_url, pagenum)
def _extract_list_title(self, webpage):
- return self._html_search_regex(self._TITLE_RE, webpage, 'list title')
+ return self._TITLE or self._html_search_regex(self._TITLE_RE, webpage, 'list title')
def _login_list_password(self, page_url, list_id, webpage):
login_form = self._search_regex(
@@ -422,28 +473,24 @@ class VimeoChannelIE(InfoExtractor):
password = self._downloader.params.get('videopassword', None)
if password is None:
raise ExtractorError('This album is protected by a password, use the --video-password option', expected=True)
- fields = dict(re.findall(r'''(?x)<input\s+
- type="hidden"\s+
- name="([^"]+)"\s+
- value="([^"]*)"
- ''', login_form))
- token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token')
+ fields = self._hidden_inputs(login_form)
+ token, vuid = self._extract_xsrft_and_vuid(webpage)
fields['token'] = token
fields['password'] = password
- post = compat_urllib_parse.urlencode(fields)
+ post = urlencode_postdata(encode_dict(fields))
password_path = self._search_regex(
r'action="([^"]+)"', login_form, 'password URL')
password_url = compat_urlparse.urljoin(page_url, password_path)
- password_request = compat_urllib_request.Request(password_url, post)
+ password_request = sanitized_Request(password_url, post)
password_request.add_header('Content-type', 'application/x-www-form-urlencoded')
- self._set_cookie('vimeo.com', 'xsrft', token)
+ self._set_vimeo_cookie('vuid', vuid)
+ self._set_vimeo_cookie('xsrft', token)
return self._download_webpage(
password_request, list_id,
'Verifying the password', 'Wrong password')
- def _extract_videos(self, list_id, base_url):
- video_ids = []
+ def _title_and_entries(self, list_id, base_url):
for pagenum in itertools.count(1):
page_url = self._page_url(base_url, pagenum)
webpage = self._download_webpage(
@@ -452,33 +499,34 @@ class VimeoChannelIE(InfoExtractor):
if pagenum == 1:
webpage = self._login_list_password(page_url, list_id, webpage)
+ yield self._extract_list_title(webpage)
+
+ for video_id in re.findall(r'id="clip_(\d+?)"', webpage):
+ yield self.url_result('https://vimeo.com/%s' % video_id, 'Vimeo')
- video_ids.extend(re.findall(r'id="clip_(\d+?)"', webpage))
if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
break
- entries = [self.url_result('http://vimeo.com/%s' % video_id, 'Vimeo')
- for video_id in video_ids]
- return {'_type': 'playlist',
- 'id': list_id,
- 'title': self._extract_list_title(webpage),
- 'entries': entries,
- }
+    def _extract_videos(self, list_id, base_url):
+        """Build the playlist result for a paged video list."""
+        stream = self._title_and_entries(list_id, base_url)
+        # The first item taken from the generator is used as the playlist
+        # title; the rest of the generator is passed on as the entries.
+        playlist_title = next(stream)
+        return self.playlist_result(stream, list_id, playlist_title)
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
channel_id = mobj.group('id')
- return self._extract_videos(channel_id, 'http://vimeo.com/channels/%s' % channel_id)
+ return self._extract_videos(channel_id, 'https://vimeo.com/channels/%s' % channel_id)
class VimeoUserIE(VimeoChannelIE):
IE_NAME = 'vimeo:user'
- _VALID_URL = r'https?://vimeo\.com/(?![0-9]+(?:$|[?#/]))(?P<name>[^/]+)(?:/videos|[#?]|$)'
+ _VALID_URL = r'https://vimeo\.com/(?!(?:[0-9]+|watchlater)(?:$|[?#/]))(?P<name>[^/]+)(?:/videos|[#?]|$)'
_TITLE_RE = r'<a[^>]+?class="user">([^<>]+?)</a>'
_TESTS = [{
- 'url': 'http://vimeo.com/nkistudio/videos',
+ 'url': 'https://vimeo.com/nkistudio/videos',
'info_dict': {
'title': 'Nki',
+ 'id': 'nkistudio',
},
'playlist_mincount': 66,
}]
@@ -486,16 +534,17 @@ class VimeoUserIE(VimeoChannelIE):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
name = mobj.group('name')
- return self._extract_videos(name, 'http://vimeo.com/%s' % name)
+ return self._extract_videos(name, 'https://vimeo.com/%s' % name)
class VimeoAlbumIE(VimeoChannelIE):
IE_NAME = 'vimeo:album'
- _VALID_URL = r'https?://vimeo\.com/album/(?P<id>\d+)'
+ _VALID_URL = r'https://vimeo\.com/album/(?P<id>\d+)'
_TITLE_RE = r'<header id="page_header">\n\s*<h1>(.*?)</h1>'
_TESTS = [{
- 'url': 'http://vimeo.com/album/2632481',
+ 'url': 'https://vimeo.com/album/2632481',
'info_dict': {
+ 'id': '2632481',
'title': 'Staff Favorites: November 2013',
},
'playlist_mincount': 13,
@@ -517,15 +566,16 @@ class VimeoAlbumIE(VimeoChannelIE):
def _real_extract(self, url):
album_id = self._match_id(url)
- return self._extract_videos(album_id, 'http://vimeo.com/album/%s' % album_id)
+ return self._extract_videos(album_id, 'https://vimeo.com/album/%s' % album_id)
class VimeoGroupsIE(VimeoAlbumIE):
IE_NAME = 'vimeo:group'
- _VALID_URL = r'(?:https?://)?vimeo\.com/groups/(?P<name>[^/]+)'
+ _VALID_URL = r'https://vimeo\.com/groups/(?P<name>[^/]+)(?:/(?!videos?/\d+)|$)'
_TESTS = [{
- 'url': 'http://vimeo.com/groups/rolexawards',
+ 'url': 'https://vimeo.com/groups/rolexawards',
'info_dict': {
+ 'id': 'rolexawards',
'title': 'Rolex Awards for Enterprise',
},
'playlist_mincount': 73,
@@ -537,13 +587,13 @@ class VimeoGroupsIE(VimeoAlbumIE):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
name = mobj.group('name')
- return self._extract_videos(name, 'http://vimeo.com/groups/%s' % name)
+ return self._extract_videos(name, 'https://vimeo.com/groups/%s' % name)
class VimeoReviewIE(InfoExtractor):
IE_NAME = 'vimeo:review'
IE_DESC = 'Review pages on vimeo'
- _VALID_URL = r'https?://vimeo\.com/[^/]+/review/(?P<id>[^/]+)'
+ _VALID_URL = r'https://vimeo\.com/[^/]+/review/(?P<id>[^/]+)'
_TESTS = [{
'url': 'https://vimeo.com/user21297594/review/75524534/3c257a1b5d',
'md5': 'c507a72f780cacc12b2248bb4006d253',
@@ -555,7 +605,7 @@ class VimeoReviewIE(InfoExtractor):
}
}, {
'note': 'video player needs Referer',
- 'url': 'http://vimeo.com/user22258446/review/91613211/13f927e053',
+ 'url': 'https://vimeo.com/user22258446/review/91613211/13f927e053',
'md5': '6295fdab8f4bf6a002d058b2c6dce276',
'info_dict': {
'id': '91613211',
@@ -574,14 +624,14 @@ class VimeoReviewIE(InfoExtractor):
return self.url_result(player_url, 'Vimeo', video_id)
-class VimeoWatchLaterIE(VimeoBaseInfoExtractor, VimeoChannelIE):
+class VimeoWatchLaterIE(VimeoChannelIE):
IE_NAME = 'vimeo:watchlater'
IE_DESC = 'Vimeo watch later list, "vimeowatchlater" keyword (requires authentication)'
- _VALID_URL = r'https?://vimeo\.com/home/watchlater|:vimeowatchlater'
+ _VALID_URL = r'https://vimeo\.com/(?:home/)?watchlater|:vimeowatchlater'
+ _TITLE = 'Watch Later'
_LOGIN_REQUIRED = True
- _TITLE_RE = r'href="/home/watchlater".*?>(.*?)<'
_TESTS = [{
- 'url': 'http://vimeo.com/home/watchlater',
+ 'url': 'https://vimeo.com/watchlater',
'only_matching': True,
}]
@@ -590,24 +640,25 @@ class VimeoWatchLaterIE(VimeoBaseInfoExtractor, VimeoChannelIE):
def _page_url(self, base_url, pagenum):
url = '%s/page:%d/' % (base_url, pagenum)
- request = compat_urllib_request.Request(url)
+ request = sanitized_Request(url)
# Set the header to get a partial html page with the ids,
# the normal page doesn't contain them.
request.add_header('X-Requested-With', 'XMLHttpRequest')
return request
def _real_extract(self, url):
- return self._extract_videos('watchlater', 'https://vimeo.com/home/watchlater')
+ return self._extract_videos('watchlater', 'https://vimeo.com/watchlater')
class VimeoLikesIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes/?(?:$|[?#]|sort:)'
+ _VALID_URL = r'https://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes/?(?:$|[?#]|sort:)'
IE_NAME = 'vimeo:likes'
IE_DESC = 'Vimeo user likes'
_TEST = {
'url': 'https://vimeo.com/user755559/likes/',
'playlist_mincount': 293,
"info_dict": {
+ 'id': 'user755559_likes',
"description": "See all the videos urza likes",
"title": 'Videos urza likes',
},
@@ -628,8 +679,8 @@ class VimeoLikesIE(InfoExtractor):
description = self._html_search_meta('description', webpage)
def _get_page(idx):
- page_url = '%s//vimeo.com/user%s/likes/page:%d/sort:date' % (
- self.http_scheme(), user_id, idx + 1)
+ page_url = 'https://vimeo.com/user%s/likes/page:%d/sort:date' % (
+ user_id, idx + 1)
webpage = self._download_webpage(
page_url, user_id,
note='Downloading page %d/%d' % (idx + 1, page_count))
diff --git a/youtube_dl/extractor/vimple.py b/youtube_dl/extractor/vimple.py
index ee3d86117..92321d66e 100644
--- a/youtube_dl/extractor/vimple.py
+++ b/youtube_dl/extractor/vimple.py
@@ -1,75 +1,60 @@
-# coding: utf-8
from __future__ import unicode_literals
-import base64
-import re
-import xml.etree.ElementTree
-import zlib
-
from .common import InfoExtractor
from ..utils import int_or_none
-class VimpleIE(InfoExtractor):
- IE_DESC = 'Vimple.ru'
- _VALID_URL = r'https?://(player.vimple.ru/iframe|vimple.ru)/(?P<id>[a-f0-9]{10,})'
+class SprutoBaseIE(InfoExtractor):
+ def _extract_spruto(self, spruto, video_id):
+ playlist = spruto['playlist'][0]
+ title = playlist['title']
+ video_id = playlist.get('videoId') or video_id
+ thumbnail = playlist.get('posterUrl') or playlist.get('thumbnailUrl')
+ duration = int_or_none(playlist.get('duration'))
+
+ formats = [{
+ 'url': f['url'],
+ } for f in playlist['video']]
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'formats': formats,
+ }
+
+
+class VimpleIE(SprutoBaseIE):
+ IE_DESC = 'Vimple - one-click video hosting'
+ _VALID_URL = r'https?://(?:player\.vimple\.ru/iframe|vimple\.ru)/(?P<id>[\da-f-]{32,36})'
_TESTS = [
{
'url': 'http://vimple.ru/c0f6b1687dcd4000a97ebe70068039cf',
'md5': '2e750a330ed211d3fd41821c6ad9a279',
'info_dict': {
- 'id': 'c0f6b1687dcd4000a97ebe70068039cf',
+ 'id': 'c0f6b168-7dcd-4000-a97e-be70068039cf',
'ext': 'mp4',
'title': 'Sunset',
'duration': 20,
'thumbnail': 're:https?://.*?\.jpg',
},
- },
+ }, {
+ 'url': 'http://player.vimple.ru/iframe/52e1beec-1314-4a83-aeac-c61562eadbf9',
+ 'only_matching': True,
+ }
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
- iframe_url = 'http://player.vimple.ru/iframe/%s' % video_id
+ video_id = self._match_id(url)
- iframe = self._download_webpage(
- iframe_url, video_id,
- note='Downloading iframe', errnote='unable to fetch iframe')
- player_url = self._html_search_regex(
- r'"(http://player.vimple.ru/flash/.+?)"', iframe, 'player url')
+ webpage = self._download_webpage(
+ 'http://player.vimple.ru/iframe/%s' % video_id, video_id)
- player = self._request_webpage(
- player_url, video_id, note='Downloading swf player').read()
+ spruto = self._parse_json(
+ self._search_regex(
+ r'sprutoData\s*:\s*({.+?}),\r\n', webpage, 'spruto data'),
+ video_id)
- player = zlib.decompress(player[8:])
-
- xml_pieces = re.findall(b'([a-zA-Z0-9 =+/]{500})', player)
- xml_pieces = [piece[1:-1] for piece in xml_pieces]
-
- xml_data = b''.join(xml_pieces)
- xml_data = base64.b64decode(xml_data)
-
- xml_data = xml.etree.ElementTree.fromstring(xml_data)
-
- video = xml_data.find('Video')
- quality = video.get('quality')
- q_tag = video.find(quality.capitalize())
-
- formats = [
- {
- 'url': q_tag.get('url'),
- 'tbr': int(q_tag.get('bitrate')),
- 'filesize': int(q_tag.get('filesize')),
- 'format_id': quality,
- },
- ]
-
- return {
- 'id': video_id,
- 'title': video.find('Title').text,
- 'formats': formats,
- 'thumbnail': video.find('Poster').get('url'),
- 'duration': int_or_none(video.get('duration')),
- 'webpage_url': video.find('Share').get('videoPageUrl'),
- }
+ return self._extract_spruto(spruto, video_id)
diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py
index 0b58fe0fe..cb2a4b0b5 100644
--- a/youtube_dl/extractor/vine.py
+++ b/youtube_dl/extractor/vine.py
@@ -1,16 +1,19 @@
+# coding: utf-8
from __future__ import unicode_literals
import re
-import json
import itertools
from .common import InfoExtractor
-from ..utils import unified_strdate
+from ..utils import (
+ int_or_none,
+ unified_strdate,
+)
class VineIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?vine\.co/v/(?P<id>\w+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?vine\.co/(?:v|oembed)/(?P<id>\w+)'
+ _TESTS = [{
'url': 'https://vine.co/v/b9KOOWX7HUx',
'md5': '2f36fed6235b16da96ce9b4dc890940d',
'info_dict': {
@@ -18,42 +21,97 @@ class VineIE(InfoExtractor):
'ext': 'mp4',
'title': 'Chicken.',
'alt_title': 'Vine by Jack Dorsey',
- 'description': 'Chicken.',
'upload_date': '20130519',
'uploader': 'Jack Dorsey',
'uploader_id': '76',
+ 'like_count': int,
+ 'comment_count': int,
+ 'repost_count': int,
},
- }
+ }, {
+ 'url': 'https://vine.co/v/MYxVapFvz2z',
+ 'md5': '7b9a7cbc76734424ff942eb52c8f1065',
+ 'info_dict': {
+ 'id': 'MYxVapFvz2z',
+ 'ext': 'mp4',
+ 'title': 'Fuck Da Police #Mikebrown #justice #ferguson #prayforferguson #protesting #NMOS14',
+ 'alt_title': 'Vine by Mars Ruiz',
+ 'upload_date': '20140815',
+ 'uploader': 'Mars Ruiz',
+ 'uploader_id': '1102363502380728320',
+ 'like_count': int,
+ 'comment_count': int,
+ 'repost_count': int,
+ },
+ }, {
+ 'url': 'https://vine.co/v/bxVjBbZlPUH',
+ 'md5': 'ea27decea3fa670625aac92771a96b73',
+ 'info_dict': {
+ 'id': 'bxVjBbZlPUH',
+ 'ext': 'mp4',
+ 'title': '#mw3 #ac130 #killcam #angelofdeath',
+ 'alt_title': 'Vine by Z3k3',
+ 'upload_date': '20130430',
+ 'uploader': 'Z3k3',
+ 'uploader_id': '936470460173008896',
+ 'like_count': int,
+ 'comment_count': int,
+ 'repost_count': int,
+ },
+ }, {
+ 'url': 'https://vine.co/oembed/MYxVapFvz2z.json',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://vine.co/v/e192BnZnZ9V',
+ 'info_dict': {
+ 'id': 'e192BnZnZ9V',
+ 'ext': 'mp4',
+ 'title': 'ยิ้ม~ เขิน~ อาย~ น่าร้ากอ้ะ >//< @n_whitewo @orlameena #lovesicktheseries #lovesickseason2',
+ 'alt_title': 'Vine by Pimry_zaa',
+ 'upload_date': '20150705',
+ 'uploader': 'Pimry_zaa',
+ 'uploader_id': '1135760698325307392',
+ 'like_count': int,
+ 'comment_count': int,
+ 'repost_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)
- data = json.loads(self._html_search_regex(
- r'window\.POST_DATA = { %s: ({.+?}) }' % video_id, webpage, 'vine data'))
+ data = self._parse_json(
+ self._search_regex(
+ r'window\.POST_DATA\s*=\s*{\s*%s\s*:\s*({.+?})\s*};\s*</script>' % video_id,
+ webpage, 'vine data'),
+ video_id)
formats = [{
- 'url': data['videoLowURL'],
- 'ext': 'mp4',
- 'format_id': 'low',
- }, {
- 'url': data['videoUrl'],
- 'ext': 'mp4',
- 'format_id': 'standard',
- }]
+ 'format_id': '%(format)s-%(rate)s' % f,
+ 'vcodec': f.get('format'),
+ 'quality': f.get('rate'),
+ 'url': f['videoUrl'],
+ } for f in data['videoUrls'] if f.get('videoUrl')]
+
+ self._sort_formats(formats)
+
+ username = data.get('username')
return {
'id': video_id,
- 'title': self._og_search_title(webpage),
- 'alt_title': self._og_search_description(webpage),
- 'description': data['description'],
- 'thumbnail': data['thumbnailUrl'],
- 'upload_date': unified_strdate(data['created']),
- 'uploader': data['username'],
- 'uploader_id': data['userIdStr'],
- 'like_count': data['likes']['count'],
- 'comment_count': data['comments']['count'],
- 'repost_count': data['reposts']['count'],
+ 'title': data.get('description') or self._og_search_title(webpage),
+ 'alt_title': 'Vine by %s' % username if username else self._og_search_description(webpage, default=None),
+ 'thumbnail': data.get('thumbnailUrl'),
+ 'upload_date': unified_strdate(data.get('created')),
+ 'uploader': username,
+ 'uploader_id': data.get('userIdStr'),
+ 'like_count': int_or_none(data.get('likes', {}).get('count')),
+ 'comment_count': int_or_none(data.get('comments', {}).get('count')),
+ 'repost_count': int_or_none(data.get('reposts', {}).get('count')),
'formats': formats,
}
diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py
index 81e02a624..d99a42a9f 100644
--- a/youtube_dl/extractor/vk.py
+++ b/youtube_dl/extractor/vk.py
@@ -8,19 +8,32 @@ from .common import InfoExtractor
from ..compat import (
compat_str,
compat_urllib_parse,
- compat_urllib_request,
)
from ..utils import (
ExtractorError,
orderedSet,
+ sanitized_Request,
+ str_to_int,
unescapeHTML,
unified_strdate,
)
+from .vimeo import VimeoIE
class VKIE(InfoExtractor):
- IE_NAME = 'vk.com'
- _VALID_URL = r'https?://(?:m\.)?vk\.com/(?:video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)|(?:.+?\?.*?z=)?video(?P<videoid>[^s].*?)(?:\?|%2F|$))'
+ IE_NAME = 'vk'
+ IE_DESC = 'VK'
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:
+ (?:m\.)?vk\.com/video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)|
+ (?:
+ (?:m\.)?vk\.com/(?:.+?\?.*?z=)?video|
+ (?:www\.)?biqle\.ru/watch/
+ )
+ (?P<videoid>[^s].*?)(?:\?(?:.*\blist=(?P<list_id>[\da-f]+))?|%2F|$)
+ )
+ '''
_NETRC_MACHINE = 'vk'
_TESTS = [
@@ -31,9 +44,10 @@ class VKIE(InfoExtractor):
'id': '162222515',
'ext': 'flv',
'title': 'ProtivoGunz - Хуёвая песня',
- 'uploader': 're:Noize MC.*',
+ 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*',
'duration': 195,
'upload_date': '20120212',
+ 'view_count': int,
},
},
{
@@ -45,7 +59,8 @@ class VKIE(InfoExtractor):
'uploader': 'Tom Cruise',
'title': 'No name',
'duration': 9,
- 'upload_date': '20130721'
+ 'upload_date': '20130721',
+ 'view_count': int,
}
},
{
@@ -59,6 +74,7 @@ class VKIE(InfoExtractor):
'title': 'Lin Dan',
'duration': 101,
'upload_date': '20120730',
+ 'view_count': int,
}
},
{
@@ -73,7 +89,8 @@ class VKIE(InfoExtractor):
'uploader': 'Триллеры',
'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]',
'duration': 8352,
- 'upload_date': '20121218'
+ 'upload_date': '20121218',
+ 'view_count': int,
},
'skip': 'Requires vk account credentials',
},
@@ -100,14 +117,54 @@ class VKIE(InfoExtractor):
'title': 'Книга Илая',
'duration': 6771,
'upload_date': '20140626',
+ 'view_count': int,
},
'skip': 'Only works from Russia',
},
{
+ # video (removed?) only available with list id
+ 'url': 'https://vk.com/video30481095_171201961?list=8764ae2d21f14088d4',
+ 'md5': '091287af5402239a1051c37ec7b92913',
+ 'info_dict': {
+ 'id': '171201961',
+ 'ext': 'mp4',
+ 'title': 'ТюменцевВВ_09.07.2015',
+ 'uploader': 'Anton Ivanov',
+ 'duration': 109,
+ 'upload_date': '20150709',
+ 'view_count': int,
+ },
+ },
+ {
+ # youtube embed
+ 'url': 'https://vk.com/video276849682_170681728',
+ 'info_dict': {
+ 'id': 'V3K4mi0SYkc',
+ 'ext': 'mp4',
+ 'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate",
+ 'description': 'md5:bf9c26cfa4acdfb146362682edd3827a',
+ 'duration': 179,
+ 'upload_date': '20130116',
+ 'uploader': "Children's Joy Foundation",
+ 'uploader_id': 'thecjf',
+ 'view_count': int,
+ },
+ },
+ {
# removed video, just testing that we match the pattern
'url': 'http://vk.com/feed?z=video-43215063_166094326%2Fbb50cacd3177146d7a',
'only_matching': True,
},
+ {
+ # age restricted video, requires vk account credentials
+ 'url': 'https://vk.com/video205387401_164765225',
+ 'only_matching': True,
+ },
+ {
+ # vk wrapper
+ 'url': 'http://www.biqle.ru/watch/847655_160197695',
+ 'only_matching': True,
+ }
]
def _login(self):
@@ -115,20 +172,25 @@ class VKIE(InfoExtractor):
if username is None:
return
- login_form = {
- 'act': 'login',
- 'role': 'al_frame',
- 'expire': '1',
- 'email': username,
- 'pass': password,
- }
+ login_page = self._download_webpage(
+ 'https://vk.com', None, 'Downloading login page')
+
+ login_form = self._hidden_inputs(login_page)
- request = compat_urllib_request.Request('https://login.vk.com/?act=login',
- compat_urllib_parse.urlencode(login_form).encode('utf-8'))
- login_page = self._download_webpage(request, None, note='Logging in as %s' % username)
+ login_form.update({
+ 'email': username.encode('cp1251'),
+ 'pass': password.encode('cp1251'),
+ })
+
+ request = sanitized_Request(
+ 'https://login.vk.com/?act=login',
+ compat_urllib_parse.urlencode(login_form).encode('utf-8'))
+ login_page = self._download_webpage(
+ request, None, note='Logging in as %s' % username)
if re.search(r'onLoginFailed', login_page):
- raise ExtractorError('Unable to login, incorrect username and/or password', expected=True)
+ raise ExtractorError(
+ 'Unable to login, incorrect username and/or password', expected=True)
def _real_initialize(self):
self._login()
@@ -140,9 +202,26 @@ class VKIE(InfoExtractor):
if not video_id:
video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id'))
- info_url = 'http://vk.com/al_video.php?act=show&al=1&video=%s' % video_id
+ info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id
+
+ # Some videos (removed?) can only be downloaded with list id specified
+ list_id = mobj.group('list_id')
+ if list_id:
+ info_url += '&list=%s' % list_id
+
info_page = self._download_webpage(info_url, video_id)
+ error_message = self._html_search_regex(
+ r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>',
+ info_page, 'error message', default=None)
+ if error_message:
+ raise ExtractorError(error_message, expected=True)
+
+ if re.search(r'<!>/login\.php\?.*\bact=security_check', info_page):
+ raise ExtractorError(
+ 'You are trying to log in from an unusual location. You should confirm ownership at vk.com to log in with this IP.',
+ expected=True)
+
ERRORS = {
r'>Видеозапись .*? была изъята из публичного доступа в связи с обращением правообладателя.<':
'Video %s has been removed from public access due to rightholder complaint.',
@@ -152,17 +231,28 @@ class VKIE(InfoExtractor):
'use --username and --password options to provide account credentials.',
r'<!>Unknown error':
- 'Video %s does not exist.'
+ 'Video %s does not exist.',
+
+ r'<!>Видео временно недоступно':
+ 'Video %s is temporarily unavailable.',
+
+ r'<!>Access denied':
+ 'Access denied to video %s.',
}
for error_re, error_msg in ERRORS.items():
if re.search(error_re, info_page):
raise ExtractorError(error_msg % video_id, expected=True)
- m_yt = re.search(r'src="(http://www.youtube.com/.*?)"', info_page)
- if m_yt is not None:
- self.to_screen('Youtube video detected')
- return self.url_result(m_yt.group(1), 'Youtube')
+ youtube_url = self._search_regex(
+ r'<iframe[^>]+src="((?:https?:)?//www.youtube.com/embed/[^"]+)"',
+ info_page, 'youtube iframe', default=None)
+ if youtube_url:
+ return self.url_result(youtube_url, 'Youtube')
+
+ vimeo_url = VimeoIE._extract_vimeo_url(url, info_page)
+ if vimeo_url is not None:
+ return self.url_result(vimeo_url)
m_rutube = re.search(
r'\ssrc="((?:https?:)?//rutube\.ru\\?/video\\?/embed(?:.*?))\\?"', info_page)
@@ -172,25 +262,33 @@ class VKIE(InfoExtractor):
m_rutube.group(1).replace('\\', ''))
return self.url_result(rutube_url)
- m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.*?});', info_page)
+ m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.+?});', info_page)
if m_opts:
- m_opts_url = re.search(r"url\s*:\s*'([^']+)", m_opts.group(1))
+ m_opts_url = re.search(r"url\s*:\s*'((?!/\b)[^']+)", m_opts.group(1))
if m_opts_url:
opts_url = m_opts_url.group(1)
if opts_url.startswith('//'):
opts_url = 'http:' + opts_url
return self.url_result(opts_url)
- data_json = self._search_regex(r'var vars = ({.*?});', info_page, 'vars')
+ data_json = self._search_regex(r'var\s+vars\s*=\s*({.+?});', info_page, 'vars')
data = json.loads(data_json)
# Extract upload date
upload_date = None
- mobj = re.search(r'id="mv_date_wrap".*?Added ([a-zA-Z]+ [0-9]+), ([0-9]+) at', info_page)
+ mobj = re.search(r'id="mv_date(?:_views)?_wrap"[^>]*>([a-zA-Z]+ [0-9]+), ([0-9]+) at', info_page)
if mobj is not None:
mobj.group(1) + ' ' + mobj.group(2)
upload_date = unified_strdate(mobj.group(1) + ' ' + mobj.group(2))
+ view_count = None
+ views = self._html_search_regex(
+ r'"mv_views_count_number"[^>]*>(.+?\bviews?)<',
+ info_page, 'view count', fatal=False)
+ if views:
+ view_count = str_to_int(self._search_regex(
+ r'([\d,.]+)', views, 'view count', fatal=False))
+
formats = [{
'format_id': k,
'url': v,
@@ -207,26 +305,39 @@ class VKIE(InfoExtractor):
'uploader': data.get('md_author'),
'duration': data.get('duration'),
'upload_date': upload_date,
+ 'view_count': view_count,
}
class VKUserVideosIE(InfoExtractor):
- IE_NAME = 'vk.com:user-videos'
- IE_DESC = 'vk.com:All of a user\'s videos'
- _VALID_URL = r'https?://vk\.com/videos(?P<id>[0-9]+)(?:m\?.*)?'
+ IE_NAME = 'vk:uservideos'
+ IE_DESC = "VK - User's Videos"
+ _VALID_URL = r'https?://vk\.com/videos(?P<id>-?[0-9]+)$'
_TEMPLATE_URL = 'https://vk.com/videos'
- _TEST = {
+ _TESTS = [{
'url': 'http://vk.com/videos205387401',
+ 'info_dict': {
+ 'id': '205387401',
+ 'title': "Tom Cruise's Videos",
+ },
'playlist_mincount': 4,
- }
+ }, {
+ 'url': 'http://vk.com/videos-77521',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
page_id = self._match_id(url)
- page = self._download_webpage(url, page_id)
- video_ids = orderedSet(
- m.group(1) for m in re.finditer(r'href="/video([0-9_]+)"', page))
- url_entries = [
+
+ webpage = self._download_webpage(url, page_id)
+
+ entries = [
self.url_result(
'http://vk.com/video' + video_id, 'VK', video_id=video_id)
- for video_id in video_ids]
- return self.playlist_result(url_entries, page_id)
+ for video_id in orderedSet(re.findall(r'href="/video(-?[0-9_]+)"', webpage))]
+
+ title = unescapeHTML(self._search_regex(
+ r'<title>\s*([^<]+?)\s+\|\s+\d+\s+videos',
+ webpage, 'title', default=page_id))
+
+ return self.playlist_result(entries, page_id, title)
diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py
new file mode 100644
index 000000000..86c1cb5ef
--- /dev/null
+++ b/youtube_dl/extractor/vlive.py
@@ -0,0 +1,86 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import hmac
+from hashlib import sha1
+from base64 import b64encode
+from time import time
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ determine_ext
+)
+from ..compat import compat_urllib_parse
+
+
+class VLiveIE(InfoExtractor):
+ IE_NAME = 'vlive'
+ # www.vlive.tv/video/ links redirect to m.vlive.tv/video/ for mobile devices
+ _VALID_URL = r'https?://(?:(www|m)\.)?vlive\.tv/video/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://m.vlive.tv/video/1326',
+ 'md5': 'cc7314812855ce56de70a06a27314983',
+ 'info_dict': {
+ 'id': '1326',
+ 'ext': 'mp4',
+ 'title': '[V] Girl\'s Day\'s Broadcast',
+ 'creator': 'Girl\'s Day',
+ },
+ }
+ _SECRET = 'rFkwZet6pqk1vQt6SxxUkAHX7YL3lmqzUMrU4IDusTo4jEBdtOhNfT4BYYAdArwH'
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(
+ 'http://m.vlive.tv/video/%s' % video_id,
+ video_id, note='Download video page')
+
+ title = self._og_search_title(webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
+ creator = self._html_search_regex(
+ r'<span[^>]+class="name">([^<>]+)</span>', webpage, 'creator')
+
+ url = 'http://global.apis.naver.com/globalV/globalV/vod/%s/playinfo?' % video_id
+ msgpad = '%.0f' % (time() * 1000)
+ md = b64encode(
+ hmac.new(self._SECRET.encode('ascii'),
+ (url[:255] + msgpad).encode('ascii'), sha1).digest()
+ )
+ url += '&' + compat_urllib_parse.urlencode({'msgpad': msgpad, 'md': md})
+ playinfo = self._download_json(url, video_id, 'Downloading video json')
+
+ if playinfo.get('message', '') != 'success':
+ raise ExtractorError(playinfo.get('message', 'JSON request unsuccessful'))
+
+ if not playinfo.get('result'):
+ raise ExtractorError('No videos found.')
+
+ formats = []
+ for vid in playinfo['result'].get('videos', {}).get('list', []):
+ formats.append({
+ 'url': vid['source'],
+ 'ext': 'mp4',
+ 'abr': vid.get('bitrate', {}).get('audio'),
+ 'vbr': vid.get('bitrate', {}).get('video'),
+ 'format_id': vid['encodingOption']['name'],
+ 'height': vid.get('height'),
+ 'width': vid.get('width'),
+ })
+ self._sort_formats(formats)
+
+ subtitles = {}
+ for caption in playinfo['result'].get('captions', {}).get('list', []):
+ subtitles[caption['language']] = [
+ {'ext': determine_ext(caption['source'], default_ext='vtt'),
+ 'url': caption['source']}]
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'creator': creator,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
diff --git a/youtube_dl/extractor/vodlocker.py b/youtube_dl/extractor/vodlocker.py
index 1c0966a79..be0a2780f 100644
--- a/youtube_dl/extractor/vodlocker.py
+++ b/youtube_dl/extractor/vodlocker.py
@@ -1,13 +1,9 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
- compat_urllib_request,
-)
+from ..compat import compat_urllib_parse
+from ..utils import sanitized_Request
class VodlockerIE(InfoExtractor):
@@ -28,17 +24,12 @@ class VodlockerIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- fields = dict(re.findall(r'''(?x)<input\s+
- type="hidden"\s+
- name="([^"]+)"\s+
- (?:id="[^"]+"\s+)?
- value="([^"]*)"
- ''', webpage))
+ fields = self._hidden_inputs(webpage)
if fields['op'] == 'download1':
self._sleep(3, video_id) # they do detect when requests happen too fast!
post = compat_urllib_parse.urlencode(fields)
- req = compat_urllib_request.Request(url, post)
+ req = sanitized_Request(url, post)
req.add_header('Content-type', 'application/x-www-form-urlencoded')
webpage = self._download_webpage(
req, video_id, 'Downloading video page')
diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py
new file mode 100644
index 000000000..93d15a556
--- /dev/null
+++ b/youtube_dl/extractor/voicerepublic.py
@@ -0,0 +1,97 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+ ExtractorError,
+ determine_ext,
+ int_or_none,
+ sanitized_Request,
+)
+
+
+class VoiceRepublicIE(InfoExtractor):
+ _VALID_URL = r'https?://voicerepublic\.com/(?:talks|embed)/(?P<id>[0-9a-z-]+)'
+ _TESTS = [{
+ 'url': 'http://voicerepublic.com/talks/watching-the-watchers-building-a-sousveillance-state',
+ 'md5': '0554a24d1657915aa8e8f84e15dc9353',
+ 'info_dict': {
+ 'id': '2296',
+ 'display_id': 'watching-the-watchers-building-a-sousveillance-state',
+ 'ext': 'm4a',
+ 'title': 'Watching the Watchers: Building a Sousveillance State',
+ 'description': 'md5:715ba964958afa2398df615809cfecb1',
+ 'thumbnail': 're:^https?://.*\.(?:png|jpg)$',
+ 'duration': 1800,
+ 'view_count': int,
+ }
+ }, {
+ 'url': 'http://voicerepublic.com/embed/watching-the-watchers-building-a-sousveillance-state',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ req = sanitized_Request(
+ compat_urlparse.urljoin(url, '/talks/%s' % display_id))
+ # Older versions of Firefox get redirected to an "upgrade browser" page
+ req.add_header('User-Agent', 'youtube-dl')
+ webpage = self._download_webpage(req, display_id)
+
+ if '>Queued for processing, please stand by...<' in webpage:
+ raise ExtractorError(
+ 'Audio is still queued for processing', expected=True)
+
+ config = self._search_regex(
+ r'(?s)return ({.+?});\s*\n', webpage,
+ 'data', default=None)
+ data = self._parse_json(config, display_id, fatal=False) if config else None
+ if data:
+ title = data['title']
+ description = data.get('teaser')
+ talk_id = data.get('talk_id') or display_id
+ talk = data['talk']
+ duration = int_or_none(talk.get('duration'))
+ formats = [{
+ 'url': compat_urlparse.urljoin(url, talk_url),
+ 'format_id': format_id,
+ 'ext': determine_ext(talk_url) or format_id,
+ 'vcodec': 'none',
+ } for format_id, talk_url in talk['links'].items()]
+ else:
+ title = self._og_search_title(webpage)
+ description = self._html_search_regex(
+ r"(?s)<div class='talk-teaser'[^>]*>(.+?)</div>",
+ webpage, 'description', fatal=False)
+ talk_id = self._search_regex(
+ [r"id='jc-(\d+)'", r"data-shareable-id='(\d+)'"],
+ webpage, 'talk id', default=None) or display_id
+ duration = None
+ player = self._search_regex(
+ r"class='vr-player jp-jplayer'([^>]+)>", webpage, 'player')
+ formats = [{
+ 'url': compat_urlparse.urljoin(url, talk_url),
+ 'format_id': format_id,
+ 'ext': determine_ext(talk_url) or format_id,
+ 'vcodec': 'none',
+ } for format_id, talk_url in re.findall(r"data-([^=]+)='([^']+)'", player)]
+ self._sort_formats(formats)
+
+ thumbnail = self._og_search_thumbnail(webpage)
+ view_count = int_or_none(self._search_regex(
+ r"class='play-count[^']*'>\s*(\d+) plays",
+ webpage, 'play count', fatal=False))
+
+ return {
+ 'id': talk_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/vporn.py b/youtube_dl/extractor/vporn.py
index 2d23effcc..92c90e517 100644
--- a/youtube_dl/extractor/vporn.py
+++ b/youtube_dl/extractor/vporn.py
@@ -27,9 +27,6 @@ class VpornIE(InfoExtractor):
'duration': 393,
'age_limit': 18,
'view_count': int,
- 'like_count': int,
- 'dislike_count': int,
- 'comment_count': int,
}
},
{
@@ -47,9 +44,6 @@ class VpornIE(InfoExtractor):
'duration': 588,
'age_limit': 18,
'view_count': int,
- 'like_count': int,
- 'dislike_count': int,
- 'comment_count': int,
}
},
]
@@ -64,29 +58,29 @@ class VpornIE(InfoExtractor):
title = self._html_search_regex(
r'videoname\s*=\s*\'([^\']+)\'', webpage, 'title').strip()
description = self._html_search_regex(
- r'<div class="description_txt">(.*?)</div>', webpage, 'description', fatal=False)
+ r'class="(?:descr|description_txt)">(.*?)</div>',
+ webpage, 'description', fatal=False)
thumbnail = self._html_search_regex(
r'flashvars\.imageUrl\s*=\s*"([^"]+)"', webpage, 'description', fatal=False, default=None)
if thumbnail:
thumbnail = 'http://www.vporn.com' + thumbnail
uploader = self._html_search_regex(
- r'(?s)UPLOADED BY.*?<a href="/user/[^"]+">([^<]+)</a>',
+ r'(?s)Uploaded by:.*?<a href="/user/[^"]+"[^>]*>(.+?)</a>',
webpage, 'uploader', fatal=False)
- categories = re.findall(r'<a href="/cat/[^"]+">([^<]+)</a>', webpage)
+ categories = re.findall(r'<a href="/cat/[^"]+"[^>]*>([^<]+)</a>', webpage)
duration = parse_duration(self._search_regex(
- r'duration (\d+ min \d+ sec)', webpage, 'duration', fatal=False))
+ r'Runtime:\s*</span>\s*(\d+ min \d+ sec)',
+ webpage, 'duration', fatal=False))
- view_count = str_to_int(self._html_search_regex(
- r'<span>([\d,\.]+) VIEWS</span>', webpage, 'view count', fatal=False))
- like_count = str_to_int(self._html_search_regex(
- r'<span id="like" class="n">([\d,\.]+)</span>', webpage, 'like count', fatal=False))
- dislike_count = str_to_int(self._html_search_regex(
- r'<span id="dislike" class="n">([\d,\.]+)</span>', webpage, 'dislike count', fatal=False))
+ view_count = str_to_int(self._search_regex(
+ r'class="views">([\d,\.]+) [Vv]iews<',
+ webpage, 'view count', fatal=False))
comment_count = str_to_int(self._html_search_regex(
- r'<h4>Comments \(<b>([\d,\.]+)</b>\)</h4>', webpage, 'comment count', fatal=False))
+ r"'Comments \(([\d,\.]+)\)'",
+ webpage, 'comment count', default=None))
formats = []
@@ -117,8 +111,6 @@ class VpornIE(InfoExtractor):
'categories': categories,
'duration': duration,
'view_count': view_count,
- 'like_count': like_count,
- 'dislike_count': dislike_count,
'comment_count': comment_count,
'age_limit': 18,
'formats': formats,
diff --git a/youtube_dl/extractor/vube.py b/youtube_dl/extractor/vube.py
index 405cb9db4..149e36467 100644
--- a/youtube_dl/extractor/vube.py
+++ b/youtube_dl/extractor/vube.py
@@ -36,6 +36,7 @@ class VubeIE(InfoExtractor):
'comment_count': int,
'categories': ['amazing', 'hd', 'best drummer ever', 'william wei', 'bucket drumming', 'street drummer', 'epic street drumming'],
},
+ 'skip': 'Not accessible from Travis CI server',
}, {
'url': 'http://vube.com/Chiara+Grispo+Video+Channel/YL2qNPkqon',
'md5': 'db7aba89d4603dadd627e9d1973946fe',
diff --git a/youtube_dl/extractor/vuclip.py b/youtube_dl/extractor/vuclip.py
index c3fde53f5..a6d9b5fee 100644
--- a/youtube_dl/extractor/vuclip.py
+++ b/youtube_dl/extractor/vuclip.py
@@ -49,7 +49,7 @@ class VuClipIE(InfoExtractor):
links_code = self._search_regex(
r'''(?xs)
(?:
- <img\s+src="/im/play.gif".*?>|
+ <img\s+src="[^"]*/play.gif".*?>|
<!--\ player\ end\ -->\s*</div><!--\ thumb\ end-->
)
(.*?)
diff --git a/youtube_dl/extractor/vulture.py b/youtube_dl/extractor/vulture.py
index 1eb24a3d6..faa167e65 100644
--- a/youtube_dl/extractor/vulture.py
+++ b/youtube_dl/extractor/vulture.py
@@ -44,7 +44,7 @@ class VultureIE(InfoExtractor):
query_webpage = self._download_webpage(
query_url, display_id, note='Downloading query page')
params_json = self._search_regex(
- r'(?sm)new MagnifyEmbeddablePlayer\({.*?contentItem:\s*(\{.*?\})\n,\n',
+ r'(?sm)new MagnifyEmbeddablePlayer\({.*?contentItem:\s*(\{.*?\})\n?,\n',
query_webpage,
'player params')
params = json.loads(params_json)
diff --git a/youtube_dl/extractor/walla.py b/youtube_dl/extractor/walla.py
index 672bda7a7..24efbd6e6 100644
--- a/youtube_dl/extractor/walla.py
+++ b/youtube_dl/extractor/walla.py
@@ -3,14 +3,14 @@ from __future__ import unicode_literals
import re
-from .subtitles import SubtitlesInfoExtractor
+from .common import InfoExtractor
from ..utils import (
xpath_text,
int_or_none,
)
-class WallaIE(SubtitlesInfoExtractor):
+class WallaIE(InfoExtractor):
_VALID_URL = r'http://vod\.walla\.co\.il/[^/]+/(?P<id>\d+)/(?P<display_id>.+)'
_TEST = {
'url': 'http://vod.walla.co.il/movie/2642630/one-direction-all-for-one',
@@ -52,13 +52,10 @@ class WallaIE(SubtitlesInfoExtractor):
subtitles = {}
for subtitle in item.findall('./subtitles/subtitle'):
lang = xpath_text(subtitle, './title')
- subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = xpath_text(subtitle, './src')
-
- if self._downloader.params.get('listsubtitles', False):
- self._list_available_subtitles(video_id, subtitles)
- return
-
- subtitles = self.extract_subtitles(video_id, subtitles)
+ subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = [{
+ 'ext': 'srt',
+ 'url': xpath_text(subtitle, './src'),
+ }]
formats = []
for quality in item.findall('./qualities/quality'):
diff --git a/youtube_dl/extractor/washingtonpost.py b/youtube_dl/extractor/washingtonpost.py
index 72eb010f8..ec8b99998 100644
--- a/youtube_dl/extractor/washingtonpost.py
+++ b/youtube_dl/extractor/washingtonpost.py
@@ -19,25 +19,25 @@ class WashingtonPostIE(InfoExtractor):
'title': 'Sinkhole of bureaucracy',
},
'playlist': [{
- 'md5': '79132cc09ec5309fa590ae46e4cc31bc',
+ 'md5': 'b9be794ceb56c7267d410a13f99d801a',
'info_dict': {
'id': 'fc433c38-b146-11e3-b8b3-44b1d1cd4c1f',
'ext': 'mp4',
'title': 'Breaking Points: The Paper Mine',
- 'duration': 1287,
+ 'duration': 1290,
'description': 'Overly complicated paper pushing is nothing new to government bureaucracy. But the way federal retirement applications are filed may be the most outdated. David Fahrenthold explains.',
'uploader': 'The Washington Post',
'timestamp': 1395527908,
'upload_date': '20140322',
},
}, {
- 'md5': 'e1d5734c06865cc504ad99dc2de0d443',
+ 'md5': '1fff6a689d8770966df78c8cb6c8c17c',
'info_dict': {
'id': '41255e28-b14a-11e3-b8b3-44b1d1cd4c1f',
'ext': 'mp4',
'title': 'The town bureaucracy sustains',
'description': 'Underneath the friendly town of Boyers is a sea of government paperwork. In a disused limestone mine, hundreds of locals now track, file and process retirement applications for the federal government. We set out to find out what it\'s like to do paperwork 230 feet underground.',
- 'duration': 2217,
+ 'duration': 2220,
'timestamp': 1395528005,
'upload_date': '20140322',
'uploader': 'The Washington Post',
diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py
index bf9e40bad..affcc52f6 100644
--- a/youtube_dl/extractor/wat.py
+++ b/youtube_dl/extractor/wat.py
@@ -113,7 +113,7 @@ class WatIE(InfoExtractor):
video_url = self._download_webpage(
'http://www.wat.tv/get%s?token=%s&getURL=1&country=%s' % (webid, compute_token(webid), country),
real_id,
- 'Downloding %s video URL' % fmt[0],
+ 'Downloading %s video URL' % fmt[0],
'Failed to download %s video URL' % fmt[0],
False)
if not video_url:
diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py
index c90488500..b46802306 100644
--- a/youtube_dl/extractor/wdr.py
+++ b/youtube_dl/extractor/wdr.py
@@ -28,6 +28,7 @@ class WDRIE(InfoExtractor):
'title': 'Servicezeit',
'description': 'md5:c8f43e5e815eeb54d0b96df2fba906cb',
'upload_date': '20140310',
+ 'is_live': False
},
'params': {
'skip_download': True,
@@ -41,6 +42,7 @@ class WDRIE(InfoExtractor):
'title': 'Marga Spiegel ist tot',
'description': 'md5:2309992a6716c347891c045be50992e4',
'upload_date': '20140311',
+ 'is_live': False
},
'params': {
'skip_download': True,
@@ -55,6 +57,7 @@ class WDRIE(InfoExtractor):
'title': 'Erlebte Geschichten: Marga Spiegel (29.11.2009)',
'description': 'md5:2309992a6716c347891c045be50992e4',
'upload_date': '20091129',
+ 'is_live': False
},
},
{
@@ -66,6 +69,7 @@ class WDRIE(InfoExtractor):
'title': 'Flavia Coelho: Amar é Amar',
'description': 'md5:7b29e97e10dfb6e265238b32fa35b23a',
'upload_date': '20140717',
+ 'is_live': False
},
},
{
@@ -74,6 +78,20 @@ class WDRIE(InfoExtractor):
'info_dict': {
'id': 'mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100',
}
+ },
+ {
+ 'url': 'http://www1.wdr.de/mediathek/video/livestream/index.html',
+ 'info_dict': {
+ 'id': 'mdb-103364',
+ 'title': 're:^WDR Fernsehen [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'description': 'md5:ae2ff888510623bf8d4b115f95a9b7c9',
+ 'ext': 'flv',
+ 'upload_date': '20150212',
+ 'is_live': True
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}
]
@@ -119,6 +137,10 @@ class WDRIE(InfoExtractor):
video_url = flashvars['dslSrc'][0]
title = flashvars['trackerClipTitle'][0]
thumbnail = flashvars['startPicture'][0] if 'startPicture' in flashvars else None
+ is_live = flashvars.get('isLive', ['0'])[0] == '1'
+
+ if is_live:
+ title = self._live_title(title)
if 'trackerClipAirTime' in flashvars:
upload_date = flashvars['trackerClipAirTime'][0]
@@ -131,6 +153,13 @@ class WDRIE(InfoExtractor):
if video_url.endswith('.f4m'):
video_url += '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18'
ext = 'flv'
+ elif video_url.endswith('.smil'):
+ fmt = self._extract_smil_formats(video_url, page_id)[0]
+ video_url = fmt['url']
+ sep = '&' if '?' in video_url else '?'
+ video_url += sep
+ video_url += 'hdcore=3.3.0&plugin=aasp-3.3.0.99.43'
+ ext = fmt['ext']
else:
ext = determine_ext(video_url)
@@ -144,6 +173,7 @@ class WDRIE(InfoExtractor):
'description': description,
'thumbnail': thumbnail,
'upload_date': upload_date,
+ 'is_live': is_live
}
diff --git a/youtube_dl/extractor/webofstories.py b/youtube_dl/extractor/webofstories.py
index 396cf4e83..2037d9b3d 100644
--- a/youtube_dl/extractor/webofstories.py
+++ b/youtube_dl/extractor/webofstories.py
@@ -1,6 +1,8 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..utils import int_or_none
@@ -45,19 +47,17 @@ class WebOfStoriesIE(InfoExtractor):
description = self._html_search_meta('description', webpage)
thumbnail = self._og_search_thumbnail(webpage)
- story_filename = self._search_regex(
- r'\.storyFileName\("([^"]+)"\)', webpage, 'story filename')
- speaker_id = self._search_regex(
- r'\.speakerId\("([^"]+)"\)', webpage, 'speaker ID')
- story_id = self._search_regex(
- r'\.storyId\((\d+)\)', webpage, 'story ID')
- speaker_type = self._search_regex(
- r'\.speakerType\("([^"]+)"\)', webpage, 'speaker type')
- great_life = self._search_regex(
- r'isGreatLifeStory\s*=\s*(true|false)', webpage, 'great life story')
+ embed_params = [s.strip(" \r\n\t'") for s in self._search_regex(
+ r'(?s)\$\("#embedCode"\).html\(getEmbedCode\((.*?)\)',
+ webpage, 'embed params').split(',')]
+
+ (
+ _, speaker_id, story_id, story_duration,
+ speaker_type, great_life, _thumbnail, _has_subtitles,
+ story_filename, _story_order) = embed_params
+
is_great_life_series = great_life == 'true'
- duration = int_or_none(self._search_regex(
- r'\.duration\((\d+)\)', webpage, 'duration', fatal=False))
+ duration = int_or_none(story_duration)
# URL building, see: http://www.webofstories.com/scripts/player.js
ms_prefix = ''
@@ -100,3 +100,42 @@ class WebOfStoriesIE(InfoExtractor):
'description': description,
'duration': duration,
}
+
+
+class WebOfStoriesPlaylistIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?webofstories\.com/playAll/(?P<id>[^/]+)'
+ _TEST = {
+ 'url': 'http://www.webofstories.com/playAll/donald.knuth',
+ 'info_dict': {
+ 'id': 'donald.knuth',
+ 'title': 'Donald Knuth (Scientist)',
+ },
+ 'playlist_mincount': 97,
+ }
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, playlist_id)
+
+ entries = [
+ self.url_result('http://www.webofstories.com/play/%s' % video_number, 'WebOfStories')
+ for video_number in set(re.findall('href="/playAll/%s\?sId=(\d+)"' % playlist_id, webpage))
+ ]
+
+ title = self._search_regex(
+ r'<div id="speakerName">\s*<span>([^<]+)</span>',
+ webpage, 'speaker', default=None)
+ if title:
+ field = self._search_regex(
+ r'<span id="primaryField">([^<]+)</span>',
+ webpage, 'field', default=None)
+ if field:
+ title += ' (%s)' % field
+
+ if not title:
+ title = self._search_regex(
+ r'<title>Play\s+all\s+stories\s*-\s*([^<]+)\s*-\s*Web\s+of\s+Stories</title>',
+ webpage, 'title')
+
+ return self.playlist_result(entries, playlist_id, title)
diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py
index d6dec25ca..e4f50e64c 100644
--- a/youtube_dl/extractor/wimp.py
+++ b/youtube_dl/extractor/wimp.py
@@ -1,43 +1,37 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from .youtube import YoutubeIE
class WimpIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?wimp\.com/([^/]+)/'
+ _VALID_URL = r'http://(?:www\.)?wimp\.com/(?P<id>[^/]+)/'
_TESTS = [{
'url': 'http://www.wimp.com/maruexhausted/',
- 'md5': 'f1acced123ecb28d9bb79f2479f2b6a1',
+ 'md5': 'ee21217ffd66d058e8b16be340b74883',
'info_dict': {
'id': 'maruexhausted',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Maru is exhausted.',
'description': 'md5:57e099e857c0a4ea312542b684a869b8',
}
}, {
- # youtube video
'url': 'http://www.wimp.com/clowncar/',
+ 'md5': '4e2986c793694b55b37cf92521d12bb4',
'info_dict': {
- 'id': 'cG4CEr2aiSg',
+ 'id': 'clowncar',
'ext': 'mp4',
- 'title': 'Basset hound clown car...incredible!',
- 'description': 'md5:8d228485e0719898c017203f900b3a35',
- 'uploader': 'Gretchen Hoey',
- 'uploader_id': 'gretchenandjeff1',
- 'upload_date': '20140303',
+ 'title': 'It\'s like a clown car.',
+ 'description': 'md5:0e56db1370a6e49c5c1d19124c0d2fb2',
},
- 'add_ie': ['Youtube'],
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group(1)
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
video_url = self._search_regex(
- r"[\"']file[\"']\s*[:,]\s*[\"'](.+?)[\"']", webpage, 'video URL')
+ [r"[\"']file[\"']\s*[:,]\s*[\"'](.+?)[\"']", r"videoId\s*:\s*[\"']([^\"']+)[\"']"],
+ webpage, 'video URL')
if YoutubeIE.suitable(video_url):
self.to_screen('Found YouTube video')
return {
diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py
index 13a079151..fdb16d91c 100644
--- a/youtube_dl/extractor/wistia.py
+++ b/youtube_dl/extractor/wistia.py
@@ -1,8 +1,10 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import compat_urllib_request
-from ..utils import ExtractorError
+from ..utils import (
+ ExtractorError,
+ sanitized_Request,
+)
class WistiaIE(InfoExtractor):
@@ -23,7 +25,7 @@ class WistiaIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- request = compat_urllib_request.Request(self._API_URL.format(video_id))
+ request = sanitized_Request(self._API_URL.format(video_id))
request.add_header('Referer', url) # Some videos require this.
data_json = self._download_json(request, video_id)
if data_json.get('error'):
diff --git a/youtube_dl/extractor/worldstarhiphop.py b/youtube_dl/extractor/worldstarhiphop.py
index d5c26a032..a3ea26feb 100644
--- a/youtube_dl/extractor/worldstarhiphop.py
+++ b/youtube_dl/extractor/worldstarhiphop.py
@@ -6,8 +6,8 @@ from .common import InfoExtractor
class WorldStarHipHopIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/(?:videos|android)/video\.php\?v=(?P<id>.*)'
+ _TESTS = [{
"url": "http://www.worldstarhiphop.com/videos/video.php?v=wshh6a7q1ny0G34ZwuIO",
"md5": "9d04de741161603bf7071bbf4e883186",
"info_dict": {
@@ -15,7 +15,15 @@ class WorldStarHipHopIE(InfoExtractor):
"ext": "mp4",
"title": "KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!"
}
- }
+ }, {
+ 'url': 'http://m.worldstarhiphop.com/android/video.php?v=wshh6a7q1ny0G34ZwuIO',
+ 'md5': 'dc1c76c83ecc4190bb1eb143899b87d3',
+ 'info_dict': {
+ 'id': 'wshh6a7q1ny0G34ZwuIO',
+ 'ext': 'mp4',
+ "title": "KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!"
+ }
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -26,19 +34,22 @@ class WorldStarHipHopIE(InfoExtractor):
return self.url_result('vevo:%s' % m_vevo_id.group(1), ie='Vevo')
video_url = self._search_regex(
- r'so\.addVariable\("file","(.*?)"\)', webpage, 'video URL')
+ [r'so\.addVariable\("file","(.*?)"\)',
+ r'<div class="artlist">\s*<a[^>]+href="([^"]+)">'],
+ webpage, 'video URL')
if 'youtube' in video_url:
return self.url_result(video_url, ie='Youtube')
video_title = self._html_search_regex(
- r'(?s)<div class="content-heading">\s*<h1>(.*?)</h1>',
+ [r'(?s)<div class="content-heading">\s*<h1>(.*?)</h1>',
+ r'<span[^>]+class="tc-sp-pinned-title">(.*)</span>'],
webpage, 'title')
# Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
thumbnail = self._html_search_regex(
r'rel="image_src" href="(.*)" />', webpage, 'thumbnail',
- fatal=False)
+ default=None)
if not thumbnail:
_title = r'candytitles.*>(.*)</span>'
mobj = re.search(_title, webpage)
diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py
index cbe3dc7be..5a897371d 100644
--- a/youtube_dl/extractor/wsj.py
+++ b/youtube_dl/extractor/wsj.py
@@ -18,8 +18,8 @@ class WSJIE(InfoExtractor):
'id': '1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
'ext': 'mp4',
'upload_date': '20150202',
- 'uploader_id': 'bbright',
- 'creator': 'bbright',
+ 'uploader_id': 'jdesai',
+ 'creator': 'jdesai',
'categories': list, # a long list
'duration': 90,
'title': 'Bills Coach Rex Ryan Updates His Old Jets Tattoo',
@@ -84,6 +84,5 @@ class WSJIE(InfoExtractor):
'duration': duration,
'upload_date': upload_date,
'title': title,
- 'formats': formats,
'categories': categories,
}
diff --git a/youtube_dl/extractor/xbef.py b/youtube_dl/extractor/xbef.py
index 80c48c37d..4ff99e5ca 100644
--- a/youtube_dl/extractor/xbef.py
+++ b/youtube_dl/extractor/xbef.py
@@ -1,9 +1,7 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
-)
+from ..compat import compat_urllib_parse_unquote
class XBefIE(InfoExtractor):
@@ -30,7 +28,7 @@ class XBefIE(InfoExtractor):
config_url_enc = self._download_webpage(
'http://xbef.com/Main/GetVideoURLEncoded/%s' % video_id, video_id,
note='Retrieving config URL')
- config_url = compat_urllib_parse.unquote(config_url_enc)
+ config_url = compat_urllib_parse_unquote(config_url_enc)
config = self._download_xml(
config_url, video_id, note='Retrieving config')
diff --git a/youtube_dl/extractor/gorillavid.py b/youtube_dl/extractor/xfileshare.py
index ae24aff84..a3236e66c 100644
--- a/youtube_dl/extractor/gorillavid.py
+++ b/youtube_dl/extractor/xfileshare.py
@@ -1,24 +1,23 @@
-# -*- coding: utf-8 -*-
+# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
- compat_urllib_request,
-)
+from ..compat import compat_urllib_parse
from ..utils import (
ExtractorError,
+ encode_dict,
int_or_none,
+ sanitized_Request,
)
-class GorillaVidIE(InfoExtractor):
- IE_DESC = 'GorillaVid.in, daclips.in, movpod.in and fastvideo.in'
+class XFileShareIE(InfoExtractor):
+ IE_DESC = 'XFileShare based sites: GorillaVid.in, daclips.in, movpod.in, fastvideo.in, realvid.net, filehoot.com and vidto.me'
_VALID_URL = r'''(?x)
https?://(?P<host>(?:www\.)?
- (?:daclips\.in|gorillavid\.in|movpod\.in|fastvideo\.in))/
+ (?:daclips\.in|gorillavid\.in|movpod\.in|fastvideo\.in|realvid\.net|filehoot\.com|vidto\.me))/
(?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:-[0-9]+x[0-9]+\.html)?
'''
@@ -35,13 +34,7 @@ class GorillaVidIE(InfoExtractor):
},
}, {
'url': 'http://gorillavid.in/embed-z08zf8le23c6-960x480.html',
- 'md5': 'c9e293ca74d46cad638e199c3f3fe604',
- 'info_dict': {
- 'id': 'z08zf8le23c6',
- 'ext': 'mp4',
- 'title': 'Say something nice',
- 'thumbnail': 're:http://.*\.jpg',
- },
+ 'only_matching': True,
}, {
'url': 'http://daclips.in/3rso4kdn6f9m',
'md5': '1ad8fd39bb976eeb66004d3a4895f106',
@@ -62,25 +55,45 @@ class GorillaVidIE(InfoExtractor):
'thumbnail': 're:http://.*\.jpg',
},
}, {
+ 'url': 'http://realvid.net/ctn2y6p2eviw',
+ 'md5': 'b2166d2cf192efd6b6d764c18fd3710e',
+ 'info_dict': {
+ 'id': 'ctn2y6p2eviw',
+ 'ext': 'flv',
+ 'title': 'rdx 1955',
+ 'thumbnail': 're:http://.*\.jpg',
+ },
+ }, {
'url': 'http://movpod.in/0wguyyxi1yca',
'only_matching': True,
+ }, {
+ 'url': 'http://filehoot.com/3ivfabn7573c.html',
+ 'info_dict': {
+ 'id': '3ivfabn7573c',
+ 'ext': 'mp4',
+ 'title': 'youtube-dl test video \'äBaW_jenozKc.mp4.mp4',
+ 'thumbnail': 're:http://.*\.jpg',
+ }
+ }, {
+ 'url': 'http://vidto.me/ku5glz52nqe1.html',
+ 'info_dict': {
+ 'id': 'ku5glz52nqe1',
+ 'ext': 'mp4',
+ 'title': 'test'
+ }
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
- webpage = self._download_webpage('http://%s/%s' % (mobj.group('host'), video_id), video_id)
+ url = 'http://%s/%s' % (mobj.group('host'), video_id)
+ webpage = self._download_webpage(url, video_id)
if re.search(self._FILE_NOT_FOUND_REGEX, webpage) is not None:
raise ExtractorError('Video %s does not exist' % video_id, expected=True)
- fields = dict(re.findall(r'''(?x)<input\s+
- type="hidden"\s+
- name="([^"]+)"\s+
- (?:id="[^"]+"\s+)?
- value="([^"]*)"
- ''', webpage))
+ fields = self._hidden_inputs(webpage)
if fields['op'] == 'download1':
countdown = int_or_none(self._search_regex(
@@ -89,20 +102,25 @@ class GorillaVidIE(InfoExtractor):
if countdown:
self._sleep(countdown, video_id)
- post = compat_urllib_parse.urlencode(fields)
+ post = compat_urllib_parse.urlencode(encode_dict(fields))
- req = compat_urllib_request.Request(url, post)
+ req = sanitized_Request(url, post)
req.add_header('Content-type', 'application/x-www-form-urlencoded')
webpage = self._download_webpage(req, video_id, 'Downloading video page')
- title = self._search_regex(
- r'style="z-index: [0-9]+;">([^<]+)</span>',
- webpage, 'title', default=None) or self._og_search_title(webpage)
+ title = (self._search_regex(
+ [r'style="z-index: [0-9]+;">([^<]+)</span>',
+ r'<td nowrap>([^<]+)</td>',
+ r'>Watch (.+) ',
+ r'<h2 class="video-page-head">([^<]+)</h2>'],
+ webpage, 'title', default=None) or self._og_search_title(webpage)).strip()
video_url = self._search_regex(
- r'file\s*:\s*["\'](http[^"\']+)["\'],', webpage, 'file url')
+ [r'file\s*:\s*["\'](http[^"\']+)["\'],',
+ r'file_link\s*=\s*\'(https?:\/\/[0-9a-zA-z.\/\-_]+)'],
+ webpage, 'file url')
thumbnail = self._search_regex(
- r'image\s*:\s*["\'](http[^"\']+)["\'],', webpage, 'thumbnail', fatal=False)
+ r'image\s*:\s*["\'](http[^"\']+)["\'],', webpage, 'thumbnail', default=None)
formats = [{
'format_id': 'sd',
diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py
index 4527567f8..8938c0e45 100644
--- a/youtube_dl/extractor/xhamster.py
+++ b/youtube_dl/extractor/xhamster.py
@@ -4,7 +4,6 @@ import re
from .common import InfoExtractor
from ..utils import (
- ExtractorError,
unified_strdate,
str_to_int,
int_or_none,
@@ -13,7 +12,6 @@ from ..utils import (
class XHamsterIE(InfoExtractor):
- """Information Extractor for xHamster"""
_VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'
_TESTS = [
{
@@ -23,7 +21,7 @@ class XHamsterIE(InfoExtractor):
'ext': 'mp4',
'title': 'FemaleAgent Shy beauty takes the bait',
'upload_date': '20121014',
- 'uploader_id': 'Ruseful2011',
+ 'uploader': 'Ruseful2011',
'duration': 893,
'age_limit': 18,
}
@@ -35,7 +33,7 @@ class XHamsterIE(InfoExtractor):
'ext': 'mp4',
'title': 'Britney Spears Sexy Booty',
'upload_date': '20130914',
- 'uploader_id': 'jojo747400',
+ 'uploader': 'jojo747400',
'duration': 200,
'age_limit': 18,
}
@@ -47,12 +45,12 @@ class XHamsterIE(InfoExtractor):
]
def _real_extract(self, url):
- def extract_video_url(webpage):
- mp4 = re.search(r'<video\s+.*?file="([^"]+)".*?>', webpage)
- if mp4 is None:
- raise ExtractorError('Unable to extract media URL')
- else:
- return mp4.group(1)
+ def extract_video_url(webpage, name):
+ return self._search_regex(
+ [r'''file\s*:\s*(?P<q>["'])(?P<mp4>.+?)(?P=q)''',
+ r'''<a\s+href=(?P<q>["'])(?P<mp4>.+?)(?P=q)\s+class=["']mp4Thumb''',
+ r'''<video[^>]+file=(?P<q>["'])(?P<mp4>.+?)(?P=q)[^>]*>'''],
+ webpage, name, group='mp4')
def is_hd(webpage):
return '<div class=\'icon iconHD\'' in webpage
@@ -65,7 +63,9 @@ class XHamsterIE(InfoExtractor):
mrss_url = '%s://xhamster.com/movies/%s/%s.html' % (proto, video_id, seo)
webpage = self._download_webpage(mrss_url, video_id)
- title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage, 'title')
+ title = self._html_search_regex(
+ [r'<title>(?P<title>.+?)(?:, (?:[^,]+? )?Porn: xHamster| - xHamster\.com)</title>',
+ r'<h1>([^<]+)</h1>'], webpage, 'title')
# Only a few videos have an description
mobj = re.search(r'<span>Description: </span>([^<]+)', webpage)
@@ -76,10 +76,14 @@ class XHamsterIE(InfoExtractor):
if upload_date:
upload_date = unified_strdate(upload_date)
- uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
- webpage, 'uploader id', default='anonymous')
+ uploader = self._html_search_regex(
+ r"<a href='[^']+xhamster\.com/user/[^>]+>(?P<uploader>[^<]+)",
+ webpage, 'uploader', default='anonymous')
- thumbnail = self._html_search_regex(r'<video\s+.*?poster="([^"]+)".*?>', webpage, 'thumbnail', fatal=False)
+ thumbnail = self._search_regex(
+ [r'''thumb\s*:\s*(?P<q>["'])(?P<thumbnail>.+?)(?P=q)''',
+ r'''<video[^>]+poster=(?P<q>["'])(?P<thumbnail>.+?)(?P=q)[^>]*>'''],
+ webpage, 'thumbnail', fatal=False, group='thumbnail')
duration = parse_duration(self._html_search_regex(r'<span>Runtime:</span> (\d+:\d+)</div>',
webpage, 'duration', fatal=False))
@@ -98,7 +102,9 @@ class XHamsterIE(InfoExtractor):
hd = is_hd(webpage)
- video_url = extract_video_url(webpage)
+ format_id = 'hd' if hd else 'sd'
+
+ video_url = extract_video_url(webpage, format_id)
formats = [{
'url': video_url,
'format_id': 'hd' if hd else 'sd',
@@ -109,7 +115,7 @@ class XHamsterIE(InfoExtractor):
mrss_url = self._search_regex(r'<link rel="canonical" href="([^"]+)', webpage, 'mrss_url')
webpage = self._download_webpage(mrss_url + '?hd', video_id, note='Downloading HD webpage')
if is_hd(webpage):
- video_url = extract_video_url(webpage)
+ video_url = extract_video_url(webpage, 'hd')
formats.append({
'url': video_url,
'format_id': 'hd',
@@ -123,7 +129,7 @@ class XHamsterIE(InfoExtractor):
'title': title,
'description': description,
'upload_date': upload_date,
- 'uploader_id': uploader_id,
+ 'uploader': uploader,
'thumbnail': thumbnail,
'duration': duration,
'view_count': view_count,
@@ -133,3 +139,36 @@ class XHamsterIE(InfoExtractor):
'age_limit': age_limit,
'formats': formats,
}
+
+
+class XHamsterEmbedIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?xhamster\.com/xembed\.php\?video=(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://xhamster.com/xembed.php?video=3328539',
+ 'info_dict': {
+ 'id': '3328539',
+ 'ext': 'mp4',
+ 'title': 'Pen Masturbation',
+ 'upload_date': '20140728',
+ 'uploader_id': 'anonymous',
+ 'duration': 5,
+ 'age_limit': 18,
+ }
+ }
+
+ @staticmethod
+ def _extract_urls(webpage):
+ return [url for _, url in re.findall(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?xhamster\.com/xembed\.php\?video=\d+)\1',
+ webpage)]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_url = self._search_regex(
+ r'href="(https?://xhamster\.com/movies/%s/[^"]+\.html[^"]*)"' % video_id,
+ webpage, 'xhamster url')
+
+ return self.url_result(video_url, 'XHamster')
diff --git a/youtube_dl/extractor/xminus.py b/youtube_dl/extractor/xminus.py
index 8c6241aed..7c9d8af6f 100644
--- a/youtube_dl/extractor/xminus.py
+++ b/youtube_dl/extractor/xminus.py
@@ -43,7 +43,7 @@ class XMinusIE(InfoExtractor):
r'minus_track\.dur_sec=\'([0-9]*?)\'',
webpage, 'duration', fatal=False))
filesize_approx = parse_filesize(self._html_search_regex(
- r'<div class="filesize[^"]*"></div>\s*([0-9.]+\s*[a-zA-Z][bB])',
+ r'<div id="finfo"[^>]*>\s*↓\s*([0-9.]+\s*[a-zA-Z][bB])',
webpage, 'approximate filesize', fatal=False))
tbr = int_or_none(self._html_search_regex(
r'<div class="quality[^"]*"></div>\s*([0-9]+)\s*kbps',
@@ -58,7 +58,7 @@ class XMinusIE(InfoExtractor):
description = re.sub(' *\r *', '\n', description)
enc_token = self._html_search_regex(
- r'minus_track\.tkn="(.+?)"', webpage, 'enc_token')
+ r'minus_track\.s?tkn="(.+?)"', webpage, 'enc_token')
token = ''.join(
c if pos == 3 else compat_chr(compat_ord(c) - 1)
for pos, c in enumerate(reversed(enc_token)))
diff --git a/youtube_dl/extractor/xnxx.py b/youtube_dl/extractor/xnxx.py
index 79ed6c744..5a41f8ffa 100644
--- a/youtube_dl/extractor/xnxx.py
+++ b/youtube_dl/extractor/xnxx.py
@@ -2,9 +2,7 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
-)
+from ..compat import compat_urllib_parse_unquote
class XNXXIE(InfoExtractor):
@@ -26,7 +24,7 @@ class XNXXIE(InfoExtractor):
video_url = self._search_regex(r'flv_url=(.*?)&amp;',
webpage, 'video URL')
- video_url = compat_urllib_parse.unquote(video_url)
+ video_url = compat_urllib_parse_unquote(video_url)
video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XNXX.COM',
webpage, 'title')
diff --git a/youtube_dl/extractor/xstream.py b/youtube_dl/extractor/xstream.py
new file mode 100644
index 000000000..71584c291
--- /dev/null
+++ b/youtube_dl/extractor/xstream.py
@@ -0,0 +1,115 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ parse_iso8601,
+ xpath_with_ns,
+ xpath_text,
+ find_xpath_attr,
+)
+
+
+class XstreamIE(InfoExtractor):
+ _VALID_URL = r'''(?x)
+ (?:
+ xstream:|
+ https?://frontend\.xstream\.(?:dk|net)/
+ )
+ (?P<partner_id>[^/]+)
+ (?:
+ :|
+ /feed/video/\?.*?\bid=
+ )
+ (?P<id>\d+)
+ '''
+ _TESTS = [{
+ 'url': 'http://frontend.xstream.dk/btno/feed/video/?platform=web&id=86588',
+ 'md5': 'd7d17e3337dc80de6d3a540aefbe441b',
+ 'info_dict': {
+ 'id': '86588',
+ 'ext': 'mov',
+ 'title': 'Otto Wollertsen',
+ 'description': 'Vestlendingen Otto Fredrik Wollertsen',
+ 'timestamp': 1430473209,
+ 'upload_date': '20150501',
+ },
+ }, {
+ 'url': 'http://frontend.xstream.dk/ap/feed/video/?platform=web&id=21039',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ partner_id = mobj.group('partner_id')
+ video_id = mobj.group('id')
+
+ data = self._download_xml(
+ 'http://frontend.xstream.dk/%s/feed/video/?platform=web&id=%s'
+ % (partner_id, video_id),
+ video_id)
+
+ NS_MAP = {
+ 'atom': 'http://www.w3.org/2005/Atom',
+ 'xt': 'http://xstream.dk/',
+ 'media': 'http://search.yahoo.com/mrss/',
+ }
+
+ entry = data.find(xpath_with_ns('./atom:entry', NS_MAP))
+
+ title = xpath_text(
+ entry, xpath_with_ns('./atom:title', NS_MAP), 'title')
+ description = xpath_text(
+ entry, xpath_with_ns('./atom:summary', NS_MAP), 'description')
+ timestamp = parse_iso8601(xpath_text(
+ entry, xpath_with_ns('./atom:published', NS_MAP), 'upload date'))
+
+ formats = []
+ media_group = entry.find(xpath_with_ns('./media:group', NS_MAP))
+ for media_content in media_group.findall(xpath_with_ns('./media:content', NS_MAP)):
+ media_url = media_content.get('url')
+ if not media_url:
+ continue
+ tbr = int_or_none(media_content.get('bitrate'))
+ mobj = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', media_url)
+ if mobj:
+ formats.append({
+ 'url': mobj.group('url'),
+ 'play_path': 'mp4:%s' % mobj.group('playpath'),
+ 'app': mobj.group('app'),
+ 'ext': 'flv',
+ 'tbr': tbr,
+ 'format_id': 'rtmp-%d' % tbr,
+ })
+ else:
+ formats.append({
+ 'url': media_url,
+ 'tbr': tbr,
+ })
+ self._sort_formats(formats)
+
+ link = find_xpath_attr(
+ entry, xpath_with_ns('./atom:link', NS_MAP), 'rel', 'original')
+ if link is not None:
+ formats.append({
+ 'url': link.get('href'),
+ 'format_id': link.get('rel'),
+ })
+
+ thumbnails = [{
+ 'url': splash.get('url'),
+ 'width': int_or_none(splash.get('width')),
+ 'height': int_or_none(splash.get('height')),
+ } for splash in media_group.findall(xpath_with_ns('./xt:splash', NS_MAP))]
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'timestamp': timestamp,
+ 'formats': formats,
+ 'thumbnails': thumbnails,
+ }
diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py
index e8490b028..a1fe24050 100644
--- a/youtube_dl/extractor/xtube.py
+++ b/youtube_dl/extractor/xtube.py
@@ -3,12 +3,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_request,
- compat_urllib_parse,
-)
+from ..compat import compat_urllib_parse_unquote
from ..utils import (
parse_duration,
+ sanitized_Request,
str_to_int,
)
@@ -22,7 +20,7 @@ class XTubeIE(InfoExtractor):
'id': 'kVTUy_G222_',
'ext': 'mp4',
'title': 'strange erotica',
- 'description': 'http://www.xtube.com an ET kind of thing',
+ 'description': 'contains:an ET kind of thing',
'uploader': 'greenshowers',
'duration': 450,
'age_limit': 18,
@@ -32,7 +30,7 @@ class XTubeIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- req = compat_urllib_request.Request(url)
+ req = sanitized_Request(url)
req.add_header('Cookie', 'age_verified=1')
webpage = self._download_webpage(req, video_id)
@@ -59,7 +57,7 @@ class XTubeIE(InfoExtractor):
for format_id, video_url in re.findall(
r'flashvars\.quality_(.+?)\s*=\s*"([^"]+)"', webpage):
fmt = {
- 'url': compat_urllib_parse.unquote(video_url),
+ 'url': compat_urllib_parse_unquote(video_url),
'format_id': format_id,
}
m = re.search(r'^(?P<height>\d+)[pP]', format_id)
@@ -68,7 +66,7 @@ class XTubeIE(InfoExtractor):
formats.append(fmt)
if not formats:
- video_url = compat_urllib_parse.unquote(self._search_regex(
+ video_url = compat_urllib_parse_unquote(self._search_regex(
r'flashvars\.video_url\s*=\s*"([^"]+)"',
webpage, 'video URL'))
formats.append({'url': video_url})
diff --git a/youtube_dl/extractor/xuite.py b/youtube_dl/extractor/xuite.py
index 4971965f9..8bbac54e2 100644
--- a/youtube_dl/extractor/xuite.py
+++ b/youtube_dl/extractor/xuite.py
@@ -13,12 +13,13 @@ from ..utils import (
class XuiteIE(InfoExtractor):
+ IE_DESC = '隨意窩Xuite影音'
_REGEX_BASE64 = r'(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?'
_VALID_URL = r'https?://vlog\.xuite\.net/(?:play|embed)/(?P<id>%s)' % _REGEX_BASE64
_TESTS = [{
# Audio
'url': 'http://vlog.xuite.net/play/RGkzc1ZULTM4NjA5MTQuZmx2',
- 'md5': '63a42c705772aa53fd4c1a0027f86adf',
+ 'md5': 'e79284c87b371424885448d11f6398c8',
'info_dict': {
'id': '3860914',
'ext': 'mp3',
@@ -69,18 +70,26 @@ class XuiteIE(InfoExtractor):
'only_matching': True,
}]
+ @staticmethod
+ def base64_decode_utf8(data):
+ return base64.b64decode(data.encode('utf-8')).decode('utf-8')
+
+ @staticmethod
+ def base64_encode_utf8(data):
+ return base64.b64encode(data.encode('utf-8')).decode('utf-8')
+
def _extract_flv_config(self, media_id):
- base64_media_id = base64.b64encode(media_id.encode('utf-8')).decode('utf-8')
+ base64_media_id = self.base64_encode_utf8(media_id)
flv_config = self._download_xml(
'http://vlog.xuite.net/flash/player?media=%s' % base64_media_id,
'flv config')
prop_dict = {}
for prop in flv_config.findall('./property'):
- prop_id = base64.b64decode(prop.attrib['id']).decode('utf-8')
+ prop_id = self.base64_decode_utf8(prop.attrib['id'])
# CDATA may be empty in flv config
if not prop.text:
continue
- encoded_content = base64.b64decode(prop.text).decode('utf-8')
+ encoded_content = self.base64_decode_utf8(prop.text)
prop_dict[prop_id] = compat_urllib_parse_unquote(encoded_content)
return prop_dict
diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py
index 2a45dc574..710ad5041 100644
--- a/youtube_dl/extractor/xvideos.py
+++ b/youtube_dl/extractor/xvideos.py
@@ -3,12 +3,12 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
-)
+from ..compat import compat_urllib_parse_unquote
from ..utils import (
clean_html,
ExtractorError,
+ determine_ext,
+ sanitized_Request,
)
@@ -25,6 +25,8 @@ class XVideosIE(InfoExtractor):
}
}
+ _ANDROID_USER_AGENT = 'Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19'
+
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
@@ -33,16 +35,37 @@ class XVideosIE(InfoExtractor):
if mobj:
raise ExtractorError('%s said: %s' % (self.IE_NAME, clean_html(mobj.group(1))), expected=True)
- video_url = compat_urllib_parse.unquote(
+ video_url = compat_urllib_parse_unquote(
self._search_regex(r'flv_url=(.+?)&', webpage, 'video URL'))
video_title = self._html_search_regex(
r'<title>(.*?)\s+-\s+XVID', webpage, 'title')
video_thumbnail = self._search_regex(
r'url_bigthumb=(.+?)&amp', webpage, 'thumbnail', fatal=False)
+ formats = [{
+ 'url': video_url,
+ }]
+
+ android_req = sanitized_Request(url)
+ android_req.add_header('User-Agent', self._ANDROID_USER_AGENT)
+ android_webpage = self._download_webpage(android_req, video_id, fatal=False)
+
+ if android_webpage is not None:
+ player_params_str = self._search_regex(
+ 'mobileReplacePlayerDivTwoQual\(([^)]+)\)',
+ android_webpage, 'player parameters', default='')
+ player_params = list(map(lambda s: s.strip(' \''), player_params_str.split(',')))
+ if player_params:
+ formats.extend([{
+ 'url': param,
+ 'preference': -10,
+ } for param in player_params if determine_ext(param) == 'mp4'])
+
+ self._sort_formats(formats)
+
return {
'id': video_id,
- 'url': video_url,
+ 'formats': formats,
'title': video_title,
'ext': 'flv',
'thumbnail': video_thumbnail,
diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py
index f8e7041a0..fca5ddc69 100644
--- a/youtube_dl/extractor/yahoo.py
+++ b/youtube_dl/extractor/yahoo.py
@@ -15,16 +15,18 @@ from ..utils import (
unescapeHTML,
ExtractorError,
int_or_none,
+ mimetype2ext,
)
+from .nbc import NBCSportsVPlayerIE
+
class YahooIE(InfoExtractor):
IE_DESC = 'Yahoo screen and movies'
- _VALID_URL = r'(?P<url>(?P<host>https?://(?:[a-zA-Z]{2}\.)?[\da-zA-Z_-]+\.yahoo\.com)/(?:[^/]+/)*(?P<display_id>.+?)-(?P<id>[0-9]+)(?:-[a-z]+)?\.html)'
+ _VALID_URL = r'(?P<url>(?P<host>https?://(?:[a-zA-Z]{2}\.)?[\da-zA-Z_-]+\.yahoo\.com)/(?:[^/]+/)*(?P<display_id>.+)?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html)'
_TESTS = [
{
'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
- 'md5': '4962b075c08be8690a922ee026d05e69',
'info_dict': {
'id': '2d25e626-2378-391f-ada0-ddaf1417e588',
'ext': 'mp4',
@@ -99,7 +101,7 @@ class YahooIE(InfoExtractor):
}
}, {
'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
- 'md5': '67010fdf3a08d290e060a4dd96baa07b',
+ 'md5': '88e209b417f173d86186bef6e4d1f160',
'info_dict': {
'id': 'f885cf7f-43d4-3450-9fac-46ac30ece521',
'ext': 'mp4',
@@ -130,12 +132,35 @@ class YahooIE(InfoExtractor):
}, {
'url': 'https://gma.yahoo.com/pizza-delivery-man-surprised-huge-tip-college-kids-195200785.html',
'only_matching': True,
+ }, {
+ 'note': 'NBC Sports embeds',
+ 'url': 'http://sports.yahoo.com/blogs/ncaab-the-dagger/tyler-kalinoski-s-buzzer-beater-caps-davidson-s-comeback-win-185609842.html?guid=nbc_cbk_davidsonbuzzerbeater_150313',
+ 'info_dict': {
+ 'id': '9CsDKds0kvHI',
+ 'ext': 'flv',
+ 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d',
+ 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson',
+ }
+ }, {
+ 'url': 'https://tw.news.yahoo.com/-100120367.html',
+ 'only_matching': True,
+ }, {
+ # Query result is embedded in webpage, but explicit request to video API fails with geo restriction
+ 'url': 'https://screen.yahoo.com/community/communitary-community-episode-1-ladders-154501237.html',
+ 'md5': '4fbafb9c9b6f07aa8f870629f6671b35',
+ 'info_dict': {
+ 'id': '1f32853c-a271-3eef-8cb6-f6d6872cb504',
+ 'ext': 'mp4',
+ 'title': 'Communitary - Community Episode 1: Ladders',
+ 'description': 'md5:8fc39608213295748e1e289807838c97',
+ 'duration': 1646,
+ },
}
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- display_id = mobj.group('display_id')
+ display_id = mobj.group('display_id') or self._match_id(url)
page_id = mobj.group('id')
url = mobj.group('url')
host = mobj.group('host')
@@ -152,6 +177,23 @@ class YahooIE(InfoExtractor):
items = json.loads(items_json)
video_id = items[0]['id']
return self._get_info(video_id, display_id, webpage)
+ # Look for NBCSports iframes
+ nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
+ if nbc_sports_url:
+ return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
+
+ # Query result is often embedded in webpage as JSON. Sometimes explicit requests
+ # to video API results in a failure with geo restriction reason therefore using
+ # embedded query result when present sounds reasonable.
+ config_json = self._search_regex(
+ r'window\.Af\.bootstrap\[[^\]]+\]\s*=\s*({.*?"applet_type"\s*:\s*"td-applet-videoplayer".*?});(?:</script>|$)',
+ webpage, 'videoplayer applet', default=None)
+ if config_json:
+ config = self._parse_json(config_json, display_id, fatal=False)
+ if config:
+ sapi = config.get('models', {}).get('applet_model', {}).get('data', {}).get('sapi')
+ if sapi:
+ return self._extract_info(display_id, sapi, webpage)
items_json = self._search_regex(
r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE,
@@ -172,22 +214,10 @@ class YahooIE(InfoExtractor):
video_id = info['id']
return self._get_info(video_id, display_id, webpage)
- def _get_info(self, video_id, display_id, webpage):
- region = self._search_regex(
- r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"',
- webpage, 'region', fatal=False, default='US')
- data = compat_urllib_parse.urlencode({
- 'protocol': 'http',
- 'region': region,
- })
- query_url = (
- 'https://video.media.yql.yahoo.com/v1/video/sapi/streams/'
- '{id}?{data}'.format(id=video_id, data=data))
- query_result = self._download_json(
- query_url, display_id, 'Downloading video info')
-
- info = query_result['query']['results']['mediaObj'][0]
+ def _extract_info(self, display_id, query, webpage):
+ info = query['query']['results']['mediaObj'][0]
meta = info.get('meta')
+ video_id = info.get('id')
if not meta:
msg = info['status'].get('msg')
@@ -213,12 +243,31 @@ class YahooIE(InfoExtractor):
'ext': 'flv',
})
else:
+ if s.get('format') == 'm3u8_playlist':
+ format_info['protocol'] = 'm3u8_native'
+ format_info['ext'] = 'mp4'
format_url = compat_urlparse.urljoin(host, path)
format_info['url'] = format_url
formats.append(format_info)
self._sort_formats(formats)
+ closed_captions = self._html_search_regex(
+ r'"closedcaptions":(\[[^\]]+\])', webpage, 'closed captions',
+ default='[]')
+
+ cc_json = self._parse_json(closed_captions, video_id, fatal=False)
+ subtitles = {}
+ if cc_json:
+ for closed_caption in cc_json:
+ lang = closed_caption['lang']
+ if lang not in subtitles:
+ subtitles[lang] = []
+ subtitles[lang].append({
+ 'url': closed_caption['url'],
+ 'ext': mimetype2ext(closed_caption['content_type']),
+ })
+
return {
'id': video_id,
'display_id': display_id,
@@ -227,8 +276,24 @@ class YahooIE(InfoExtractor):
'description': clean_html(meta['description']),
'thumbnail': meta['thumbnail'] if meta.get('thumbnail') else self._og_search_thumbnail(webpage),
'duration': int_or_none(meta.get('duration')),
+ 'subtitles': subtitles,
}
+ def _get_info(self, video_id, display_id, webpage):
+ region = self._search_regex(
+ r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"',
+ webpage, 'region', fatal=False, default='US')
+ data = compat_urllib_parse.urlencode({
+ 'protocol': 'http',
+ 'region': region,
+ })
+ query_url = (
+ 'https://video.media.yql.yahoo.com/v1/video/sapi/streams/'
+ '{id}?{data}'.format(id=video_id, data=data))
+ query_result = self._download_json(
+ query_url, display_id, 'Downloading video info')
+ return self._extract_info(display_id, query_result, webpage)
+
class YahooSearchIE(SearchInfoExtractor):
IE_DESC = 'Yahoo screen search'
diff --git a/youtube_dl/extractor/yam.py b/youtube_dl/extractor/yam.py
new file mode 100644
index 000000000..001ee17b6
--- /dev/null
+++ b/youtube_dl/extractor/yam.py
@@ -0,0 +1,123 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+ float_or_none,
+ month_by_abbreviation,
+ ExtractorError,
+ get_element_by_attribute,
+)
+
+
+class YamIE(InfoExtractor):
+ IE_DESC = '蕃薯藤yam天空部落'
+ _VALID_URL = r'http://mymedia.yam.com/m/(?P<id>\d+)'
+
+ _TESTS = [{
+ # An audio hosted on Yam
+ 'url': 'http://mymedia.yam.com/m/2283921',
+ 'md5': 'c011b8e262a52d5473d9c2e3c9963b9c',
+ 'info_dict': {
+ 'id': '2283921',
+ 'ext': 'mp3',
+ 'title': '發現 - 趙薇 京華煙雲主題曲',
+ 'description': '發現 - 趙薇 京華煙雲主題曲',
+ 'uploader_id': 'princekt',
+ 'upload_date': '20080807',
+ 'duration': 313.0,
+ }
+ }, {
+ # An external video hosted on YouTube
+ 'url': 'http://mymedia.yam.com/m/3599430',
+ 'md5': '03127cf10d8f35d120a9e8e52e3b17c6',
+ 'info_dict': {
+ 'id': 'CNpEoQlrIgA',
+ 'ext': 'mp4',
+ 'upload_date': '20150306',
+ 'uploader': '新莊社大瑜伽社',
+ 'description': 'md5:11e2e405311633ace874f2e6226c8b17',
+ 'uploader_id': '2323agoy',
+ 'title': '20090412陽明山二子坪-1',
+ },
+ 'skip': 'Video does not exist',
+ }, {
+ 'url': 'http://mymedia.yam.com/m/3598173',
+ 'info_dict': {
+ 'id': '3598173',
+ 'ext': 'mp4',
+ },
+ 'skip': 'cause Yam system error',
+ }, {
+ 'url': 'http://mymedia.yam.com/m/3599437',
+ 'info_dict': {
+ 'id': '3599437',
+ 'ext': 'mp4',
+ },
+ 'skip': 'invalid YouTube URL',
+ }, {
+ 'url': 'http://mymedia.yam.com/m/2373534',
+ 'md5': '7ff74b91b7a817269d83796f8c5890b1',
+ 'info_dict': {
+ 'id': '2373534',
+ 'ext': 'mp3',
+ 'title': '林俊傑&蔡卓妍-小酒窩',
+ 'description': 'md5:904003395a0fcce6cfb25028ff468420',
+ 'upload_date': '20080928',
+ 'uploader_id': 'onliner2',
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ page = self._download_webpage(url, video_id)
+
+ # Check for errors
+ system_msg = self._html_search_regex(
+ r'系統訊息(?:<br>|\n|\r)*([^<>]+)<br>', page, 'system message',
+ default=None)
+ if system_msg:
+ raise ExtractorError(system_msg, expected=True)
+
+ # Is it hosted externally on YouTube?
+ youtube_url = self._html_search_regex(
+ r'<embed src="(http://www.youtube.com/[^"]+)"',
+ page, 'YouTube url', default=None)
+ if youtube_url:
+ return self.url_result(youtube_url, 'Youtube')
+
+ title = self._html_search_regex(
+ r'<h1[^>]+class="heading"[^>]*>\s*(.+)\s*</h1>', page, 'title')
+
+ api_page = self._download_webpage(
+ 'http://mymedia.yam.com/api/a/?pID=' + video_id, video_id,
+ note='Downloading API page')
+ api_result_obj = compat_urlparse.parse_qs(api_page)
+
+ info_table = get_element_by_attribute('class', 'info', page)
+ uploader_id = self._html_search_regex(
+ r'<!-- 發表作者 -->:[\n ]+<a href="/([a-z0-9]+)"',
+ info_table, 'uploader id', fatal=False)
+ mobj = re.search(r'<!-- 發表於 -->(?P<mon>[A-Z][a-z]{2})\s+' +
+ r'(?P<day>\d{1,2}), (?P<year>\d{4})', page)
+ if mobj:
+ upload_date = '%s%02d%02d' % (
+ mobj.group('year'),
+ month_by_abbreviation(mobj.group('mon')),
+ int(mobj.group('day')))
+ else:
+ upload_date = None
+ duration = float_or_none(api_result_obj['totaltime'][0], scale=1000)
+
+ return {
+ 'id': video_id,
+ 'url': api_result_obj['mp3file'][0],
+ 'title': title,
+ 'description': self._html_search_meta('description', page),
+ 'duration': duration,
+ 'uploader_id': uploader_id,
+ 'upload_date': upload_date,
+ }
diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py
new file mode 100644
index 000000000..d3cc1a29f
--- /dev/null
+++ b/youtube_dl/extractor/yandexmusic.py
@@ -0,0 +1,178 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import hashlib
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_str,
+ compat_urllib_parse,
+)
+from ..utils import (
+ int_or_none,
+ float_or_none,
+ sanitized_Request,
+)
+
+
+class YandexMusicTrackIE(InfoExtractor):
+ IE_NAME = 'yandexmusic:track'
+ IE_DESC = 'Яндекс.Музыка - Трек'
+ _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P<album_id>\d+)/track/(?P<id>\d+)'
+
+ _TEST = {
+ 'url': 'http://music.yandex.ru/album/540508/track/4878838',
+ 'md5': 'f496818aa2f60b6c0062980d2e00dc20',
+ 'info_dict': {
+ 'id': '4878838',
+ 'ext': 'mp3',
+ 'title': 'Carlo Ambrosio - Gypsy Eyes 1',
+ 'filesize': 4628061,
+ 'duration': 193.04,
+ }
+ }
+
+ def _get_track_url(self, storage_dir, track_id):
+ data = self._download_json(
+ 'http://music.yandex.ru/api/v1.5/handlers/api-jsonp.jsx?action=getTrackSrc&p=download-info/%s'
+ % storage_dir,
+ track_id, 'Downloading track location JSON')
+
+ key = hashlib.md5(('XGRlBW9FXlekgbPrRHuSiA' + data['path'][1:] + data['s']).encode('utf-8')).hexdigest()
+ storage = storage_dir.split('.')
+
+ return ('http://%s/get-mp3/%s/%s?track-id=%s&from=service-10-track&similarities-experiment=default'
+ % (data['host'], key, data['ts'] + data['path'], storage[1]))
+
+ def _get_track_info(self, track):
+ thumbnail = None
+ cover_uri = track.get('albums', [{}])[0].get('coverUri')
+ if cover_uri:
+ thumbnail = cover_uri.replace('%%', 'orig')
+ if not thumbnail.startswith('http'):
+ thumbnail = 'http://' + thumbnail
+ return {
+ 'id': track['id'],
+ 'ext': 'mp3',
+ 'url': self._get_track_url(track['storageDir'], track['id']),
+ 'title': '%s - %s' % (track['artists'][0]['name'], track['title']),
+ 'filesize': int_or_none(track.get('fileSize')),
+ 'duration': float_or_none(track.get('durationMs'), 1000),
+ 'thumbnail': thumbnail,
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ album_id, track_id = mobj.group('album_id'), mobj.group('id')
+
+ track = self._download_json(
+ 'http://music.yandex.ru/handlers/track.jsx?track=%s:%s' % (track_id, album_id),
+ track_id, 'Downloading track JSON')['track']
+
+ return self._get_track_info(track)
+
+
+class YandexMusicPlaylistBaseIE(InfoExtractor):
+ def _build_playlist(self, tracks):
+ return [
+ self.url_result(
+ 'http://music.yandex.ru/album/%s/track/%s' % (track['albums'][0]['id'], track['id']))
+ for track in tracks if track.get('albums') and isinstance(track.get('albums'), list)]
+
+
+class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE):
+ IE_NAME = 'yandexmusic:album'
+ IE_DESC = 'Яндекс.Музыка - Альбом'
+ _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P<id>\d+)/?(\?|$)'
+
+ _TEST = {
+ 'url': 'http://music.yandex.ru/album/540508',
+ 'info_dict': {
+ 'id': '540508',
+ 'title': 'Carlo Ambrosio - Gypsy Soul (2009)',
+ },
+ 'playlist_count': 50,
+ }
+
+ def _real_extract(self, url):
+ album_id = self._match_id(url)
+
+ album = self._download_json(
+ 'http://music.yandex.ru/handlers/album.jsx?album=%s' % album_id,
+ album_id, 'Downloading album JSON')
+
+ entries = self._build_playlist(album['volumes'][0])
+
+ title = '%s - %s' % (album['artists'][0]['name'], album['title'])
+ year = album.get('year')
+ if year:
+ title += ' (%s)' % year
+
+ return self.playlist_result(entries, compat_str(album['id']), title)
+
+
+class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE):
+ IE_NAME = 'yandexmusic:playlist'
+ IE_DESC = 'Яндекс.Музыка - Плейлист'
+ _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/users/[^/]+/playlists/(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'http://music.yandex.ru/users/music.partners/playlists/1245',
+ 'info_dict': {
+ 'id': '1245',
+ 'title': 'Что слушают Enter Shikari',
+ 'description': 'md5:3b9f27b0efbe53f2ee1e844d07155cc9',
+ },
+ 'playlist_count': 6,
+ }, {
+ # playlist exceeding the limit of 150 tracks shipped with webpage (see
+ # https://github.com/rg3/youtube-dl/issues/6666)
+ 'url': 'https://music.yandex.ru/users/ya.playlist/playlists/1036',
+ 'info_dict': {
+ 'id': '1036',
+ 'title': 'Музыка 90-х',
+ },
+ 'playlist_count': 310,
+ }]
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, playlist_id)
+
+ mu = self._parse_json(
+ self._search_regex(
+ r'var\s+Mu\s*=\s*({.+?});\s*</script>', webpage, 'player'),
+ playlist_id)
+
+ playlist = mu['pageData']['playlist']
+ tracks, track_ids = playlist['tracks'], playlist['trackIds']
+
+ # tracks dictionary shipped with webpage is limited to 150 tracks,
+ # missing tracks should be retrieved manually.
+ if len(tracks) < len(track_ids):
+ present_track_ids = set([compat_str(track['id']) for track in tracks if track.get('id')])
+ missing_track_ids = set(map(compat_str, track_ids)) - set(present_track_ids)
+ request = sanitized_Request(
+ 'https://music.yandex.ru/handlers/track-entries.jsx',
+ compat_urllib_parse.urlencode({
+ 'entries': ','.join(missing_track_ids),
+ 'lang': mu.get('settings', {}).get('lang', 'en'),
+ 'external-domain': 'music.yandex.ru',
+ 'overembed': 'false',
+ 'sign': mu.get('authData', {}).get('user', {}).get('sign'),
+ 'strict': 'true',
+ }).encode('utf-8'))
+ request.add_header('Referer', url)
+ request.add_header('X-Requested-With', 'XMLHttpRequest')
+
+ missing_tracks = self._download_json(
+ request, playlist_id, 'Downloading missing tracks JSON', fatal=False)
+ if missing_tracks:
+ tracks.extend(missing_tracks)
+
+ return self.playlist_result(
+ self._build_playlist(tracks),
+ compat_str(playlist_id),
+ playlist['title'], playlist.get('description'))
diff --git a/youtube_dl/extractor/yinyuetai.py b/youtube_dl/extractor/yinyuetai.py
new file mode 100644
index 000000000..834d860af
--- /dev/null
+++ b/youtube_dl/extractor/yinyuetai.py
@@ -0,0 +1,56 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class YinYueTaiIE(InfoExtractor):
+ IE_NAME = 'yinyuetai:video'
+ IE_DESC = '音悦Tai'
+ _VALID_URL = r'https?://v\.yinyuetai\.com/video(?:/h5)?/(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'http://v.yinyuetai.com/video/2322376',
+ 'md5': '6e3abe28d38e3a54b591f9f040595ce0',
+ 'info_dict': {
+ 'id': '2322376',
+ 'ext': 'mp4',
+ 'title': '少女时代_PARTY_Music Video Teaser',
+ 'creator': '少女时代',
+ 'duration': 25,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ }, {
+ 'url': 'http://v.yinyuetai.com/video/h5/2322376',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ info = self._download_json(
+ 'http://ext.yinyuetai.com/main/get-h-mv-info?json=true&videoId=%s' % video_id, video_id,
+ 'Downloading mv info')['videoInfo']['coreVideoInfo']
+
+ if info['error']:
+ raise ExtractorError(info['errorMsg'], expected=True)
+
+ formats = [{
+ 'url': format_info['videoUrl'],
+ 'format_id': format_info['qualityLevel'],
+ 'format': format_info.get('qualityLevelName'),
+ 'filesize': format_info.get('fileSize'),
+ # though URLs ends with .flv, the downloaded files are in fact mp4
+ 'ext': 'mp4',
+ 'tbr': format_info.get('bitrate'),
+ } for format_info in info['videoUrlModels']]
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': info['videoName'],
+ 'thumbnail': info.get('bigHeadImage'),
+ 'creator': info.get('artistNames'),
+ 'duration': info.get('duration'),
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/ynet.py b/youtube_dl/extractor/ynet.py
index 894678a23..869f3e819 100644
--- a/youtube_dl/extractor/ynet.py
+++ b/youtube_dl/extractor/ynet.py
@@ -5,7 +5,7 @@ import re
import json
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
+from ..compat import compat_urllib_parse_unquote_plus
class YnetIE(InfoExtractor):
@@ -34,7 +34,7 @@ class YnetIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- content = compat_urllib_parse.unquote_plus(self._og_search_video_url(webpage))
+ content = compat_urllib_parse_unquote_plus(self._og_search_video_url(webpage))
config = json.loads(self._search_regex(r'config=({.+?})$', content, 'video config'))
f4m_url = config['clip']['url']
title = self._og_search_title(webpage)
diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py
index 97b98bbe8..69ecc837a 100644
--- a/youtube_dl/extractor/youku.py
+++ b/youtube_dl/extractor/youku.py
@@ -1,123 +1,254 @@
# coding: utf-8
-
from __future__ import unicode_literals
-import math
-import random
-import re
-import time
+import base64
from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse,
+ compat_ord,
+)
from ..utils import (
ExtractorError,
+ sanitized_Request,
)
class YoukuIE(InfoExtractor):
+ IE_NAME = 'youku'
+ IE_DESC = '优酷'
_VALID_URL = r'''(?x)
(?:
http://(?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)|
youku:)
(?P<id>[A-Za-z0-9]+)(?:\.html|/v\.swf|)
'''
- _TEST = {
- 'url': 'http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html',
- 'md5': 'ffe3f2e435663dc2d1eea34faeff5b5b',
- 'params': {
- 'test': False
+
+ _TESTS = [{
+ 'url': 'http://v.youku.com/v_show/id_XMTc1ODE5Njcy.html',
+ 'md5': '5f3af4192eabacc4501508d54a8cabd7',
+ 'info_dict': {
+ 'id': 'XMTc1ODE5Njcy_part1',
+ 'title': '★Smile﹗♡ Git Fresh -Booty Music舞蹈.',
+ 'ext': 'flv'
+ }
+ }, {
+ 'url': 'http://player.youku.com/player.php/sid/XNDgyMDQ2NTQw/v.swf',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://v.youku.com/v_show/id_XODgxNjg1Mzk2_ev_1.html',
+ 'info_dict': {
+ 'id': 'XODgxNjg1Mzk2',
+ 'title': '武媚娘传奇 85',
+ },
+ 'playlist_count': 11,
+ }, {
+ 'url': 'http://v.youku.com/v_show/id_XMTI1OTczNDM5Mg==.html',
+ 'info_dict': {
+ 'id': 'XMTI1OTczNDM5Mg',
+ 'title': '花千骨 04',
},
+ 'playlist_count': 13,
+ 'skip': 'Available in China only',
+ }, {
+ 'url': 'http://v.youku.com/v_show/id_XNjA1NzA2Njgw.html',
+ 'note': 'Video protected with password',
'info_dict': {
- 'id': 'XNDgyMDQ2NTQw_part00',
- 'ext': 'flv',
- 'title': 'youtube-dl test video "\'/\\ä↭𝕐'
+ 'id': 'XNjA1NzA2Njgw',
+ 'title': '邢義田复旦讲座之想象中的胡人—从“左衽孔子”说起',
+ },
+ 'playlist_count': 19,
+ 'params': {
+ 'videopassword': '100600',
+ },
+ }]
+
+ def construct_video_urls(self, data1, data2):
+ # get sid, token
+ def yk_t(s1, s2):
+ ls = list(range(256))
+ t = 0
+ for i in range(256):
+ t = (t + ls[i] + compat_ord(s1[i % len(s1)])) % 256
+ ls[i], ls[t] = ls[t], ls[i]
+ s = bytearray()
+ x, y = 0, 0
+ for i in range(len(s2)):
+ y = (y + 1) % 256
+ x = (x + ls[y]) % 256
+ ls[x], ls[y] = ls[y], ls[x]
+ s.append(compat_ord(s2[i]) ^ ls[(ls[x] + ls[y]) % 256])
+ return bytes(s)
+
+ sid, token = yk_t(
+ b'becaf9be', base64.b64decode(data2['ep'].encode('ascii'))
+ ).decode('ascii').split('_')
+
+ # get oip
+ oip = data2['ip']
+
+ # get fileid
+ string_ls = list(
+ 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890')
+ shuffled_string_ls = []
+ seed = data1['seed']
+ N = len(string_ls)
+ for ii in range(N):
+ seed = (seed * 0xd3 + 0x754f) % 0x10000
+ idx = seed * len(string_ls) // 0x10000
+ shuffled_string_ls.append(string_ls[idx])
+ del string_ls[idx]
+
+ fileid_dict = {}
+ for format in data1['streamtypes']:
+ streamfileid = [
+ int(i) for i in data1['streamfileids'][format].strip('*').split('*')]
+ fileid = ''.join(
+ [shuffled_string_ls[i] for i in streamfileid])
+ fileid_dict[format] = fileid[:8] + '%s' + fileid[10:]
+
+ def get_fileid(format, n):
+ fileid = fileid_dict[format] % hex(int(n))[2:].upper().zfill(2)
+ return fileid
+
+ # get ep
+ def generate_ep(format, n):
+ fileid = get_fileid(format, n)
+ ep_t = yk_t(
+ b'bf7e5f01',
+ ('%s_%s_%s' % (sid, fileid, token)).encode('ascii')
+ )
+ ep = base64.b64encode(ep_t).decode('ascii')
+ return ep
+
+ # generate video_urls
+ video_urls_dict = {}
+ for format in data1['streamtypes']:
+ video_urls = []
+ for dt in data1['segs'][format]:
+ n = str(int(dt['no']))
+ param = {
+ 'K': dt['k'],
+ 'hd': self.get_hd(format),
+ 'myp': 0,
+ 'ts': dt['seconds'],
+ 'ypp': 0,
+ 'ctype': 12,
+ 'ev': 1,
+ 'token': token,
+ 'oip': oip,
+ 'ep': generate_ep(format, n)
+ }
+ video_url = \
+ 'http://k.youku.com/player/getFlvPath/' + \
+ 'sid/' + sid + \
+ '_' + str(int(n) + 1).zfill(2) + \
+ '/st/' + self.parse_ext_l(format) + \
+ '/fileid/' + get_fileid(format, n) + '?' + \
+ compat_urllib_parse.urlencode(param)
+ video_urls.append(video_url)
+ video_urls_dict[format] = video_urls
+
+ return video_urls_dict
+
+ def get_hd(self, fm):
+ hd_id_dict = {
+ 'flv': '0',
+ 'mp4': '1',
+ 'hd2': '2',
+ 'hd3': '3',
+ '3gp': '0',
+ '3gphd': '1'
+ }
+ return hd_id_dict[fm]
+
+ def parse_ext_l(self, fm):
+ ext_dict = {
+ 'flv': 'flv',
+ 'mp4': 'mp4',
+ 'hd2': 'flv',
+ 'hd3': 'flv',
+ '3gp': 'flv',
+ '3gphd': 'mp4'
+ }
+ return ext_dict[fm]
+
+ def get_format_name(self, fm):
+ _dict = {
+ '3gp': 'h6',
+ '3gphd': 'h5',
+ 'flv': 'h4',
+ 'mp4': 'h3',
+ 'hd2': 'h2',
+ 'hd3': 'h1'
}
- }
-
- def _gen_sid(self):
- nowTime = int(time.time() * 1000)
- random1 = random.randint(1000, 1998)
- random2 = random.randint(1000, 9999)
-
- return "%d%d%d" % (nowTime, random1, random2)
-
- def _get_file_ID_mix_string(self, seed):
- mixed = []
- source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
- seed = float(seed)
- for i in range(len(source)):
- seed = (seed * 211 + 30031) % 65536
- index = math.floor(seed / 65536 * len(source))
- mixed.append(source[int(index)])
- source.remove(source[int(index)])
- # return ''.join(mixed)
- return mixed
-
- def _get_file_id(self, fileId, seed):
- mixed = self._get_file_ID_mix_string(seed)
- ids = fileId.split('*')
- realId = []
- for ch in ids:
- if ch:
- realId.append(mixed[int(ch)])
- return ''.join(realId)
+ return _dict[fm]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
- info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
+ def retrieve_data(req_url, note):
+ req = sanitized_Request(req_url)
- config = self._download_json(info_url, video_id)
+ cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')
+ if cn_verification_proxy:
+ req.add_header('Ytdl-request-proxy', cn_verification_proxy)
- error_code = config['data'][0].get('error_code')
- if error_code:
- # -8 means blocked outside China.
- error = config['data'][0].get('error') # Chinese and English, separated by newline.
- raise ExtractorError(error or 'Server reported error %i' % error_code,
- expected=True)
+ raw_data = self._download_json(req, video_id, note=note)
+ return raw_data['data'][0]
+
+ video_password = self._downloader.params.get('videopassword', None)
- video_title = config['data'][0]['title']
- seed = config['data'][0]['seed']
+ # request basic data
+ basic_data_url = 'http://v.youku.com/player/getPlayList/VideoIDS/%s' % video_id
+ if video_password:
+ basic_data_url += '?password=%s' % video_password
- format = self._downloader.params.get('format', None)
- supported_format = list(config['data'][0]['streamfileids'].keys())
+ data1 = retrieve_data(
+ basic_data_url,
+ 'Downloading JSON metadata 1')
+ data2 = retrieve_data(
+ 'http://v.youku.com/player/getPlayList/VideoIDS/%s/Pf/4/ctype/12/ev/1' % video_id,
+ 'Downloading JSON metadata 2')
- # TODO proper format selection
- if format is None or format == 'best':
- if 'hd2' in supported_format:
- format = 'hd2'
+ error_code = data1.get('error_code')
+ if error_code:
+ error = data1.get('error')
+ if error is not None and '因版权原因无法观看此视频' in error:
+ raise ExtractorError(
+ 'Youku said: Sorry, this video is available in China only', expected=True)
else:
- format = 'flv'
- ext = 'flv'
- elif format == 'worst':
- format = 'mp4'
- ext = 'mp4'
- else:
- format = 'flv'
- ext = 'flv'
-
- fileid = config['data'][0]['streamfileids'][format]
- keys = [s['k'] for s in config['data'][0]['segs'][format]]
- # segs is usually a dictionary, but an empty *list* if an error occured.
-
- files_info = []
- sid = self._gen_sid()
- fileid = self._get_file_id(fileid, seed)
-
- # column 8,9 of fileid represent the segment number
- # fileid[7:9] should be changed
- for index, key in enumerate(keys):
- temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
- download_url = 'http://k.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
-
- info = {
- 'id': '%s_part%02d' % (video_id, index),
- 'url': download_url,
- 'uploader': None,
- 'upload_date': None,
- 'title': video_title,
- 'ext': ext,
- }
- files_info.append(info)
-
- return files_info
+ msg = 'Youku server reported error %i' % error_code
+ if error is not None:
+ msg += ': ' + error
+ raise ExtractorError(msg)
+
+ title = data1['title']
+
+ # generate video_urls_dict
+ video_urls_dict = self.construct_video_urls(data1, data2)
+
+ # construct info
+ entries = [{
+ 'id': '%s_part%d' % (video_id, i + 1),
+ 'title': title,
+ 'formats': [],
+ # some formats are not available for all parts, we have to detect
+ # which one has all
+ } for i in range(max(len(v) for v in data1['segs'].values()))]
+ for fm in data1['streamtypes']:
+ video_urls = video_urls_dict[fm]
+ for video_url, seg, entry in zip(video_urls, data1['segs'][fm], entries):
+ entry['formats'].append({
+ 'url': video_url,
+ 'format_id': self.get_format_name(fm),
+ 'ext': self.parse_ext_l(fm),
+ 'filesize': int(seg['size']),
+ })
+
+ return {
+ '_type': 'multi_video',
+ 'id': video_id,
+ 'title': title,
+ 'entries': entries,
+ }
diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py
index 107c9ac36..dd724085a 100644
--- a/youtube_dl/extractor/youporn.py
+++ b/youtube_dl/extractor/youporn.py
@@ -1,120 +1,171 @@
from __future__ import unicode_literals
-
-import json
import re
-import sys
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse_urlparse,
- compat_urllib_request,
-)
from ..utils import (
- ExtractorError,
+ int_or_none,
+ sanitized_Request,
+ str_to_int,
unescapeHTML,
unified_strdate,
)
-from ..aes import (
- aes_decrypt_text
-)
+from ..aes import aes_decrypt_text
class YouPornIE(InfoExtractor):
- _VALID_URL = r'^(?P<proto>https?://)(?:www\.)?(?P<url>youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+))'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?youporn\.com/watch/(?P<id>\d+)/(?P<display_id>[^/?#&]+)'
+ _TESTS = [{
'url': 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/',
+ 'md5': '71ec5fcfddacf80f495efa8b6a8d9a89',
'info_dict': {
'id': '505835',
+ 'display_id': 'sex-ed-is-it-safe-to-masturbate-daily',
'ext': 'mp4',
- 'upload_date': '20101221',
+ 'title': 'Sex Ed: Is It Safe To Masturbate Daily?',
'description': 'Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?',
+ 'thumbnail': 're:^https?://.*\.jpg$',
'uploader': 'Ask Dan And Jennifer',
- 'title': 'Sex Ed: Is It Safe To Masturbate Daily?',
+ 'upload_date': '20101221',
+ 'average_rating': int,
+ 'view_count': int,
+ 'comment_count': int,
+ 'categories': list,
+ 'tags': list,
'age_limit': 18,
- }
- }
+ },
+ }, {
+ # Anonymous User uploader
+ 'url': 'http://www.youporn.com/watch/561726/big-tits-awesome-brunette-on-amazing-webcam-show/?from=related3&al=2&from_id=561726&pos=4',
+ 'info_dict': {
+ 'id': '561726',
+ 'display_id': 'big-tits-awesome-brunette-on-amazing-webcam-show',
+ 'ext': 'mp4',
+ 'title': 'Big Tits Awesome Brunette On amazing webcam show',
+ 'description': 'http://sweetlivegirls.com Big Tits Awesome Brunette On amazing webcam show.mp4',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'uploader': 'Anonymous User',
+ 'upload_date': '20111125',
+ 'average_rating': int,
+ 'view_count': int,
+ 'comment_count': int,
+ 'categories': list,
+ 'tags': list,
+ 'age_limit': 18,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('videoid')
- url = mobj.group('proto') + 'www.' + mobj.group('url')
+ video_id = mobj.group('id')
+ display_id = mobj.group('display_id')
- req = compat_urllib_request.Request(url)
- req.add_header('Cookie', 'age_verified=1')
- webpage = self._download_webpage(req, video_id)
- age_limit = self._rta_search(webpage)
+ request = sanitized_Request(url)
+ request.add_header('Cookie', 'age_verified=1')
+ webpage = self._download_webpage(request, display_id)
+
+ # Every pattern must define the named group 'title' because group='title' is requested below
+ title = self._search_regex(
+ [r'(?:video_titles|videoTitle)\s*[:=]\s*(["\'])(?P<title>.+?)\1',
+ r'<h1[^>]+class=["\']heading\d?["\'][^>]*>(?P<title>[^<]+)<'],
+ webpage, 'title', group='title')
- # Get JSON parameters
- json_params = self._search_regex(
- r'var currentVideo = new Video\((.*)\)[,;]',
- webpage, 'JSON parameters')
- try:
- params = json.loads(json_params)
- except:
- raise ExtractorError('Invalid JSON')
-
- self.report_extraction(video_id)
- try:
- video_title = params['title']
- upload_date = unified_strdate(params['release_date_f'])
- video_description = params['description']
- video_uploader = params['submitted_by']
- thumbnail = params['thumbnails'][0]['image']
- except KeyError:
- raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
-
- # Get all of the links from the page
- DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
- download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
- webpage, 'download list').strip()
- LINK_RE = r'<a href="([^"]+)">'
- links = re.findall(LINK_RE, download_list_html)
-
- # Get all encrypted links
- encrypted_links = re.findall(r'var encryptedQuality[0-9]{3}URL = \'([a-zA-Z0-9+/]+={0,2})\';', webpage)
- for encrypted_link in encrypted_links:
- link = aes_decrypt_text(encrypted_link, video_title, 32).decode('utf-8')
+ links = []
+
+ sources = self._search_regex(
+ r'sources\s*:\s*({.+?})', webpage, 'sources', default=None)
+ if sources:
+ for _, link in re.findall(r'[^:]+\s*:\s*(["\'])(http.+?)\1', sources):
+ links.append(link)
+
+ # Fallback #1
+ for _, link in re.findall(
+ r'(?:videoUrl|videoSrc|videoIpadUrl|html5PlayerSrc)\s*[:=]\s*(["\'])(http.+?)\1', webpage):
+ links.append(link)
+
+ # Fallback #2, this also contains extra low quality 180p format
+ for _, link in re.findall(r'<a[^>]+href=(["\'])(http.+?)\1[^>]+title=["\']Download [Vv]ideo', webpage):
links.append(link)
+ # Fallback #3, encrypted links
+ for _, encrypted_link in re.findall(
+ r'encryptedQuality\d{3,4}URL\s*=\s*(["\'])([\da-zA-Z+/=]+)\1', webpage):
+ links.append(aes_decrypt_text(encrypted_link, title, 32).decode('utf-8'))
+
formats = []
- for link in links:
- # A link looks like this:
- # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
- # A path looks like this:
- # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
- video_url = unescapeHTML(link)
- path = compat_urllib_parse_urlparse(video_url).path
- format_parts = path.split('/')[4].split('_')[:2]
-
- dn = compat_urllib_parse_urlparse(video_url).netloc.partition('.')[0]
-
- resolution = format_parts[0]
- height = int(resolution[:-len('p')])
- bitrate = int(format_parts[1][:-len('k')])
- format = '-'.join(format_parts) + '-' + dn
-
- formats.append({
+ for video_url in set(unescapeHTML(link) for link in links):
+ f = {
'url': video_url,
- 'format': format,
- 'format_id': format,
- 'height': height,
- 'tbr': bitrate,
- 'resolution': resolution,
- })
-
+ }
+ # Video URL's path looks like this:
+ # /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
+ # We will benefit from it by extracting some metadata
+ mobj = re.search(r'/(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+/', video_url)
+ if mobj:
+ height = int(mobj.group('height'))
+ bitrate = int(mobj.group('bitrate'))
+ f.update({
+ 'format_id': '%dp-%dk' % (height, bitrate),
+ 'height': height,
+ 'tbr': bitrate,
+ })
+ formats.append(f)
self._sort_formats(formats)
- if not formats:
- raise ExtractorError('ERROR: no known formats available for video')
+ description = self._html_search_regex(
+ r'(?s)<div[^>]+class=["\']video-description["\'][^>]*>(.+?)</div>',
+ webpage, 'description', default=None)
+ thumbnail = self._search_regex(
+ r'(?:imageurl\s*=|poster\s*:)\s*(["\'])(?P<thumbnail>.+?)\1',
+ webpage, 'thumbnail', fatal=False, group='thumbnail')
+
+ uploader = self._html_search_regex(
+ r'(?s)<div[^>]+class=["\']videoInfoBy["\'][^>]*>\s*By:\s*</div>(.+?)</(?:a|div)>',
+ webpage, 'uploader', fatal=False)
+ upload_date = unified_strdate(self._html_search_regex(
+ r'(?s)<div[^>]+class=["\']videoInfoTime["\'][^>]*>(.+?)</div>',
+ webpage, 'upload date', fatal=False))
+
+ age_limit = self._rta_search(webpage)
+
+ average_rating = int_or_none(self._search_regex(
+ r'<div[^>]+class=["\']videoInfoRating["\'][^>]*>\s*<div[^>]+class=["\']videoRatingPercentage["\'][^>]*>(\d+)%</div>',
+ webpage, 'average rating', fatal=False))
+
+ view_count = str_to_int(self._search_regex(
+ r'(?s)<div[^>]+class=["\']videoInfoViews["\'][^>]*>.*?([\d,.]+)\s*</div>',
+ webpage, 'view count', fatal=False))
+ comment_count = str_to_int(self._search_regex(
+ r'>All [Cc]omments? \(([\d,.]+)\)',
+ webpage, 'comment count', fatal=False))
+
+ def extract_tag_box(title):
+ tag_box = self._search_regex(
+ (r'<div[^>]+class=["\']tagBoxTitle["\'][^>]*>\s*%s\b.*?</div>\s*'
+ '<div[^>]+class=["\']tagBoxContent["\']>(.+?)</div>') % re.escape(title),
+ webpage, '%s tag box' % title, default=None)
+ if not tag_box:
+ return []
+ return re.findall(r'<a[^>]+href=[^>]+>([^<]+)', tag_box)
+
+ categories = extract_tag_box('Category')
+ tags = extract_tag_box('Tags')
return {
'id': video_id,
- 'uploader': video_uploader,
- 'upload_date': upload_date,
- 'title': video_title,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
'thumbnail': thumbnail,
- 'description': video_description,
+ 'uploader': uploader,
+ 'upload_date': upload_date,
+ 'average_rating': average_rating,
+ 'view_count': view_count,
+ 'comment_count': comment_count,
+ 'categories': categories,
+ 'tags': tags,
'age_limit': age_limit,
'formats': formats,
}
diff --git a/youtube_dl/extractor/yourupload.py b/youtube_dl/extractor/yourupload.py
index 40fc4165f..4e25d6f22 100644
--- a/youtube_dl/extractor/yourupload.py
+++ b/youtube_dl/extractor/yourupload.py
@@ -1,8 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
@@ -16,7 +14,7 @@ class YourUploadIE(InfoExtractor):
_TESTS = [
{
'url': 'http://yourupload.com/watch/14i14h',
- 'md5': 'bf5c2f95c4c917536e80936af7bc51e1',
+ 'md5': '5e2c63385454c557f97c4c4131a393cd',
'info_dict': {
'id': '14i14h',
'ext': 'mp4',
@@ -35,24 +33,21 @@ class YourUploadIE(InfoExtractor):
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
- url = 'http://embed.yucache.net/{0:}'.format(video_id)
- webpage = self._download_webpage(url, video_id)
+ embed_url = 'http://embed.yucache.net/{0:}'.format(video_id)
+ webpage = self._download_webpage(embed_url, video_id)
title = self._og_search_title(webpage)
- thumbnail = self._og_search_thumbnail(webpage)
- url = self._og_search_video_url(webpage)
-
- formats = [{
- 'format_id': 'sd',
- 'url': url,
- }]
+ video_url = self._og_search_video_url(webpage)
+ thumbnail = self._og_search_thumbnail(webpage, default=None)
return {
'id': video_id,
'title': title,
- 'formats': formats,
+ 'url': video_url,
'thumbnail': thumbnail,
+ 'http_headers': {
+ 'Referer': embed_url,
+ },
}
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 35ef4c303..52f4fe36d 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -11,36 +11,44 @@ import time
import traceback
from .common import InfoExtractor, SearchInfoExtractor
-from .subtitles import SubtitlesInfoExtractor
from ..jsinterp import JSInterpreter
from ..swfinterp import SWFInterpreter
from ..compat import (
compat_chr,
compat_parse_qs,
compat_urllib_parse,
- compat_urllib_request,
+ compat_urllib_parse_unquote,
+ compat_urllib_parse_unquote_plus,
+ compat_urllib_parse_urlparse,
compat_urlparse,
compat_str,
)
from ..utils import (
clean_html,
+ encode_dict,
ExtractorError,
float_or_none,
get_element_by_attribute,
get_element_by_id,
int_or_none,
- OnDemandPagedList,
orderedSet,
+ parse_duration,
+ remove_start,
+ sanitized_Request,
+ smuggle_url,
+ str_to_int,
unescapeHTML,
unified_strdate,
+ unsmuggle_url,
uppercase_escape,
+ ISO3166Utils,
)
class YoutubeBaseInfoExtractor(InfoExtractor):
"""Provide base functions for Youtube extractors"""
_LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
- _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
+ _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
_NETRC_MACHINE = 'youtube'
# If True it will raise an error if no login info is provided
_LOGIN_REQUIRED = False
@@ -51,6 +59,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
# YouTube sets the expire time to about two months
expire_time=time.time() + 2 * 30 * 24 * 3600)
+ def _ids_to_results(self, ids):
+ return [
+ self.url_result(vid_id, 'Youtube', video_id=vid_id)
+ for vid_id in ids]
+
def _login(self):
"""
Attempt to log in to YouTube.
@@ -99,12 +112,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
'hl': 'en_US',
}
- # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
- # chokes on unicode
- login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
- login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
+ login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('ascii')
- req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
+ req = sanitized_Request(self._LOGIN_URL, login_data)
login_results = self._download_webpage(
req, None,
note='Logging in', errnote='unable to log in', fatal=False)
@@ -117,44 +127,27 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
# Two-Factor
# TODO add SMS and phone call support - these require making a request and then prompting the user
- if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
- tfa_code = self._get_tfa_info()
+ if re.search(r'(?i)<form[^>]* id="challenge"', login_results) is not None:
+ tfa_code = self._get_tfa_info('2-step verification code')
- if tfa_code is None:
- self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
- self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
+ if not tfa_code:
+ self._downloader.report_warning(
+ 'Two-factor authentication required. Provide it either interactively or with --twofactor <code> '
+ '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
return False
- # Unlike the first login form, secTok and timeStmp are both required for the TFA form
-
- match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
- if match is None:
- self._downloader.report_warning('Failed to get secTok - did the page structure change?')
- secTok = match.group(1)
- match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
- if match is None:
- self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
- timeStmp = match.group(1)
-
- tfa_form_strs = {
- 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
- 'smsToken': '',
- 'smsUserPin': tfa_code,
- 'smsVerifyPin': 'Verify',
-
- 'PersistentCookie': 'yes',
- 'checkConnection': '',
- 'checkedDomains': 'youtube',
- 'pstMsg': '1',
- 'secTok': secTok,
- 'timeStmp': timeStmp,
- 'service': 'youtube',
- 'hl': 'en_US',
- }
- tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
- tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')
+ tfa_code = remove_start(tfa_code, 'G-')
- tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
+ tfa_form_strs = self._form_hidden_inputs('challenge', login_results)
+
+ tfa_form_strs.update({
+ 'Pin': tfa_code,
+ 'TrustDevice': 'on',
+ })
+
+ tfa_data = compat_urllib_parse.urlencode(encode_dict(tfa_form_strs)).encode('ascii')
+
+ tfa_req = sanitized_Request(self._TWOFACTOR_URL, tfa_data)
tfa_results = self._download_webpage(
tfa_req, None,
note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)
@@ -162,8 +155,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
if tfa_results is False:
return False
- if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
- self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
+ if re.search(r'(?i)<form[^>]* id="challenge"', tfa_results) is not None:
+ self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.')
return False
if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
self._downloader.report_warning('unable to log in - did the page structure change?')
@@ -185,7 +178,70 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
return
-class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
+class YoutubeEntryListBaseInfoExtractor(InfoExtractor):
+ # Extract entries from page with "Load more" button
+ def _entries(self, page, playlist_id):
+ more_widget_html = content_html = page
+ for page_num in itertools.count(1):
+ for entry in self._process_page(content_html):
+ yield entry
+
+ mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
+ if not mobj:
+ break
+
+ more = self._download_json(
+ 'https://youtube.com/%s' % mobj.group('more'), playlist_id,
+ 'Downloading page #%s' % page_num,
+ transform_source=uppercase_escape)
+ content_html = more['content_html']
+ if not content_html.strip():
+ # Some webpages show a "Load more" button but they don't
+ # have more videos
+ break
+ more_widget_html = more['load_more_widget_html']
+
+
+class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
+ def _process_page(self, content):
+ for video_id, video_title in self.extract_videos_from_page(content):
+ yield self.url_result(video_id, 'Youtube', video_id, video_title)
+
+ def extract_videos_from_page(self, page):
+ ids_in_page = []
+ titles_in_page = []
+ for mobj in re.finditer(self._VIDEO_RE, page):
+ # The link with index 0 is not the first video of the playlist (not sure if this still applies)
+ if 'index' in mobj.groupdict() and mobj.group('id') == '0':
+ continue
+ video_id = mobj.group('id')
+ video_title = unescapeHTML(mobj.group('title'))
+ if video_title:
+ video_title = video_title.strip()
+ try:
+ idx = ids_in_page.index(video_id)
+ if video_title and not titles_in_page[idx]:
+ titles_in_page[idx] = video_title
+ except ValueError:
+ ids_in_page.append(video_id)
+ titles_in_page.append(video_title)
+ return zip(ids_in_page, titles_in_page)
+
+
+class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
+ def _process_page(self, content):
+ for playlist_id in re.findall(r'href="/?playlist\?list=(.+?)"', content):
+ yield self.url_result(
+ 'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist')
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+ webpage = self._download_webpage(url, playlist_id)
+ title = self._og_search_title(webpage, fatal=False)
+ return self.playlist_result(self._entries(webpage, playlist_id), playlist_id, title)
+
+
+class YoutubeIE(YoutubeBaseInfoExtractor):
IE_DESC = 'YouTube.com'
_VALID_URL = r"""(?x)^
(
@@ -202,11 +258,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
|(?: # or the v= param in all its forms
(?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
(?:\?|\#!?) # the params delimiter ? or # or #!
- (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
+ (?:.*?&)?? # any other preceding param (like /?s=tuff&v=xxxx)
v=
)
))
- |youtu\.be/ # just youtu.be/xxxx
+ |(?:
+ youtu\.be| # just youtu.be/xxxx
+ vid\.plus # or vid.plus/xxxx
+ )/
|(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
)
)? # all until now is optional -> you can pass the naked ID
@@ -231,6 +290,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'44': {'ext': 'webm', 'width': 854, 'height': 480},
'45': {'ext': 'webm', 'width': 1280, 'height': 720},
'46': {'ext': 'webm', 'width': 1920, 'height': 1080},
+ '59': {'ext': 'mp4', 'width': 854, 'height': 480},
+ '78': {'ext': 'mp4', 'width': 854, 'height': 480},
# 3d videos
@@ -270,13 +331,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
# Dash webm
- '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
- '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
- '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
- '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
- '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
- '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
- '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
+ '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
+ '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
+ '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
+ '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
+ '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
+ '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
+ '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'vp9'},
'242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
@@ -286,11 +347,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
'272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
- '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
- '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
- '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
- '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'},
- '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
+ '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
+ '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
+ '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
+ '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'vp9'},
+ '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'},
# Dash webm audio
'171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
@@ -308,7 +369,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
IE_NAME = 'youtube'
_TESTS = [
{
- 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
+ 'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&t=1s&end=9',
'info_dict': {
'id': 'BaW_jenozKc',
'ext': 'mp4',
@@ -318,8 +379,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'upload_date': '20121002',
'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
'categories': ['Science & Technology'],
+ 'tags': ['youtube-dl'],
'like_count': int,
'dislike_count': int,
+ 'start_time': 1,
+ 'end_time': 9,
}
},
{
@@ -330,7 +394,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'ext': 'mp4',
'upload_date': '20120506',
'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
- 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
+ 'description': 'md5:782e8651347686cba06e58f71ab51773',
+ 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
+ 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
+ 'iconic ep', 'iconic', 'love', 'it'],
'uploader': 'Icona Pop',
'uploader_id': 'IconaPop',
}
@@ -346,6 +413,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'description': 'md5:64249768eec3bc4276236606ea996373',
'uploader': 'justintimberlakeVEVO',
'uploader_id': 'justintimberlakeVEVO',
+ 'age_limit': 18,
}
},
{
@@ -358,10 +426,31 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
'uploader': 'SET India',
- 'uploader_id': 'setindia'
+ 'uploader_id': 'setindia',
+ 'age_limit': 18,
}
},
{
+ 'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&v=UxxajLWwzqY',
+ 'note': 'Use the first video ID in the URL',
+ 'info_dict': {
+ 'id': 'BaW_jenozKc',
+ 'ext': 'mp4',
+ 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
+ 'uploader': 'Philipp Hagemeister',
+ 'uploader_id': 'phihag',
+ 'upload_date': '20121002',
+ 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
+ 'categories': ['Science & Technology'],
+ 'tags': ['youtube-dl'],
+ 'like_count': int,
+ 'dislike_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
'note': '256k DASH audio (format 141) via DASH manifest',
'info_dict': {
@@ -402,7 +491,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'id': 'nfWlot6h_JM',
'ext': 'm4a',
'title': 'Taylor Swift - Shake It Off',
- 'description': 'md5:2acfda1b285bdd478ccec22f9918199d',
+ 'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3',
'uploader': 'TaylorSwiftVEVO',
'uploader_id': 'TaylorSwiftVEVO',
'upload_date': '20140818',
@@ -436,6 +525,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'uploader': 'The Witcher',
'uploader_id': 'WitcherGame',
'upload_date': '20140605',
+ 'age_limit': 18,
},
},
# Age-gate video with encrypted signature
@@ -449,6 +539,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'uploader': 'LloydVEVO',
'uploader_id': 'LloydVEVO',
'upload_date': '20110629',
+ 'age_limit': 18,
},
},
# video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
@@ -473,7 +564,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'info_dict': {
'id': 'lqQg6PlCWgI',
'ext': 'mp4',
- 'upload_date': '20120731',
+ 'upload_date': '20150827',
'uploader_id': 'olympic',
'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
'uploader': 'Olympics',
@@ -496,7 +587,149 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'uploader': '孫艾倫',
'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
},
- }
+ },
+ # url_encoded_fmt_stream_map is empty string
+ {
+ 'url': 'qEJwOuvDf7I',
+ 'info_dict': {
+ 'id': 'qEJwOuvDf7I',
+ 'ext': 'webm',
+ 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
+ 'description': '',
+ 'upload_date': '20150404',
+ 'uploader_id': 'spbelect',
+ 'uploader': 'Наблюдатели Петербурга',
+ },
+ 'params': {
+ 'skip_download': 'requires avconv',
+ }
+ },
+ # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097)
+ {
+ 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
+ 'info_dict': {
+ 'id': 'FIl7x6_3R5Y',
+ 'ext': 'mp4',
+ 'title': 'md5:7b81415841e02ecd4313668cde88737a',
+ 'description': 'md5:116377fd2963b81ec4ce64b542173306',
+ 'upload_date': '20150625',
+ 'uploader_id': 'dorappi2000',
+ 'uploader': 'dorappi2000',
+ 'formats': 'mincount:33',
+ },
+ },
+ # DASH manifest with segment_list
+ {
+ 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
+ 'md5': '8ce563a1d667b599d21064e982ab9e31',
+ 'info_dict': {
+ 'id': 'CsmdDsKjzN8',
+ 'ext': 'mp4',
+ 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
+ 'uploader': 'Airtek',
+ 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
+ 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
+ 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
+ },
+ 'params': {
+ 'youtube_include_dash_manifest': True,
+ 'format': '135', # bestvideo
+ }
+ },
+ {
+ # Multifeed videos (multiple cameras), URL is for Main Camera
+ 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
+ 'info_dict': {
+ 'id': 'jqWvoWXjCVs',
+ 'title': 'teamPGP: Rocket League Noob Stream',
+ 'description': 'md5:dc7872fb300e143831327f1bae3af010',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': 'jqWvoWXjCVs',
+ 'ext': 'mp4',
+ 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
+ 'description': 'md5:dc7872fb300e143831327f1bae3af010',
+ 'upload_date': '20150721',
+ 'uploader': 'Beer Games Beer',
+ 'uploader_id': 'beergamesbeer',
+ },
+ }, {
+ 'info_dict': {
+ 'id': '6h8e8xoXJzg',
+ 'ext': 'mp4',
+ 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
+ 'description': 'md5:dc7872fb300e143831327f1bae3af010',
+ 'upload_date': '20150721',
+ 'uploader': 'Beer Games Beer',
+ 'uploader_id': 'beergamesbeer',
+ },
+ }, {
+ 'info_dict': {
+ 'id': 'PUOgX5z9xZw',
+ 'ext': 'mp4',
+ 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
+ 'description': 'md5:dc7872fb300e143831327f1bae3af010',
+ 'upload_date': '20150721',
+ 'uploader': 'Beer Games Beer',
+ 'uploader_id': 'beergamesbeer',
+ },
+ }, {
+ 'info_dict': {
+ 'id': 'teuwxikvS5k',
+ 'ext': 'mp4',
+ 'title': 'teamPGP: Rocket League Noob Stream (zim)',
+ 'description': 'md5:dc7872fb300e143831327f1bae3af010',
+ 'upload_date': '20150721',
+ 'uploader': 'Beer Games Beer',
+ 'uploader_id': 'beergamesbeer',
+ },
+ }],
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://vid.plus/FlRa-iH7PGw',
+ 'only_matching': True,
+ },
+ {
+ # Title with JS-like syntax "};" (see https://github.com/rg3/youtube-dl/issues/7468)
+ 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
+ 'info_dict': {
+ 'id': 'lsguqyKfVQg',
+ 'ext': 'mp4',
+ 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
+ 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
+ 'upload_date': '20151119',
+ 'uploader_id': 'IronSoulElf',
+ 'uploader': 'IronSoulElf',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
+ # Tags with '};' (see https://github.com/rg3/youtube-dl/issues/7468)
+ 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
+ 'only_matching': True,
+ },
+ {
+ # Video with yt:stretch=17:0
+ 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
+ 'info_dict': {
+ 'id': 'Q39EVAstoRM',
+ 'ext': 'mp4',
+ 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
+ 'description': 'md5:ee18a25c350637c8faff806845bddee9',
+ 'upload_date': '20151107',
+ 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
+ 'uploader': 'CH GAMER DROID',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
]
def __init__(self, *args, **kwargs):
@@ -525,7 +758,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
def _extract_signature_function(self, video_id, player_url, example_sig):
id_m = re.match(
- r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
+ r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|/base)?\.(?P<ext>[a-z]+)$',
player_url)
if not id_m:
raise ExtractorError('Cannot identify player %r' % player_url)
@@ -541,26 +774,30 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
if cache_spec is not None:
return lambda s: ''.join(s[i] for i in cache_spec)
+ download_note = (
+ 'Downloading player %s' % player_url
+ if self._downloader.params.get('verbose') else
+ 'Downloading %s player %s' % (player_type, player_id)
+ )
if player_type == 'js':
code = self._download_webpage(
player_url, video_id,
- note='Downloading %s player %s' % (player_type, player_id),
+ note=download_note,
errnote='Download of %s failed' % player_url)
res = self._parse_sig_js(code)
elif player_type == 'swf':
urlh = self._request_webpage(
player_url, video_id,
- note='Downloading %s player %s' % (player_type, player_id),
+ note=download_note,
errnote='Download of %s failed' % player_url)
code = urlh.read()
res = self._parse_sig_swf(code)
else:
assert False, 'Invalid player type %r' % player_type
- if cache_spec is None:
- test_string = ''.join(map(compat_chr, range(len(example_sig))))
- cache_res = res(test_string)
- cache_spec = [ord(c) for c in cache_res]
+ test_string = ''.join(map(compat_chr, range(len(example_sig))))
+ cache_res = res(test_string)
+ cache_spec = [ord(c) for c in cache_res]
self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
return res
@@ -644,7 +881,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
raise ExtractorError(
'Signature extraction failed: ' + tb, cause=e)
- def _get_available_subtitles(self, video_id, webpage):
+ def _get_subtitles(self, video_id, webpage):
try:
subs_doc = self._download_xml(
'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
@@ -658,30 +895,51 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
lang = track.attrib['lang_code']
if lang in sub_lang_list:
continue
- params = compat_urllib_parse.urlencode({
- 'lang': lang,
- 'v': video_id,
- 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
- 'name': track.attrib['name'].encode('utf-8'),
- })
- url = 'https://www.youtube.com/api/timedtext?' + params
- sub_lang_list[lang] = url
+ sub_formats = []
+ for ext in ['sbv', 'vtt', 'srt']:
+ params = compat_urllib_parse.urlencode({
+ 'lang': lang,
+ 'v': video_id,
+ 'fmt': ext,
+ 'name': track.attrib['name'].encode('utf-8'),
+ })
+ sub_formats.append({
+ 'url': 'https://www.youtube.com/api/timedtext?' + params,
+ 'ext': ext,
+ })
+ sub_lang_list[lang] = sub_formats
if not sub_lang_list:
self._downloader.report_warning('video doesn\'t have subtitles')
return {}
return sub_lang_list
- def _get_available_automatic_caption(self, video_id, webpage):
+ def _get_ytplayer_config(self, video_id, webpage):
+ patterns = (
+ # User data may contain arbitrary character sequences that may affect
+ # JSON extraction with regex, e.g. when '};' is contained the second
+ # regex won't capture the whole JSON. Yet working around by trying more
+ # concrete regex first keeping in mind proper quoted string handling
+ # to be implemented in future that will replace this workaround (see
+ # https://github.com/rg3/youtube-dl/issues/7468,
+ # https://github.com/rg3/youtube-dl/pull/7599)
+ r';ytplayer\.config\s*=\s*({.+?});ytplayer',
+ r';ytplayer\.config\s*=\s*({.+?});',
+ )
+ config = self._search_regex(
+ patterns, webpage, 'ytplayer.config', default=None)
+ if config:
+ return self._parse_json(
+ uppercase_escape(config), video_id, fatal=False)
+
+ def _get_automatic_captions(self, video_id, webpage):
"""We need the webpage for getting the captions url, pass it as an
argument to speed up the process."""
- sub_format = self._downloader.params.get('subtitlesformat', 'srt')
self.to_screen('%s: Looking for automatic captions' % video_id)
- mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
+ player_config = self._get_ytplayer_config(video_id, webpage)
err_msg = 'Couldn\'t find automatic captions for %s' % video_id
- if mobj is None:
+ if not player_config:
self._downloader.report_warning(err_msg)
return {}
- player_config = json.loads(mobj.group(1))
try:
args = player_config['args']
caption_url = args['ttsurl']
@@ -704,14 +962,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
sub_lang_list = {}
for lang_node in caption_list.findall('target'):
sub_lang = lang_node.attrib['lang_code']
- params = compat_urllib_parse.urlencode({
- 'lang': original_lang,
- 'tlang': sub_lang,
- 'fmt': sub_format,
- 'ts': timestamp,
- 'kind': caption_kind,
- })
- sub_lang_list[sub_lang] = caption_url + '&' + params
+ sub_formats = []
+ for ext in ['sbv', 'vtt', 'srt']:
+ params = compat_urllib_parse.urlencode({
+ 'lang': original_lang,
+ 'tlang': sub_lang,
+ 'fmt': ext,
+ 'ts': timestamp,
+ 'kind': caption_kind,
+ })
+ sub_formats.append({
+ 'url': caption_url + '&' + params,
+ 'ext': ext,
+ })
+ sub_lang_list[sub_lang] = sub_formats
return sub_lang_list
# An extractor error can be raise by the download process if there are
# no automatic captions but there are subtitles
@@ -747,56 +1011,95 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
def _parse_dash_manifest(
- self, video_id, dash_manifest_url, player_url, age_gate):
+ self, video_id, dash_manifest_url, player_url, age_gate, fatal=True):
def decrypt_sig(mobj):
s = mobj.group(1)
dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
return '/signature/%s' % dec_s
- dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
+ dash_manifest_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, dash_manifest_url)
dash_doc = self._download_xml(
dash_manifest_url, video_id,
note='Downloading DASH manifest',
- errnote='Could not download DASH manifest')
+ errnote='Could not download DASH manifest',
+ fatal=fatal)
+
+ if dash_doc is False:
+ return []
formats = []
- for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
- url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
- if url_el is None:
- continue
- format_id = r.attrib['id']
- video_url = url_el.text
- filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
- f = {
- 'format_id': format_id,
- 'url': video_url,
- 'width': int_or_none(r.attrib.get('width')),
- 'height': int_or_none(r.attrib.get('height')),
- 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
- 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
- 'filesize': filesize,
- 'fps': int_or_none(r.attrib.get('frameRate')),
- }
- try:
- existing_format = next(
- fo for fo in formats
- if fo['format_id'] == format_id)
- except StopIteration:
- full_info = self._formats.get(format_id, {}).copy()
- full_info.update(f)
- formats.append(full_info)
- else:
- existing_format.update(f)
+ for a in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}AdaptationSet'):
+ mime_type = a.attrib.get('mimeType')
+ for r in a.findall('{urn:mpeg:DASH:schema:MPD:2011}Representation'):
+ url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
+ if url_el is None:
+ continue
+ if mime_type == 'text/vtt':
+ # TODO implement WebVTT downloading
+ pass
+ elif mime_type.startswith('audio/') or mime_type.startswith('video/'):
+ segment_list = r.find('{urn:mpeg:DASH:schema:MPD:2011}SegmentList')
+ format_id = r.attrib['id']
+ video_url = url_el.text
+ filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
+ f = {
+ 'format_id': format_id,
+ 'url': video_url,
+ 'width': int_or_none(r.attrib.get('width')),
+ 'height': int_or_none(r.attrib.get('height')),
+ 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
+ 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
+ 'filesize': filesize,
+ 'fps': int_or_none(r.attrib.get('frameRate')),
+ }
+ if segment_list is not None:
+ f.update({
+ 'initialization_url': segment_list.find('{urn:mpeg:DASH:schema:MPD:2011}Initialization').attrib['sourceURL'],
+ 'segment_urls': [segment.attrib.get('media') for segment in segment_list.findall('{urn:mpeg:DASH:schema:MPD:2011}SegmentURL')],
+ 'protocol': 'http_dash_segments',
+ })
+ try:
+ existing_format = next(
+ fo for fo in formats
+ if fo['format_id'] == format_id)
+ except StopIteration:
+ full_info = self._formats.get(format_id, {}).copy()
+ full_info.update(f)
+ codecs = r.attrib.get('codecs')
+ if codecs:
+ if full_info.get('acodec') == 'none' and 'vcodec' not in full_info:
+ full_info['vcodec'] = codecs
+ elif full_info.get('vcodec') == 'none' and 'acodec' not in full_info:
+ full_info['acodec'] = codecs
+ formats.append(full_info)
+ else:
+ existing_format.update(f)
+ else:
+ self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
return formats
def _real_extract(self, url):
+ url, smuggled_data = unsmuggle_url(url, {})
+
proto = (
'http' if self._downloader.params.get('prefer_insecure', False)
else 'https')
+ start_time = None
+ end_time = None
+ parsed_url = compat_urllib_parse_urlparse(url)
+ for component in [parsed_url.fragment, parsed_url.query]:
+ query = compat_parse_qs(component)
+ if start_time is None and 't' in query:
+ start_time = parse_duration(query['t'][0])
+ if start_time is None and 'start' in query:
+ start_time = parse_duration(query['start'][0])
+ if end_time is None and 'end' in query:
+ end_time = parse_duration(query['end'][0])
+
# Extract original video URL from URL with redirection, like age verification, using next_url parameter
mobj = re.search(self._NEXT_URL_RE, url)
if mobj:
- url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
+ url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
video_id = self.extract_id(url)
# Get video webpage
@@ -810,8 +1113,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
else:
player_url = None
+ dash_mpds = []
+
+ def add_dash_mpd(video_info):
+ dash_mpd = video_info.get('dashmpd')
+ if dash_mpd and dash_mpd[0] not in dash_mpds:
+ dash_mpds.append(dash_mpd[0])
+
# Get video info
embed_webpage = None
+ is_live = None
if re.search(r'player-age-gate-content">', video_webpage) is not None:
age_gate = True
# We simulate the access to the video from www.youtube.com/v/{video_id}
@@ -830,24 +1141,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
note='Refetching age-gated info webpage',
errnote='unable to download video info webpage')
video_info = compat_parse_qs(video_info_webpage)
+ add_dash_mpd(video_info)
else:
age_gate = False
- try:
- # Try looking directly into the video webpage
- mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
- if not mobj:
- raise ValueError('Could not find ytplayer.config') # caught below
- json_code = uppercase_escape(mobj.group(1))
- ytplayer_config = json.loads(json_code)
+ video_info = None
+ # Try looking directly into the video webpage
+ ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
+ if ytplayer_config:
args = ytplayer_config['args']
- # Convert to the same format returned by compat_parse_qs
- video_info = dict((k, [v]) for k, v in args.items())
- if 'url_encoded_fmt_stream_map' not in args:
- raise ValueError('No stream_map present') # caught below
- except ValueError:
- # We fallback to the get_video_info pages (used by the embed page)
+ if args.get('url_encoded_fmt_stream_map'):
+ # Convert to the same format returned by compat_parse_qs
+ video_info = dict((k, [v]) for k, v in args.items())
+ add_dash_mpd(video_info)
+ if args.get('livestream') == '1' or args.get('live_playback') == 1:
+ is_live = True
+ if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
+ # We also try looking in get_video_info since it may contain different dashmpd
+ # URL that points to a DASH manifest with possibly different itag set (some itags
+ # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
+ # manifest pointed by get_video_info's dashmpd).
+ # The general idea is to take a union of itags of both DASH manifests (for example
+ # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
self.report_video_info_webpage_download(video_id)
- for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
+ for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
video_info_url = (
'%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
% (proto, video_id, el_type))
@@ -855,11 +1171,32 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
video_info_url,
video_id, note=False,
errnote='unable to download video info webpage')
- video_info = compat_parse_qs(video_info_webpage)
- if 'token' in video_info:
+ get_video_info = compat_parse_qs(video_info_webpage)
+ if get_video_info.get('use_cipher_signature') != ['True']:
+ add_dash_mpd(get_video_info)
+ if not video_info:
+ video_info = get_video_info
+ if 'token' in get_video_info:
+ # Different get_video_info requests may report different results, e.g.
+ # some may report video unavailability, but some may serve it without
+ # any complaint (see https://github.com/rg3/youtube-dl/issues/7362,
+ # the original webpage as well as el=info and el=embedded get_video_info
+ # requests report video unavailability due to geo restriction while
+ # el=detailpage succeeds and returns valid data). This is probably
+ # due to YouTube measures against IP ranges of hosting providers.
+ # Working around by preferring the first succeeded video_info containing
+ # the token if no such video_info yet was found.
+ if 'token' not in video_info:
+ video_info = get_video_info
break
if 'token' not in video_info:
if 'reason' in video_info:
+ if 'The uploader has not made this video available in your country.' in video_info['reason']:
+ regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None)
+ if regions_allowed:
+ raise ExtractorError('YouTube said: This video is available in %s only' % (
+ ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))),
+ expected=True)
raise ExtractorError(
'YouTube said: %s' % video_info['reason'][0],
expected=True, video_id=video_id)
@@ -868,6 +1205,55 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'"token" parameter not in video info for unknown reason',
video_id=video_id)
+ # title
+ if 'title' in video_info:
+ video_title = video_info['title'][0]
+ else:
+ self._downloader.report_warning('Unable to extract video title')
+ video_title = '_'
+
+ # description
+ video_description = get_element_by_id("eow-description", video_webpage)
+ if video_description:
+ video_description = re.sub(r'''(?x)
+ <a\s+
+ (?:[a-zA-Z-]+="[^"]+"\s+)*?
+ title="([^"]+)"\s+
+ (?:[a-zA-Z-]+="[^"]+"\s+)*?
+ class="yt-uix-redirect-link"\s*>
+ [^<]+
+ </a>
+ ''', r'\1', video_description)
+ video_description = clean_html(video_description)
+ else:
+ fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
+ if fd_mobj:
+ video_description = unescapeHTML(fd_mobj.group(1))
+ else:
+ video_description = ''
+
+ if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False):
+ if not self._downloader.params.get('noplaylist'):
+ entries = []
+ feed_ids = []
+ multifeed_metadata_list = compat_urllib_parse_unquote_plus(video_info['multifeed_metadata_list'][0])
+ for feed in multifeed_metadata_list.split(','):
+ feed_data = compat_parse_qs(feed)
+ entries.append({
+ '_type': 'url_transparent',
+ 'ie_key': 'Youtube',
+ 'url': smuggle_url(
+ '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
+ {'force_singlefeed': True}),
+ 'title': '%s (%s)' % (video_title, feed_data['title'][0]),
+ })
+ feed_ids.append(feed_data['id'][0])
+ self.to_screen(
+ 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
+ % (', '.join(feed_ids), video_id))
+ return self.playlist_result(entries, video_id, video_title, video_description)
+ self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+
if 'view_count' in video_info:
view_count = int(video_info['view_count'][0])
else:
@@ -883,7 +1269,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
# uploader
if 'author' not in video_info:
raise ExtractorError('Unable to extract uploader name')
- video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
+ video_uploader = compat_urllib_parse_unquote_plus(video_info['author'][0])
# uploader_id
video_uploader_id = None
@@ -893,13 +1279,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
else:
self._downloader.report_warning('unable to extract uploader nickname')
- # title
- if 'title' in video_info:
- video_title = video_info['title'][0]
- else:
- self._downloader.report_warning('Unable to extract video title')
- video_title = '_'
-
# thumbnail image
# We try first to get a high quality image:
m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
@@ -910,18 +1289,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
self._downloader.report_warning('unable to extract video thumbnail')
video_thumbnail = None
else: # don't panic if we can't find it
- video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
+ video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])
# upload date
- upload_date = None
- mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
- if mobj is None:
- mobj = re.search(
- r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
- video_webpage)
- if mobj is not None:
- upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
- upload_date = unified_strdate(upload_date)
+ upload_date = self._html_search_meta(
+ 'datePublished', video_webpage, 'upload date', default=None)
+ if not upload_date:
+ upload_date = self._search_regex(
+ [r'(?s)id="eow-date.*?>(.*?)</span>',
+ r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'],
+ video_webpage, 'upload date', default=None)
+ if upload_date:
+ upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
+ upload_date = unified_strdate(upload_date)
m_cat_container = self._search_regex(
r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
@@ -934,48 +1314,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
else:
video_categories = None
- # description
- video_description = get_element_by_id("eow-description", video_webpage)
- if video_description:
- video_description = re.sub(r'''(?x)
- <a\s+
- (?:[a-zA-Z-]+="[^"]+"\s+)*?
- title="([^"]+)"\s+
- (?:[a-zA-Z-]+="[^"]+"\s+)*?
- class="yt-uix-redirect-link"\s*>
- [^<]+
- </a>
- ''', r'\1', video_description)
- video_description = clean_html(video_description)
- else:
- fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
- if fd_mobj:
- video_description = unescapeHTML(fd_mobj.group(1))
- else:
- video_description = ''
+ video_tags = [
+ unescapeHTML(m.group('content'))
+ for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
def _extract_count(count_name):
- count = self._search_regex(
- r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
- video_webpage, count_name, default=None)
- if count is not None:
- return int(count.replace(',', ''))
- return None
+ return str_to_int(self._search_regex(
+ r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
+ % re.escape(count_name),
+ video_webpage, count_name, default=None))
+
like_count = _extract_count('like')
dislike_count = _extract_count('dislike')
# subtitles
video_subtitles = self.extract_subtitles(video_id, video_webpage)
-
- if self._downloader.params.get('listsubtitles', False):
- self._list_available_subtitles(video_id, video_webpage)
- return
+ automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
if 'length_seconds' not in video_info:
self._downloader.report_warning('unable to extract video duration')
video_duration = None
else:
- video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
+ video_duration = int(compat_urllib_parse_unquote_plus(video_info['length_seconds'][0]))
# annotations
video_annotations = None
@@ -1007,7 +1367,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
if 'rtmpe%3Dyes' in encoded_url_map:
raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
- url_map = {}
+ formats = []
for url_data_str in encoded_url_map.split(','):
url_data = compat_parse_qs(url_data_str)
if 'itag' not in url_data or 'url' not in url_data:
@@ -1053,7 +1413,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
player_desc = 'flash player %s' % player_version
else:
player_version = self._search_regex(
- r'html5player-([^/]+?)(?:/html5player)?\.js',
+ [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', r'(?:www|player)-([^/]+)/base\.js'],
player_url,
'html5 player', fatal=False)
player_desc = 'html5 player %s' % player_version
@@ -1067,44 +1427,105 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
url += '&signature=' + signature
if 'ratebypass' not in url:
url += '&ratebypass=yes'
- url_map[format_id] = url
- formats = _map_to_format_list(url_map)
+
+ # Some itags are not included in DASH manifest thus corresponding formats will
+ # lack metadata (see https://github.com/rg3/youtube-dl/pull/5993).
+ # Trying to extract metadata from url_encoded_fmt_stream_map entry.
+ mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
+ width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
+ dct = {
+ 'format_id': format_id,
+ 'url': url,
+ 'player_url': player_url,
+ 'filesize': int_or_none(url_data.get('clen', [None])[0]),
+ 'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),
+ 'width': width,
+ 'height': height,
+ 'fps': int_or_none(url_data.get('fps', [None])[0]),
+ 'format_note': url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0],
+ }
+ type_ = url_data.get('type', [None])[0]
+ if type_:
+ type_split = type_.split(';')
+ kind_ext = type_split[0].split('/')
+ if len(kind_ext) == 2:
+ kind, ext = kind_ext
+ dct['ext'] = ext
+ if kind in ('audio', 'video'):
+ codecs = None
+ for mobj in re.finditer(
+ r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
+ if mobj.group('key') == 'codecs':
+ codecs = mobj.group('val')
+ break
+ if codecs:
+ codecs = codecs.split(',')
+ if len(codecs) == 2:
+ acodec, vcodec = codecs[0], codecs[1]
+ else:
+ acodec, vcodec = (codecs[0], 'none') if kind == 'audio' else ('none', codecs[0])
+ dct.update({
+ 'acodec': acodec,
+ 'vcodec': vcodec,
+ })
+ if format_id in self._formats:
+ dct.update(self._formats[format_id])
+ formats.append(dct)
elif video_info.get('hlsvp'):
manifest_url = video_info['hlsvp'][0]
url_map = self._extract_from_m3u8(manifest_url, video_id)
formats = _map_to_format_list(url_map)
+ # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
+ for a_format in formats:
+ if 'http_headers' not in a_format:
+ a_format['http_headers'] = {}
+ a_format['http_headers']['Youtubedl-no-compression'] = True
else:
raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
# Look for the DASH manifest
if self._downloader.params.get('youtube_include_dash_manifest', True):
- dash_mpd = video_info.get('dashmpd')
- if dash_mpd:
- dash_manifest_url = dash_mpd[0]
+ dash_mpd_fatal = True
+ for dash_manifest_url in dash_mpds:
+ dash_formats = {}
try:
- dash_formats = self._parse_dash_manifest(
- video_id, dash_manifest_url, player_url, age_gate)
+ for df in self._parse_dash_manifest(
+ video_id, dash_manifest_url, player_url, age_gate, dash_mpd_fatal):
+ # Do not overwrite DASH format found in some previous DASH manifest
+ if df['format_id'] not in dash_formats:
+ dash_formats[df['format_id']] = df
+ # Additional DASH manifests may end up in HTTP Error 403 therefore
+ # allow them to fail without bug report message if we already have
+ # some DASH manifest succeeded. This is temporary workaround to reduce
+ # burst of bug reports until we figure out the reason and whether it
+ # can be fixed at all.
+ dash_mpd_fatal = False
except (ExtractorError, KeyError) as e:
self.report_warning(
'Skipping DASH manifest: %r' % e, video_id)
- else:
- # Hide the formats we found through non-DASH
- dash_keys = set(df['format_id'] for df in dash_formats)
- for f in formats:
- if f['format_id'] in dash_keys:
- f['format_id'] = 'nondash-%s' % f['format_id']
- f['preference'] = f.get('preference', 0) - 10000
- formats.extend(dash_formats)
+ if dash_formats:
+ # Remove the formats we found through non-DASH, they
+ # contain less info and it can be wrong, because we use
+ # fixed values (for example the resolution). See
+ # https://github.com/rg3/youtube-dl/issues/5774 for an
+ # example.
+ formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
+ formats.extend(dash_formats.values())
# Check for malformed aspect ratio
stretched_m = re.search(
r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
video_webpage)
if stretched_m:
- ratio = float(stretched_m.group('w')) / float(stretched_m.group('h'))
- for f in formats:
- if f.get('vcodec') != 'none':
- f['stretched_ratio'] = ratio
+ w = float(stretched_m.group('w'))
+ h = float(stretched_m.group('h'))
+ # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
+ # We will only process correct ratios.
+ if w > 0 and h > 0:
+ ratio = w / h
+ for f in formats:
+ if f.get('vcodec') != 'none':
+ f['stretched_ratio'] = ratio
self._sort_formats(formats)
@@ -1117,7 +1538,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'thumbnail': video_thumbnail,
'description': video_description,
'categories': video_categories,
+ 'tags': video_tags,
'subtitles': video_subtitles,
+ 'automatic_captions': automatic_captions,
'duration': video_duration,
'age_limit': 18 if age_gate else 0,
'annotations': video_annotations,
@@ -1127,10 +1550,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'dislike_count': dislike_count,
'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
'formats': formats,
+ 'is_live': is_live,
+ 'start_time': start_time,
+ 'end_time': end_time,
}
-class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
+class YoutubePlaylistIE(YoutubeBaseInfoExtractor, YoutubePlaylistBaseInfoExtractor):
IE_DESC = 'YouTube.com playlists'
_VALID_URL = r"""(?x)(?:
(?:https?://)?
@@ -1142,16 +1568,16 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
| p/
)
(
- (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
+ (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,}
# Top tracks, they can also include dots
|(?:MC)[\w\.]*
)
.*
|
- ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
+ ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})
)"""
_TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
- _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
+ _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'
IE_NAME = 'youtube:playlist'
_TESTS = [{
'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
@@ -1227,13 +1653,8 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
def _real_initialize(self):
self._login()
- def _ids_to_results(self, ids):
- return [
- self.url_result(vid_id, 'Youtube', video_id=vid_id)
- for vid_id in ids]
-
def _extract_mix(self, playlist_id):
- # The mixes are generated from a a single video
+ # The mixes are generated from a single video
# the id of the playlist is just 'RD' + video_id
url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
webpage = self._download_webpage(
@@ -1252,6 +1673,33 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
return self.playlist_result(url_results, playlist_id, title)
+ def _extract_playlist(self, playlist_id):
+ url = self._TEMPLATE_URL % playlist_id
+ page = self._download_webpage(url, playlist_id)
+
+ for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page):
+ match = match.strip()
+ # Check if the playlist exists or is private
+ if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', match):
+ raise ExtractorError(
+ 'The playlist doesn\'t exist or is private, use --username or '
+ '--netrc to access it.',
+ expected=True)
+ elif re.match(r'[^<]*Invalid parameters[^<]*', match):
+ raise ExtractorError(
+ 'Invalid parameters. Maybe URL is incorrect.',
+ expected=True)
+ elif re.match(r'[^<]*Choose your language[^<]*', match):
+ continue
+ else:
+ self.report_warning('Youtube gives an alert message: ' + match)
+
+ playlist_title = self._html_search_regex(
+ r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
+ page, 'title')
+
+ return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title)
+
def _real_extract(self, url):
# Extract playlist id
mobj = re.match(self._VALID_URL, url)
@@ -1269,80 +1717,64 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
else:
self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
- if playlist_id.startswith('RD'):
+ if playlist_id.startswith('RD') or playlist_id.startswith('UL'):
# Mixes require a custom extraction process
return self._extract_mix(playlist_id)
- url = self._TEMPLATE_URL % playlist_id
- page = self._download_webpage(url, playlist_id)
- more_widget_html = content_html = page
-
- # Check if the playlist exists or is private
- if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
- raise ExtractorError(
- 'The playlist doesn\'t exist or is private, use --username or '
- '--netrc to access it.',
- expected=True)
-
- # Extract the video ids from the playlist pages
- ids = []
-
- for page_num in itertools.count(1):
- matches = re.finditer(self._VIDEO_RE, content_html)
- # We remove the duplicates and the link with index 0
- # (it's not the first video of the playlist)
- new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
- ids.extend(new_ids)
-
- mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
- if not mobj:
- break
-
- more = self._download_json(
- 'https://youtube.com/%s' % mobj.group('more'), playlist_id,
- 'Downloading page #%s' % page_num,
- transform_source=uppercase_escape)
- content_html = more['content_html']
- if not content_html.strip():
- # Some webpages show a "Load more" button but they don't
- # have more videos
- break
- more_widget_html = more['load_more_widget_html']
-
- playlist_title = self._html_search_regex(
- r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
- page, 'title')
-
- url_results = self._ids_to_results(ids)
- return self.playlist_result(url_results, playlist_id, playlist_title)
+ return self._extract_playlist(playlist_id)
-class YoutubeChannelIE(InfoExtractor):
+class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
IE_DESC = 'YouTube.com channels'
_VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
+ _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
+ _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
IE_NAME = 'youtube:channel'
_TESTS = [{
'note': 'paginated channel',
'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
'playlist_mincount': 91,
'info_dict': {
- 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
+ 'title': 'Uploads from lex will',
}
+ }, {
+ 'note': 'Age restricted channel',
+ # from https://www.youtube.com/user/DeusExOfficial
+ 'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
+ 'playlist_mincount': 64,
+ 'info_dict': {
+ 'id': 'UUs0ifCMCm1icqRbqhUINa0w',
+ 'title': 'Uploads from Deus Ex',
+ },
}]
- def extract_videos_from_page(self, page):
- ids_in_page = []
- for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
- if mobj.group(1) not in ids_in_page:
- ids_in_page.append(mobj.group(1))
- return ids_in_page
-
def _real_extract(self, url):
channel_id = self._match_id(url)
- video_ids = []
- url = 'https://www.youtube.com/channel/%s/videos' % channel_id
- channel_page = self._download_webpage(url, channel_id)
+ url = self._TEMPLATE_URL % channel_id
+
+ # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
+ # Workaround by extracting as a playlist if managed to obtain channel playlist URL
+ # otherwise fallback on channel by page extraction
+ channel_page = self._download_webpage(
+ url + '?view=57', channel_id,
+ 'Downloading channel page', fatal=False)
+ if channel_page is False:
+ channel_playlist_id = False
+ else:
+ channel_playlist_id = self._html_search_meta(
+ 'channelId', channel_page, 'channel id', default=None)
+ if not channel_playlist_id:
+ channel_playlist_id = self._search_regex(
+ r'data-(?:channel-external-|yt)id="([^"]+)"',
+ channel_page, 'channel id', default=None)
+ if channel_playlist_id and channel_playlist_id.startswith('UC'):
+ playlist_id = 'UU' + channel_playlist_id[2:]
+ return self.url_result(
+ compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')
+
+ channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
autogenerated = re.search(r'''(?x)
class="[^"]*?(?:
channel-header-autogenerated-label|
@@ -1352,43 +1784,20 @@ class YoutubeChannelIE(InfoExtractor):
if autogenerated:
# The videos are contained in a single page
# the ajax pages can't be used, they are empty
- video_ids = self.extract_videos_from_page(channel_page)
entries = [
- self.url_result(video_id, 'Youtube', video_id=video_id)
- for video_id in video_ids]
+ self.url_result(
+ video_id, 'Youtube', video_id=video_id,
+ video_title=video_title)
+ for video_id, video_title in self.extract_videos_from_page(channel_page)]
return self.playlist_result(entries, channel_id)
- def _entries():
- more_widget_html = content_html = channel_page
- for pagenum in itertools.count(1):
+ return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
- ids_in_page = self.extract_videos_from_page(content_html)
- for video_id in ids_in_page:
- yield self.url_result(
- video_id, 'Youtube', video_id=video_id)
- mobj = re.search(
- r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
- more_widget_html)
- if not mobj:
- break
-
- more = self._download_json(
- 'https://youtube.com/%s' % mobj.group('more'), channel_id,
- 'Downloading page #%s' % (pagenum + 1),
- transform_source=uppercase_escape)
- content_html = more['content_html']
- more_widget_html = more['load_more_widget_html']
-
- return self.playlist_result(_entries(), channel_id)
-
-
-class YoutubeUserIE(InfoExtractor):
+class YoutubeUserIE(YoutubeChannelIE):
IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
- _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
- _GDATA_PAGE_SIZE = 50
- _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
+ _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'
IE_NAME = 'youtube:user'
_TESTS = [{
@@ -1412,95 +1821,80 @@ class YoutubeUserIE(InfoExtractor):
else:
return super(YoutubeUserIE, cls).suitable(url)
- def _real_extract(self, url):
- username = self._match_id(url)
-
- # Download video ids using YouTube Data API. Result size per
- # query is limited (currently to 50 videos) so we need to query
- # page by page until there are no video ids - it means we got
- # all of them.
- def download_page(pagenum):
- start_index = pagenum * self._GDATA_PAGE_SIZE + 1
+class YoutubeUserPlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
+ IE_DESC = 'YouTube.com user playlists'
+ _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/user/(?P<id>[^/]+)/playlists'
+ IE_NAME = 'youtube:user:playlists'
- gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
- page = self._download_webpage(
- gdata_url, username,
- 'Downloading video ids from %d to %d' % (
- start_index, start_index + self._GDATA_PAGE_SIZE))
-
- try:
- response = json.loads(page)
- except ValueError as err:
- raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
- if 'entry' not in response['feed']:
- return
-
- # Extract video identifiers
- entries = response['feed']['entry']
- for entry in entries:
- title = entry['title']['$t']
- video_id = entry['id']['$t'].split('/')[-1]
- yield {
- '_type': 'url',
- 'url': video_id,
- 'ie_key': 'Youtube',
- 'id': video_id,
- 'title': title,
- }
- url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)
-
- return self.playlist_result(url_results, playlist_title=username)
+ _TESTS = [{
+ 'url': 'http://www.youtube.com/user/ThirstForScience/playlists',
+ 'playlist_mincount': 4,
+ 'info_dict': {
+ 'id': 'ThirstForScience',
+ 'title': 'Thirst for Science',
+ },
+ }, {
+ # with "Load more" button
+ 'url': 'http://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
+ 'playlist_mincount': 70,
+ 'info_dict': {
+ 'id': 'igorkle1',
+ 'title': 'Игорь Клейнер',
+ },
+ }]
-class YoutubeSearchIE(SearchInfoExtractor):
+class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
IE_DESC = 'YouTube.com searches'
- _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
- _MAX_RESULTS = 1000
+ # there doesn't appear to be a real limit, for example if you search for
+ # 'python' you get more than 8.000.000 results
+ _MAX_RESULTS = float('inf')
IE_NAME = 'youtube:search'
_SEARCH_KEY = 'ytsearch'
+ _EXTRA_QUERY_ARGS = {}
+ _TESTS = []
def _get_n_results(self, query, n):
"""Get a specified number of results for a query"""
- video_ids = []
- pagenum = 0
+ videos = []
limit = n
- PAGE_SIZE = 50
- while (PAGE_SIZE * pagenum) < limit:
- result_url = self._API_URL % (
- compat_urllib_parse.quote_plus(query.encode('utf-8')),
- (PAGE_SIZE * pagenum) + 1)
- data_json = self._download_webpage(
+ for pagenum in itertools.count(1):
+ url_query = {
+ 'search_query': query.encode('utf-8'),
+ 'page': pagenum,
+ 'spf': 'navigate',
+ }
+ url_query.update(self._EXTRA_QUERY_ARGS)
+ result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query)
+ data = self._download_json(
result_url, video_id='query "%s"' % query,
- note='Downloading page %s' % (pagenum + 1),
+ note='Downloading page %s' % pagenum,
errnote='Unable to download API page')
- data = json.loads(data_json)
- api_response = data['data']
+ html_content = data[1]['body']['content']
- if 'items' not in api_response:
+ if 'class="search-message' in html_content:
raise ExtractorError(
'[youtube] No video results', expected=True)
- new_ids = list(video['id'] for video in api_response['items'])
- video_ids += new_ids
-
- limit = min(n, api_response['totalItems'])
- pagenum += 1
+ new_videos = self._ids_to_results(orderedSet(re.findall(
+ r'href="/watch\?v=(.{11})', html_content)))
+ videos += new_videos
+ if not new_videos or len(videos) > limit:
+ break
- if len(video_ids) > n:
- video_ids = video_ids[:n]
- videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
- for video_id in video_ids]
+ if len(videos) > n:
+ videos = videos[:n]
return self.playlist_result(videos, query)
class YoutubeSearchDateIE(YoutubeSearchIE):
IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
- _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
_SEARCH_KEY = 'ytsearchdate'
IE_DESC = 'YouTube.com searches, newest videos first'
+ _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
class YoutubeSearchURLIE(InfoExtractor):
@@ -1517,14 +1911,14 @@ class YoutubeSearchURLIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- query = compat_urllib_parse.unquote_plus(mobj.group('query'))
+ query = compat_urllib_parse_unquote_plus(mobj.group('query'))
webpage = self._download_webpage(url, query)
result_code = self._search_regex(
- r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')
+ r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')
part_codes = re.findall(
- r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)
+ r'(?s)<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*>(.*?)</h3>', result_code)
entries = []
for part_code in part_codes:
part_title = self._html_search_regex(
@@ -1546,13 +1940,13 @@ class YoutubeSearchURLIE(InfoExtractor):
}
-class YoutubeShowIE(InfoExtractor):
+class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
IE_DESC = 'YouTube.com (multi-season) shows'
_VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
IE_NAME = 'youtube:show'
_TESTS = [{
- 'url': 'http://www.youtube.com/show/airdisasters',
- 'playlist_mincount': 3,
+ 'url': 'https://www.youtube.com/show/airdisasters',
+ 'playlist_mincount': 5,
'info_dict': {
'id': 'airdisasters',
'title': 'Air Disasters',
@@ -1560,44 +1954,17 @@ class YoutubeShowIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- playlist_id = mobj.group('id')
- webpage = self._download_webpage(
- url, playlist_id, 'Downloading show webpage')
- # There's one playlist for each season of the show
- m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
- self.to_screen('%s: Found %s seasons' % (playlist_id, len(m_seasons)))
- entries = [
- self.url_result(
- 'https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
- for season in m_seasons
- ]
- title = self._og_search_title(webpage, fatal=False)
-
- return {
- '_type': 'playlist',
- 'id': playlist_id,
- 'title': title,
- 'entries': entries,
- }
+ playlist_id = self._match_id(url)
+ return super(YoutubeShowIE, self)._real_extract(
+ 'https://www.youtube.com/show/%s/playlists' % playlist_id)
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
"""
- Base class for extractors that fetch info from
- http://www.youtube.com/feed_ajax
+ Base class for feed extractors
Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
"""
_LOGIN_REQUIRED = True
- # use action_load_personal_feed instead of action_load_system_feed
- _PERSONAL_FEED = False
-
- @property
- def _FEED_TEMPLATE(self):
- action = 'action_load_system_feed'
- if self._PERSONAL_FEED:
- action = 'action_load_personal_feed'
- return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
@property
def IE_NAME(self):
@@ -1607,51 +1974,49 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
self._login()
def _real_extract(self, url):
- feed_entries = []
- paging = 0
- for i in itertools.count(1):
- info = self._download_json(
- self._FEED_TEMPLATE % paging,
- '%s feed' % self._FEED_NAME,
- 'Downloading page %s' % i,
- transform_source=uppercase_escape)
- feed_html = info.get('feed_html') or info.get('content_html')
- load_more_widget_html = info.get('load_more_widget_html') or feed_html
- m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
- ids = orderedSet(m.group(1) for m in m_ids)
- feed_entries.extend(
- self.url_result(video_id, 'Youtube', video_id=video_id)
- for video_id in ids)
- mobj = re.search(
- r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
- load_more_widget_html)
- if mobj is None:
+ page = self._download_webpage(
+ 'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)
+
+ # The extraction process is the same as for playlists, but the regex
+ # for the video ids doesn't contain an index
+ ids = []
+ more_widget_html = content_html = page
+ for page_num in itertools.count(1):
+ matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
+
+ # 'recommended' feed has infinite 'load more' and each new portion spins
+ # the same videos in (sometimes) slightly different order, so we'll check
+ # for unicity and break when portion has no new videos
+ new_ids = filter(lambda video_id: video_id not in ids, orderedSet(matches))
+ if not new_ids:
break
- paging = mobj.group('paging')
- return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
+ ids.extend(new_ids)
-class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
- IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
- _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
- _FEED_NAME = 'recommended'
- _PLAYLIST_TITLE = 'Youtube Recommended videos'
+ mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
+ if not mobj:
+ break
+
+ more = self._download_json(
+ 'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
+ 'Downloading page #%s' % page_num,
+ transform_source=uppercase_escape)
+ content_html = more['content_html']
+ more_widget_html = more['load_more_widget_html']
+
+ return self.playlist_result(
+ self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)
-class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
+class YoutubeWatchLaterIE(YoutubePlaylistIE):
+ IE_NAME = 'youtube:watchlater'
IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
- _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
- _FEED_NAME = 'watch_later'
- _PLAYLIST_TITLE = 'Youtube Watch Later'
- _PERSONAL_FEED = True
+ _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'
+ _TESTS = [] # override PlaylistIE tests
-class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
- IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
- _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory'
- _FEED_NAME = 'history'
- _PERSONAL_FEED = True
- _PLAYLIST_TITLE = 'Youtube Watch History'
+ def _real_extract(self, url):
+ return self._extract_playlist('WL')
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
@@ -1666,42 +2031,25 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
return self.url_result(playlist_id, 'YoutubePlaylist')
-class YoutubeSubscriptionsIE(YoutubePlaylistIE):
- IE_NAME = 'youtube:subscriptions'
- IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
- _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
- _TESTS = []
-
- def _real_extract(self, url):
- title = 'Youtube Subscriptions'
- page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)
-
- # The extraction process is the same as for playlists, but the regex
- # for the video ids doesn't contain an index
- ids = []
- more_widget_html = content_html = page
+class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
+ IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
+ _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
+ _FEED_NAME = 'recommended'
+ _PLAYLIST_TITLE = 'Youtube Recommended videos'
- for page_num in itertools.count(1):
- matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
- new_ids = orderedSet(matches)
- ids.extend(new_ids)
- mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
- if not mobj:
- break
+class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
+ IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
+ _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
+ _FEED_NAME = 'subscriptions'
+ _PLAYLIST_TITLE = 'Youtube Subscriptions'
- more = self._download_json(
- 'https://youtube.com/%s' % mobj.group('more'), title,
- 'Downloading page #%s' % page_num,
- transform_source=uppercase_escape)
- content_html = more['content_html']
- more_widget_html = more['load_more_widget_html']
- return {
- '_type': 'playlist',
- 'title': title,
- 'entries': self._ids_to_results(ids),
- }
+class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
+ IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
+ _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory'
+ _FEED_NAME = 'history'
+ _PLAYLIST_TITLE = 'Youtube History'
class YoutubeTruncatedURLIE(InfoExtractor):
@@ -1715,6 +2063,7 @@ class YoutubeTruncatedURLIE(InfoExtractor):
annotation_id=annotation_[^&]+|
x-yt-cl=[0-9]+|
hl=[^&]*|
+ t=[0-9]+
)?
|
attribution_link\?a=[^&]+
@@ -1737,6 +2086,9 @@ class YoutubeTruncatedURLIE(InfoExtractor):
}, {
'url': 'https://www.youtube.com/watch?hl=en-GB',
'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/watch?t=2372',
+ 'only_matching': True,
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/zapiks.py b/youtube_dl/extractor/zapiks.py
new file mode 100644
index 000000000..22a9a57e8
--- /dev/null
+++ b/youtube_dl/extractor/zapiks.py
@@ -0,0 +1,110 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_duration,
+ parse_iso8601,
+ xpath_with_ns,
+ xpath_text,
+ int_or_none,
+)
+
+
+class ZapiksIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?zapiks\.(?:fr|com)/(?:(?:[a-z]{2}/)?(?P<display_id>.+?)\.html|index\.php\?.*\bmedia_id=(?P<id>\d+))'
+ _TESTS = [
+ {
+ 'url': 'http://www.zapiks.fr/ep2s3-bon-appetit-eh-be-viva.html',
+ 'md5': 'aeb3c473b2d564b2d46d664d28d5f050',
+ 'info_dict': {
+ 'id': '80798',
+ 'ext': 'mp4',
+ 'title': 'EP2S3 - Bon Appétit - Eh bé viva les pyrénées con!',
+ 'description': 'md5:7054d6f6f620c6519be1fe710d4da847',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 528,
+ 'timestamp': 1359044972,
+ 'upload_date': '20130124',
+ 'view_count': int,
+ 'comment_count': int,
+ },
+ },
+ {
+ 'url': 'http://www.zapiks.com/ep3s5-bon-appetit-baqueira-m-1.html',
+ 'only_matching': True,
+ },
+ {
+ 'url': 'http://www.zapiks.com/nl/ep3s5-bon-appetit-baqueira-m-1.html',
+ 'only_matching': True,
+ },
+ {
+ 'url': 'http://www.zapiks.fr/index.php?action=playerIframe&amp;media_id=118046&amp;width=640&amp;height=360&amp;autoStart=false&amp;language=fr',
+ 'only_matching': True,
+ },
+ ]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ display_id = mobj.group('display_id') or video_id
+
+ webpage = self._download_webpage(url, display_id)
+
+ if not video_id:
+ video_id = self._search_regex(
+ r'data-media-id="(\d+)"', webpage, 'video id')
+
+ playlist = self._download_xml(
+ 'http://www.zapiks.fr/view/index.php?action=playlist&media_id=%s&lang=en' % video_id,
+ display_id)
+
+ NS_MAP = {
+ 'jwplayer': 'http://rss.jwpcdn.com/'
+ }
+
+ def ns(path):
+ return xpath_with_ns(path, NS_MAP)
+
+ item = playlist.find('./channel/item')
+
+ title = xpath_text(item, 'title', 'title') or self._og_search_title(webpage)
+ description = self._og_search_description(webpage, default=None)
+ thumbnail = xpath_text(
+ item, ns('./jwplayer:image'), 'thumbnail') or self._og_search_thumbnail(webpage, default=None)
+ duration = parse_duration(self._html_search_meta(
+ 'duration', webpage, 'duration', default=None))
+ timestamp = parse_iso8601(self._html_search_meta(
+ 'uploadDate', webpage, 'upload date', default=None), ' ')
+
+ view_count = int_or_none(self._search_regex(
+ r'UserPlays:(\d+)', webpage, 'view count', default=None))
+ comment_count = int_or_none(self._search_regex(
+ r'UserComments:(\d+)', webpage, 'comment count', default=None))
+
+ formats = []
+ for source in item.findall(ns('./jwplayer:source')):
+ format_id = source.attrib['label']
+ f = {
+ 'url': source.attrib['file'],
+ 'format_id': format_id,
+ }
+ m = re.search(r'^(?P<height>\d+)[pP]', format_id)
+ if m:
+ f['height'] = int(m.group('height'))
+ formats.append(f)
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'view_count': view_count,
+ 'comment_count': comment_count,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py
index 98f15177b..a795f56b3 100644
--- a/youtube_dl/extractor/zdf.py
+++ b/youtube_dl/extractor/zdf.py
@@ -9,6 +9,7 @@ from ..utils import (
int_or_none,
unified_strdate,
OnDemandPagedList,
+ xpath_text,
)
@@ -19,13 +20,11 @@ def extract_from_xml_url(ie, video_id, xml_url):
errnote='Failed to download video info')
title = doc.find('.//information/title').text
- description = doc.find('.//information/detail').text
- duration = int(doc.find('.//details/lengthSec').text)
- uploader_node = doc.find('.//details/originChannelTitle')
- uploader = None if uploader_node is None else uploader_node.text
- uploader_id_node = doc.find('.//details/originChannelId')
- uploader_id = None if uploader_id_node is None else uploader_id_node.text
- upload_date = unified_strdate(doc.find('.//details/airtime').text)
+ description = xpath_text(doc, './/information/detail', 'description')
+ duration = int_or_none(xpath_text(doc, './/details/lengthSec', 'duration'))
+ uploader = xpath_text(doc, './/details/originChannelTitle', 'uploader')
+ uploader_id = xpath_text(doc, './/details/originChannelId', 'uploader id')
+ upload_date = unified_strdate(xpath_text(doc, './/details/airtime', 'upload date'))
def xml_to_format(fnode):
video_url = fnode.find('url').text
@@ -40,15 +39,14 @@ def extract_from_xml_url(ie, video_id, xml_url):
ext = format_m.group('container')
proto = format_m.group('proto').lower()
- quality = fnode.find('./quality').text
- abr = int(fnode.find('./audioBitrate').text) // 1000
- vbr_node = fnode.find('./videoBitrate')
- vbr = None if vbr_node is None else int(vbr_node.text) // 1000
+ quality = xpath_text(fnode, './quality', 'quality')
+ abr = int_or_none(xpath_text(fnode, './audioBitrate', 'abr'), 1000)
+ vbr = int_or_none(xpath_text(fnode, './videoBitrate', 'vbr'), 1000)
- width_node = fnode.find('./width')
- width = None if width_node is None else int_or_none(width_node.text)
- height_node = fnode.find('./height')
- height = None if height_node is None else int_or_none(height_node.text)
+ width = int_or_none(xpath_text(fnode, './width', 'width'))
+ height = int_or_none(xpath_text(fnode, './height', 'height'))
+
+ filesize = int_or_none(xpath_text(fnode, './filesize', 'filesize'))
format_note = ''
if not format_note:
@@ -64,12 +62,31 @@ def extract_from_xml_url(ie, video_id, xml_url):
'vbr': vbr,
'width': width,
'height': height,
- 'filesize': int_or_none(fnode.find('./filesize').text),
+ 'filesize': filesize,
'format_note': format_note,
'protocol': proto,
'_available': is_available,
}
+ def xml_to_thumbnails(fnode):
+ thumbnails = []
+ for node in fnode:
+ thumbnail_url = node.text
+ if not thumbnail_url:
+ continue
+ thumbnail = {
+ 'url': thumbnail_url,
+ }
+ if 'key' in node.attrib:
+ m = re.match('^([0-9]+)x([0-9]+)$', node.attrib['key'])
+ if m:
+ thumbnail['width'] = int(m.group(1))
+ thumbnail['height'] = int(m.group(2))
+ thumbnails.append(thumbnail)
+ return thumbnails
+
+ thumbnails = xml_to_thumbnails(doc.findall('.//teaserimages/teaserimage'))
+
format_nodes = doc.findall('.//formitaeten/formitaet')
formats = list(filter(
lambda f: f['_available'],
@@ -81,6 +98,7 @@ def extract_from_xml_url(ie, video_id, xml_url):
'title': title,
'description': description,
'duration': duration,
+ 'thumbnails': thumbnails,
'uploader': uploader,
'uploader_id': uploader_id,
'upload_date': upload_date,
diff --git a/youtube_dl/extractor/zingmp3.py b/youtube_dl/extractor/zingmp3.py
index 1afbe68ed..437eecb67 100644
--- a/youtube_dl/extractor/zingmp3.py
+++ b/youtube_dl/extractor/zingmp3.py
@@ -4,12 +4,20 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..utils import ExtractorError
class ZingMp3BaseInfoExtractor(InfoExtractor):
- @staticmethod
- def _extract_item(item):
+ def _extract_item(self, item, fatal=True):
+ error_message = item.find('./errormessage').text
+ if error_message:
+ if not fatal:
+ return
+ raise ExtractorError(
+ '%s returned error: %s' % (self.IE_NAME, error_message),
+ expected=True)
+
title = item.find('./title').text.strip()
source = item.find('./source').text
extension = item.attrib['type']
@@ -37,7 +45,9 @@ class ZingMp3BaseInfoExtractor(InfoExtractor):
entries = []
for i, item in enumerate(items, 1):
- entry = self._extract_item(item)
+ entry = self._extract_item(item, fatal=False)
+ if not entry:
+ continue
entry['id'] = '%s-%d' % (id, i)
entries.append(entry)
@@ -79,7 +89,7 @@ class ZingMp3SongIE(ZingMp3BaseInfoExtractor):
class ZingMp3AlbumIE(ZingMp3BaseInfoExtractor):
- _VALID_URL = r'https?://mp3\.zing\.vn/album/(?P<slug>[^/]+)/(?P<album_id>\w+)\.html'
+ _VALID_URL = r'https?://mp3\.zing\.vn/(?:album|playlist)/(?P<slug>[^/]+)/(?P<album_id>\w+)\.html'
_TESTS = [{
'url': 'http://mp3.zing.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html',
'info_dict': {
@@ -88,6 +98,9 @@ class ZingMp3AlbumIE(ZingMp3BaseInfoExtractor):
'title': 'Lâu Đài Tình Ái - Bằng Kiều ft. Minh Tuyết | Album 320 lossless',
},
'playlist_count': 10,
+ }, {
+ 'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html',
+ 'only_matching': True,
}]
IE_NAME = 'zingmp3:album'
IE_DESC = 'mp3.zing.vn albums'