Merge remote-tracking branch 'origin/master'

[youtube-dl] / youtube_dl / InfoExtractors.py
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py

index bcf9272528e28a7051a4a9e6aa12f252fd2d1f89..672ef9eedb40b1f8aa7db86e0a20b591c88511f3 100755 (executable)
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -1,601 +1,4 @@
-import base64
-import datetime
-import itertools
-import netrc
-import os
-import re
-import socket
-import time
-import email.utils
-import xml.etree.ElementTree
-import random
-import math
-import operator
-import hashlib
-import binascii
-import urllib
+# Legacy file for backwards compatibility, use youtube_dl.extractor instead!
  
-from .utils import *
  from .extractor.common import InfoExtractor, SearchInfoExtractor
-
-from .extractor.ard import ARDIE
-from .extractor.arte import ArteTvIE
-from .extractor.bliptv import BlipTVIE, BlipTVUserIE
-from .extractor.comedycentral import ComedyCentralIE
-from .extractor.collegehumor import CollegeHumorIE
-from .extractor.dailymotion import DailymotionIE
-from .extractor.depositfiles import DepositFilesIE
-from .extractor.eighttracks import EightTracksIE
-from .extractor.escapist import EscapistIE
-from .extractor.facebook import FacebookIE
-from .extractor.funnyordie import FunnyOrDieIE
-from .extractor.gametrailers import GametrailersIE
-from .extractor.generic import GenericIE
-from .extractor.googleplus import GooglePlusIE
-from .extractor.googlesearch import GoogleSearchIE
-from .extractor.infoq import InfoQIE
-from .extractor.justintv import JustinTVIE
-from .extractor.keek import KeekIE
-from .extractor.liveleak import LiveLeakIE
-from .extractor.metacafe import MetacafeIE
-from .extractor.mixcloud import MixcloudIE
-from .extractor.mtv import MTVIE
-from .extractor.myspass import MySpassIE
-from .extractor.myvideo import MyVideoIE
-from .extractor.nba import NBAIE
-from .extractor.statigram import StatigramIE
-from .extractor.photobucket import PhotobucketIE
-from .extractor.pornotube import PornotubeIE
-from .extractor.rbmaradio import RBMARadioIE
-from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
-from .extractor.spiegel import SpiegelIE
-from .extractor.stanfordoc import StanfordOpenClassroomIE
-from .extractor.steam import SteamIE
-from .extractor.ted import TEDIE
-from .extractor.ustream import UstreamIE
-from .extractor.vimeo import VimeoIE
-from .extractor.worldstarhiphop import WorldStarHipHopIE
-from .extractor.xnxx import XNXXIE
-from .extractor.xvideos import XVideosIE
-from .extractor.yahoo import YahooIE, YahooSearchIE
-from .extractor.youjizz import YouJizzIE
-from .extractor.youku import YoukuIE
-from .extractor.youporn import YouPornIE
-from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
-from .extractor.zdf import ZDFIE
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-class TumblrIE(InfoExtractor):
-    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
-
-    def _real_extract(self, url):
-        m_url = re.match(self._VALID_URL, url)
-        video_id = m_url.group('id')
-        blog = m_url.group('blog_name')
-
-        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
-        webpage = self._download_webpage(url, video_id)
-
-        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
-        video = re.search(re_video, webpage)
-        if video is None:
-           raise ExtractorError(u'Unable to extract video')
-        video_url = video.group('video_url')
-        ext = video.group('ext')
-
-        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
-            webpage, u'thumbnail', fatal=False)  # We pick the first poster
-        if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
-
-        # The only place where you can get a title, it's not complete,
-        # but searching in other places doesn't work for all videos
-        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
-            webpage, u'title', flags=re.DOTALL)
-
-        return [{'id': video_id,
-                 'url': video_url,
-                 'title': video_title,
-                 'thumbnail': video_thumbnail,
-                 'ext': ext
-                 }]
-
-class BandcampIE(InfoExtractor):
-    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        title = mobj.group('title')
-        webpage = self._download_webpage(url, title)
-        # We get the link to the free download page
-        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
-        if m_download is None:
-            raise ExtractorError(u'No free songs found')
-
-        download_link = m_download.group(1)
-        id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', 
-                       webpage, re.MULTILINE|re.DOTALL).group('id')
-
-        download_webpage = self._download_webpage(download_link, id,
-                                                  'Downloading free downloads page')
-        # We get the dictionary of the track from some javascrip code
-        info = re.search(r'items: (.*?),$',
-                         download_webpage, re.MULTILINE).group(1)
-        info = json.loads(info)[0]
-        # We pick mp3-320 for now, until format selection can be easily implemented.
-        mp3_info = info[u'downloads'][u'mp3-320']
-        # If we try to use this url it says the link has expired
-        initial_url = mp3_info[u'url']
-        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
-        m_url = re.match(re_url, initial_url)
-        #We build the url we will use to get the final track url
-        # This url is build in Bandcamp in the script download_bunde_*.js
-        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
-        final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
-        # If we could correctly generate the .rand field the url would be
-        #in the "download_url" key
-        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
-
-        track_info = {'id':id,
-                      'title' : info[u'title'],
-                      'ext' :   'mp3',
-                      'url' :   final_url,
-                      'thumbnail' : info[u'thumb_url'],
-                      'uploader' :  info[u'artist']
-                      }
-
-        return [track_info]
-
-class RedTubeIE(InfoExtractor):
-    """Information Extractor for redtube"""
-    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
-
-    def _real_extract(self,url):
-        mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
-
-        video_id = mobj.group('id')
-        video_extension = 'mp4'        
-        webpage = self._download_webpage(url, video_id)
-
-        self.report_extraction(video_id)
-
-        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
-            webpage, u'video URL')
-
-        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
-            webpage, u'title')
-
-        return [{
-            'id':       video_id,
-            'url':      video_url,
-            'ext':      video_extension,
-            'title':    video_title,
-        }]
-        
-class InaIE(InfoExtractor):
-    """Information Extractor for Ina.fr"""
-    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
-
-    def _real_extract(self,url):
-        mobj = re.match(self._VALID_URL, url)
-
-        video_id = mobj.group('id')
-        mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
-        video_extension = 'mp4'
-        webpage = self._download_webpage(mrss_url, video_id)
-
-        self.report_extraction(video_id)
-
-        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
-            webpage, u'video URL')
-
-        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
-            webpage, u'title')
-
-        return [{
-            'id':       video_id,
-            'url':      video_url,
-            'ext':      video_extension,
-            'title':    video_title,
-        }]
-
-class HowcastIE(InfoExtractor):
-    """Information Extractor for Howcast.com"""
-    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-
-        video_id = mobj.group('id')
-        webpage_url = 'http://www.howcast.com/videos/' + video_id
-        webpage = self._download_webpage(webpage_url, video_id)
-
-        self.report_extraction(video_id)
-
-        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
-            webpage, u'video URL')
-
-        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
-            webpage, u'title')
-
-        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
-            webpage, u'description', fatal=False)
-
-        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
-            webpage, u'thumbnail', fatal=False)
-
-        return [{
-            'id':       video_id,
-            'url':      video_url,
-            'ext':      'mp4',
-            'title':    video_title,
-            'description': video_description,
-            'thumbnail': thumbnail,
-        }]
-
-class VineIE(InfoExtractor):
-    """Information Extractor for Vine.co"""
-    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-
-        video_id = mobj.group('id')
-        webpage_url = 'https://vine.co/v/' + video_id
-        webpage = self._download_webpage(webpage_url, video_id)
-
-        self.report_extraction(video_id)
-
-        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
-            webpage, u'video URL')
-
-        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
-            webpage, u'title')
-
-        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
-            webpage, u'thumbnail', fatal=False)
-
-        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
-            webpage, u'uploader', fatal=False, flags=re.DOTALL)
-
-        return [{
-            'id':        video_id,
-            'url':       video_url,
-            'ext':       'mp4',
-            'title':     video_title,
-            'thumbnail': thumbnail,
-            'uploader':  uploader,
-        }]
-
-class FlickrIE(InfoExtractor):
-    """Information Extractor for Flickr videos"""
-    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-
-        video_id = mobj.group('id')
-        video_uploader_id = mobj.group('uploader_id')
-        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
-        webpage = self._download_webpage(webpage_url, video_id)
-
-        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
-
-        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
-        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
-
-        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
-            first_xml, u'node_id')
-
-        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
-        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
-
-        self.report_extraction(video_id)
-
-        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract video url')
-        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
-
-        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
-            webpage, u'video title')
-
-        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
-            webpage, u'description', fatal=False)
-
-        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
-            webpage, u'thumbnail', fatal=False)
-
-        return [{
-            'id':          video_id,
-            'url':         video_url,
-            'ext':         'mp4',
-            'title':       video_title,
-            'description': video_description,
-            'thumbnail':   thumbnail,
-            'uploader_id': video_uploader_id,
-        }]
-
-class TeamcocoIE(InfoExtractor):
-    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
-        url_title = mobj.group('url_title')
-        webpage = self._download_webpage(url, url_title)
-
-        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
-            webpage, u'video id')
-
-        self.report_extraction(video_id)
-
-        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
-            webpage, u'title')
-
-        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
-            webpage, u'thumbnail', fatal=False)
-
-        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
-            webpage, u'description', fatal=False)
-
-        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
-        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
-
-        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
-            data, u'video URL')
-
-        return [{
-            'id':          video_id,
-            'url':         video_url,
-            'ext':         'mp4',
-            'title':       video_title,
-            'thumbnail':   thumbnail,
-            'description': video_description,
-        }]
-
-class XHamsterIE(InfoExtractor):
-    """Information Extractor for xHamster"""
-    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
-
-    def _real_extract(self,url):
-        mobj = re.match(self._VALID_URL, url)
-
-        video_id = mobj.group('id')
-        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
-        webpage = self._download_webpage(mrss_url, video_id)
-
-        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract media URL')
-        if len(mobj.group('server')) == 0:
-            video_url = compat_urllib_parse.unquote(mobj.group('file'))
-        else:
-            video_url = mobj.group('server')+'/key='+mobj.group('file')
-        video_extension = video_url.split('.')[-1]
-
-        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
-            webpage, u'title')
-
-        # Can't see the description anywhere in the UI
-        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
-        #     webpage, u'description', fatal=False)
-        # if video_description: video_description = unescapeHTML(video_description)
-
-        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
-        if mobj:
-            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
-        else:
-            video_upload_date = None
-            self._downloader.report_warning(u'Unable to extract upload date')
-
-        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
-            webpage, u'uploader id', default=u'anonymous')
-
-        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
-            webpage, u'thumbnail', fatal=False)
-
-        return [{
-            'id':       video_id,
-            'url':      video_url,
-            'ext':      video_extension,
-            'title':    video_title,
-            # 'description': video_description,
-            'upload_date': video_upload_date,
-            'uploader_id': video_uploader_id,
-            'thumbnail': video_thumbnail
-        }]
-
-class HypemIE(InfoExtractor):
-    """Information Extractor for hypem"""
-    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
-        track_id = mobj.group(1)
-
-        data = { 'ax': 1, 'ts': time.time() }
-        data_encoded = compat_urllib_parse.urlencode(data)
-        complete_url = url + "?" + data_encoded
-        request = compat_urllib_request.Request(complete_url)
-        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
-        cookie = urlh.headers.get('Set-Cookie', '')
-
-        self.report_extraction(track_id)
-
-        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
-            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
-        try:
-            track_list = json.loads(html_tracks)
-            track = track_list[u'tracks'][0]
-        except ValueError:
-            raise ExtractorError(u'Hypemachine contained invalid JSON.')
-
-        key = track[u"key"]
-        track_id = track[u"id"]
-        artist = track[u"artist"]
-        title = track[u"song"]
-
-        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
-        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
-        request.add_header('cookie', cookie)
-        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
-        try:
-            song_data = json.loads(song_data_json)
-        except ValueError:
-            raise ExtractorError(u'Hypemachine contained invalid JSON.')
-        final_url = song_data[u"url"]
-
-        return [{
-            'id':       track_id,
-            'url':      final_url,
-            'ext':      "mp3",
-            'title':    title,
-            'artist':   artist,
-        }]
-
-class Vbox7IE(InfoExtractor):
-    """Information Extractor for Vbox7"""
-    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
-
-    def _real_extract(self,url):
-        mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
-        video_id = mobj.group(1)
-
-        redirect_page, urlh = self._download_webpage_handle(url, video_id)
-        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
-        redirect_url = urlh.geturl() + new_location
-        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
-
-        title = self._html_search_regex(r'<title>(.*)</title>',
-            webpage, u'title').split('/')[0].strip()
-
-        ext = "flv"
-        info_url = "http://vbox7.com/play/magare.do"
-        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
-        info_request = compat_urllib_request.Request(info_url, data)
-        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
-        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
-        if info_response is None:
-            raise ExtractorError(u'Unable to extract the media url')
-        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
-
-        return [{
-            'id':        video_id,
-            'url':       final_url,
-            'ext':       ext,
-            'title':     title,
-            'thumbnail': thumbnail_url,
-        }]
-
-
-def gen_extractors():
-    """ Return a list of an instance of every supported extractor.
-    The order does matter; the first extractor matched is the one handling the URL.
-    """
-    return [
-        YoutubePlaylistIE(),
-        YoutubeChannelIE(),
-        YoutubeUserIE(),
-        YoutubeSearchIE(),
-        YoutubeIE(),
-        MetacafeIE(),
-        DailymotionIE(),
-        GoogleSearchIE(),
-        PhotobucketIE(),
-        YahooIE(),
-        YahooSearchIE(),
-        DepositFilesIE(),
-        FacebookIE(),
-        BlipTVIE(),
-        BlipTVUserIE(),
-        VimeoIE(),
-        MyVideoIE(),
-        ComedyCentralIE(),
-        EscapistIE(),
-        CollegeHumorIE(),
-        XVideosIE(),
-        SoundcloudSetIE(),
-        SoundcloudIE(),
-        InfoQIE(),
-        MixcloudIE(),
-        StanfordOpenClassroomIE(),
-        MTVIE(),
-        YoukuIE(),
-        XNXXIE(),
-        YouJizzIE(),
-        PornotubeIE(),
-        YouPornIE(),
-        GooglePlusIE(),
-        ArteTvIE(),
-        NBAIE(),
-        WorldStarHipHopIE(),
-        JustinTVIE(),
-        FunnyOrDieIE(),
-        SteamIE(),
-        UstreamIE(),
-        RBMARadioIE(),
-        EightTracksIE(),
-        KeekIE(),
-        TEDIE(),
-        MySpassIE(),
-        SpiegelIE(),
-        LiveLeakIE(),
-        ARDIE(),
-        ZDFIE(),
-        TumblrIE(),
-        BandcampIE(),
-        RedTubeIE(),
-        InaIE(),
-        HowcastIE(),
-        VineIE(),
-        FlickrIE(),
-        TeamcocoIE(),
-        XHamsterIE(),
-        HypemIE(),
-        Vbox7IE(),
-        GametrailersIE(),
-        StatigramIE(),
-        GenericIE()
-    ]
-
-def get_info_extractor(ie_name):
-    """Returns the info extractor class with the given ie_name"""
-    return globals()[ie_name+'IE']
+from .extractor import gen_extractors, get_info_extractor