_ Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import base64
   7 import datetime
   8 import itertools
   9 import netrc
  10 import os
  11 import re
  12 import socket
  13 import time
  14 import email.utils
  15 import xml.etree.ElementTree
  16 import random
  17 import math
  18
  19 from .utils import *
  20
  21
  22 class InfoExtractor(object):
  23     """Information Extractor class.
  24
  25     Information extractors are the classes that, given a URL, extract
  26     information about the video (or videos) the URL refers to. This
  27     information includes the real video URL, the video title, author and
  28     others. The information is stored in a dictionary which is then
  29     passed to the FileDownloader. The FileDownloader processes this
  30     information possibly downloading the video to the file system, among
  31     other possible outcomes.
  32
  33     The dictionaries must include the following fields:
  34
  35     id:             Video identifier.
  36     url:            Final video URL.
  37     title:          Video title, unescaped.
  38     ext:            Video filename extension.
  39
  40     The following fields are optional:
  41
  42     format:         The video format, defaults to ext (used for --get-format)
  43     thumbnail:      Full URL to a video thumbnail image.
  44     description:    One-line video description.
  45     uploader:       Full name of the video uploader.
  46     upload_date:    Video upload date (YYYYMMDD).
  47     uploader_id:    Nickname or id of the video uploader.
  48     location:       Physical location of the video.
  49     player_url:     SWF Player URL (used for rtmpdump).
  50     subtitles:      The .srt file contents.
  51     urlhandle:      [internal] The urlHandle to be used to download the file,
  52                     like returned by urllib.request.urlopen
  53
  54     The fields should all be Unicode strings.
  55
  56     Subclasses of this one should re-define the _real_initialize() and
  57     _real_extract() methods and define a _VALID_URL regexp.
  58     Probably, they should also be added to the list of extractors.
  59
  60     _real_extract() must return a *list* of information dictionaries as
  61     described above.
  62
  63     Finally, the _WORKING attribute should be set to False for broken IEs
  64     in order to warn the users and skip the tests.
  65     """
  66
  67     _ready = False
  68     _downloader = None
  69     _WORKING = True
  70
  71     def __init__(self, downloader=None):
  72         """Constructor. Receives an optional downloader."""
  73         self._ready = False
  74         self.set_downloader(downloader)
  75
  76     def suitable(self, url):
  77         """Receives a URL and returns True if suitable for this IE."""
  78         return re.match(self._VALID_URL, url) is not None
  79
  80     def working(self):
  81         """Getter method for _WORKING."""
  82         return self._WORKING
  83
  84     def initialize(self):
  85         """Initializes an instance (authentication, etc)."""
  86         if not self._ready:
  87             self._real_initialize()
  88             self._ready = True
  89
  90     def extract(self, url):
  91         """Extracts URL information and returns it in list of dicts."""
  92         self.initialize()
  93         return self._real_extract(url)
  94
  95     def set_downloader(self, downloader):
  96         """Sets the downloader for this IE."""
  97         self._downloader = downloader
  98
  99     def _real_initialize(self):
 100         """Real initialization process. Redefine in subclasses."""
 101         pass
 102
 103     def _real_extract(self, url):
 104         """Real extraction process. Redefine in subclasses."""
 105         pass
 106
 107     @property
 108     def IE_NAME(self):
 109         return type(self).__name__[:-2]
 110
 111     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
 112         """ Returns the response handle """
 113         if note is None:
 114             note = u'Downloading video webpage'
 115         self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
 116         try:
 117             return compat_urllib_request.urlopen(url_or_request)
 118         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 119             if errnote is None:
 120                 errnote = u'Unable to download webpage'
 121             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
 122
 123     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 124         """ Returns the data of the page as a string """
 125         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
 126         webpage_bytes = urlh.read()
 127         return webpage_bytes.decode('utf-8', 'replace')
 128
 129
 130 class YoutubeIE(InfoExtractor):
 131     """Information extractor for youtube.com."""
 132
 133     _VALID_URL = r"""^
 134                      (
 135                          (?:https?://)?                                       # http(s):// (optional)
 136                          (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 137                             tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 138                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 139                          (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
 140                          (?:                                                  # the various things that can precede the ID:
 141                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 142                              |(?:                                             # or the v= param in all its forms
 143                                  (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 144                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 145                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 146                                  v=
 147                              )
 148                          )?                                                   # optional -> youtube.com/xxxx is OK
 149                      )?                                                       # all until now is optional -> you can pass the naked ID
 150                      ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 151                      (?(1).+)?                                                # if we found the ID, everything can follow
 152                      $"""
 153     _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 154     _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
 155     _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 156     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 157     _NETRC_MACHINE = 'youtube'
 158     # Listed in order of quality
 159     _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 160     _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 161     _video_extensions = {
 162         '13': '3gp',
 163         '17': 'mp4',
 164         '18': 'mp4',
 165         '22': 'mp4',
 166         '37': 'mp4',
 167         '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 168         '43': 'webm',
 169         '44': 'webm',
 170         '45': 'webm',
 171         '46': 'webm',
 172     }
 173     _video_dimensions = {
 174         '5': '240x400',
 175         '6': '???',
 176         '13': '???',
 177         '17': '144x176',
 178         '18': '360x640',
 179         '22': '720x1280',
 180         '34': '360x640',
 181         '35': '480x854',
 182         '37': '1080x1920',
 183         '38': '3072x4096',
 184         '43': '360x640',
 185         '44': '480x854',
 186         '45': '720x1280',
 187         '46': '1080x1920',
 188     }
 189     IE_NAME = u'youtube'
 190
 191     def suitable(self, url):
 192         """Receives a URL and returns True if suitable for this IE."""
 193         return re.match(self._VALID_URL, url, re.VERBOSE) is not None
 194
 195     def report_lang(self):
 196         """Report attempt to set language."""
 197         self._downloader.to_screen(u'[youtube] Setting language')
 198
 199     def report_login(self):
 200         """Report attempt to log in."""
 201         self._downloader.to_screen(u'[youtube] Logging in')
 202
 203     def report_age_confirmation(self):
 204         """Report attempt to confirm age."""
 205         self._downloader.to_screen(u'[youtube] Confirming age')
 206
 207     def report_video_webpage_download(self, video_id):
 208         """Report attempt to download video webpage."""
 209         self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
 210
 211     def report_video_info_webpage_download(self, video_id):
 212         """Report attempt to download video info webpage."""
 213         self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 214
 215     def report_video_subtitles_download(self, video_id):
 216         """Report attempt to download video info webpage."""
 217         self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
 218
 219     def report_information_extraction(self, video_id):
 220         """Report attempt to extract video information."""
 221         self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
 222
 223     def report_unavailable_format(self, video_id, format):
 224         """Report extracted video URL."""
 225         self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
 226
 227     def report_rtmp_download(self):
 228         """Indicate the download will use the RTMP protocol."""
 229         self._downloader.to_screen(u'[youtube] RTMP download detected')
 230
 231     def _closed_captions_xml_to_srt(self, xml_string):
 232         srt = ''
 233         texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
 234         # TODO parse xml instead of regex
 235         for n, (start, dur_tag, dur, caption) in enumerate(texts):
 236             if not dur: dur = '4'
 237             start = float(start)
 238             end = start + float(dur)
 239             start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
 240             end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
 241             caption = unescapeHTML(caption)
 242             caption = unescapeHTML(caption) # double cycle, intentional
 243             srt += str(n+1) + '\n'
 244             srt += start + ' --> ' + end + '\n'
 245             srt += caption + '\n\n'
 246         return srt
 247
 248     def _extract_subtitles(self, video_id):
 249         self.report_video_subtitles_download(video_id)
 250         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 251         try:
 252             srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
 253         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 254             return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
 255         srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
 256         srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
 257         if not srt_lang_list:
 258             return (u'WARNING: video has no closed captions', None)
 259         if self._downloader.params.get('subtitleslang', False):
 260             srt_lang = self._downloader.params.get('subtitleslang')
 261         elif 'en' in srt_lang_list:
 262             srt_lang = 'en'
 263         else:
 264             srt_lang = list(srt_lang_list.keys())[0]
 265         if not srt_lang in srt_lang_list:
 266             return (u'WARNING: no closed captions found in the specified language', None)
 267         params = compat_urllib_parse.urlencode({
 268             'lang': srt_lang,
 269             'name': srt_lang_list[srt_lang].encode('utf-8'),
 270             'v': video_id,
 271         })
 272         url = 'http://www.youtube.com/api/timedtext?' + params
 273         try:
 274             srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
 275         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 276             return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
 277         if not srt_xml:
 278             return (u'WARNING: Did not fetch video subtitles', None)
 279         return (None, self._closed_captions_xml_to_srt(srt_xml))
 280
 281     def _print_formats(self, formats):
 282         print('Available formats:')
 283         for x in formats:
 284             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 285
 286     def _real_initialize(self):
 287         if self._downloader is None:
 288             return
 289
 290         username = None
 291         password = None
 292         downloader_params = self._downloader.params
 293
 294         # Attempt to use provided username and password or .netrc data
 295         if downloader_params.get('username', None) is not None:
 296             username = downloader_params['username']
 297             password = downloader_params['password']
 298         elif downloader_params.get('usenetrc', False):
 299             try:
 300                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 301                 if info is not None:
 302                     username = info[0]
 303                     password = info[2]
 304                 else:
 305                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 306             except (IOError, netrc.NetrcParseError) as err:
 307                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
 308                 return
 309
 310         # Set language
 311         request = compat_urllib_request.Request(self._LANG_URL)
 312         try:
 313             self.report_lang()
 314             compat_urllib_request.urlopen(request).read()
 315         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 316             self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
 317             return
 318
 319         # No authentication to be performed
 320         if username is None:
 321             return
 322
 323         # Log in
 324         login_form = {
 325                 'current_form': 'loginForm',
 326                 'next':     '/',
 327                 'action_login': 'Log In',
 328                 'username': username,
 329                 'password': password,
 330                 }
 331         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
 332         try:
 333             self.report_login()
 334             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 335             if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 336                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 337                 return
 338         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 339             self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
 340             return
 341
 342         # Confirm age
 343         age_form = {
 344                 'next_url':     '/',
 345                 'action_confirm':   'Confirm',
 346                 }
 347         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 348         try:
 349             self.report_age_confirmation()
 350             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 351         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 352             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
 353             return
 354
 355     def _extract_id(self, url):
 356         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 357         if mobj is None:
 358             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 359             return
 360         video_id = mobj.group(2)
 361         return video_id
 362
 363     def _real_extract(self, url):
 364         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 365         mobj = re.search(self._NEXT_URL_RE, url)
 366         if mobj:
 367             url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 368         video_id = self._extract_id(url)
 369
 370         # Get video webpage
 371         self.report_video_webpage_download(video_id)
 372         url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 373         request = compat_urllib_request.Request(url)
 374         try:
 375             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
 376         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 377             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
 378             return
 379
 380         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
 381
 382         # Attempt to extract SWF player URL
 383         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 384         if mobj is not None:
 385             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 386         else:
 387             player_url = None
 388
 389         # Get video info
 390         self.report_video_info_webpage_download(video_id)
 391         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 392             video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 393                     % (video_id, el_type))
 394             request = compat_urllib_request.Request(video_info_url)
 395             try:
 396                 video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
 397                 video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
 398                 video_info = compat_parse_qs(video_info_webpage)
 399                 if 'token' in video_info:
 400                     break
 401             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 402                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
 403                 return
 404         if 'token' not in video_info:
 405             if 'reason' in video_info:
 406                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
 407             else:
 408                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
 409             return
 410
 411         # Check for "rental" videos
 412         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 413             self._downloader.trouble(u'ERROR: "rental" videos not supported')
 414             return
 415
 416         # Start extracting information
 417         self.report_information_extraction(video_id)
 418
 419         # uploader
 420         if 'author' not in video_info:
 421             self._downloader.trouble(u'ERROR: unable to extract uploader name')
 422             return
 423         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 424
 425         # uploader_id
 426         video_uploader_id = None
 427         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
 428         if mobj is not None:
 429             video_uploader_id = mobj.group(1)
 430         else:
 431             self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
 432
 433         # title
 434         if 'title' not in video_info:
 435             self._downloader.trouble(u'ERROR: unable to extract video title')
 436             return
 437         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
 438
 439         # thumbnail image
 440         if 'thumbnail_url' not in video_info:
 441             self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
 442             video_thumbnail = ''
 443         else:   # don't panic if we can't find it
 444             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 445
 446         # upload date
 447         upload_date = None
 448         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 449         if mobj is not None:
 450             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 451             format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
 452             for expression in format_expressions:
 453                 try:
 454                     upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
 455                 except:
 456                     pass
 457
 458         # description
 459         video_description = get_element_by_id("eow-description", video_webpage)
 460         if video_description:
 461             video_description = clean_html(video_description)
 462         else:
 463             video_description = ''
 464
 465         # closed captions
 466         video_subtitles = None
 467         if self._downloader.params.get('writesubtitles', False):
 468             (srt_error, video_subtitles) = self._extract_subtitles(video_id)
 469             if srt_error:
 470                 self._downloader.trouble(srt_error)
 471
 472         if 'length_seconds' not in video_info:
 473             self._downloader.trouble(u'WARNING: unable to extract video duration')
 474             video_duration = ''
 475         else:
 476             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 477
 478         # token
 479         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
 480
 481         # Decide which formats to download
 482         req_format = self._downloader.params.get('format', None)
 483
 484         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 485             self.report_rtmp_download()
 486             video_url_list = [(None, video_info['conn'][0])]
 487         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 488             url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
 489             url_data = [compat_parse_qs(uds) for uds in url_data_strs]
 490             url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
 491             url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
 492
 493             format_limit = self._downloader.params.get('format_limit', None)
 494             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 495             if format_limit is not None and format_limit in available_formats:
 496                 format_list = available_formats[available_formats.index(format_limit):]
 497             else:
 498                 format_list = available_formats
 499             existing_formats = [x for x in format_list if x in url_map]
 500             if len(existing_formats) == 0:
 501                 self._downloader.trouble(u'ERROR: no known formats available for video')
 502                 return
 503             if self._downloader.params.get('listformats', None):
 504                 self._print_formats(existing_formats)
 505                 return
 506             if req_format is None or req_format == 'best':
 507                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 508             elif req_format == 'worst':
 509                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 510             elif req_format in ('-1', 'all'):
 511                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 512             else:
 513                 # Specific formats. We pick the first in a slash-delimeted sequence.
 514                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 515                 req_formats = req_format.split('/')
 516                 video_url_list = None
 517                 for rf in req_formats:
 518                     if rf in url_map:
 519                         video_url_list = [(rf, url_map[rf])]
 520                         break
 521                 if video_url_list is None:
 522                     self._downloader.trouble(u'ERROR: requested format not available')
 523                     return
 524         else:
 525             self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
 526             return
 527
 528         results = []
 529         for format_param, video_real_url in video_url_list:
 530             # Extension
 531             video_extension = self._video_extensions.get(format_param, 'flv')
 532
 533             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
 534                                               self._video_dimensions.get(format_param, '???'))
 535
 536             results.append({
 537                 'id':       video_id,
 538                 'url':      video_real_url,
 539                 'uploader': video_uploader,
 540                 'uploader_id': video_uploader_id,
 541                 'upload_date':  upload_date,
 542                 'title':    video_title,
 543                 'ext':      video_extension,
 544                 'format':   video_format,
 545                 'thumbnail':    video_thumbnail,
 546                 'description':  video_description,
 547                 'player_url':   player_url,
 548                 'subtitles':    video_subtitles,
 549                 'duration':     video_duration
 550             })
 551         return results
 552
 553
 554 class MetacafeIE(InfoExtractor):
 555     """Information Extractor for metacafe.com."""
 556
 557     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 558     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 559     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 560     IE_NAME = u'metacafe'
 561
 562     def __init__(self, downloader=None):
 563         InfoExtractor.__init__(self, downloader)
 564
 565     def report_disclaimer(self):
 566         """Report disclaimer retrieval."""
 567         self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
 568
 569     def report_age_confirmation(self):
 570         """Report attempt to confirm age."""
 571         self._downloader.to_screen(u'[metacafe] Confirming age')
 572
 573     def report_download_webpage(self, video_id):
 574         """Report webpage download."""
 575         self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
 576
 577     def report_extraction(self, video_id):
 578         """Report information extraction."""
 579         self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
 580
 581     def _real_initialize(self):
 582         # Retrieve disclaimer
 583         request = compat_urllib_request.Request(self._DISCLAIMER)
 584         try:
 585             self.report_disclaimer()
 586             disclaimer = compat_urllib_request.urlopen(request).read()
 587         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 588             self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
 589             return
 590
 591         # Confirm age
 592         disclaimer_form = {
 593             'filters': '0',
 594             'submit': "Continue - I'm over 18",
 595             }
 596         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
 597         try:
 598             self.report_age_confirmation()
 599             disclaimer = compat_urllib_request.urlopen(request).read()
 600         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 601             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
 602             return
 603
 604     def _real_extract(self, url):
 605         # Extract id and simplified title from URL
 606         mobj = re.match(self._VALID_URL, url)
 607         if mobj is None:
 608             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 609             return
 610
 611         video_id = mobj.group(1)
 612
 613         # Check if video comes from YouTube
 614         mobj2 = re.match(r'^yt-(.*)$', video_id)
 615         if mobj2 is not None:
 616             self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
 617             return
 618
 619         # Retrieve video webpage to extract further information
 620         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
 621         try:
 622             self.report_download_webpage(video_id)
 623             webpage = compat_urllib_request.urlopen(request).read()
 624         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 625             self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
 626             return
 627
 628         # Extract URL, uploader and title from webpage
 629         self.report_extraction(video_id)
 630         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 631         if mobj is not None:
 632             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 633             video_extension = mediaURL[-3:]
 634
 635             # Extract gdaKey if available
 636             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 637             if mobj is None:
 638                 video_url = mediaURL
 639             else:
 640                 gdaKey = mobj.group(1)
 641                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 642         else:
 643             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 644             if mobj is None:
 645                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 646                 return
 647             vardict = compat_parse_qs(mobj.group(1))
 648             if 'mediaData' not in vardict:
 649                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 650                 return
 651             mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
 652             if mobj is None:
 653                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 654                 return
 655             mediaURL = mobj.group(1).replace('\\/', '/')
 656             video_extension = mediaURL[-3:]
 657             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
 658
 659         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 660         if mobj is None:
 661             self._downloader.trouble(u'ERROR: unable to extract title')
 662             return
 663         video_title = mobj.group(1).decode('utf-8')
 664
 665         mobj = re.search(r'submitter=(.*?);', webpage)
 666         if mobj is None:
 667             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 668             return
 669         video_uploader = mobj.group(1)
 670
 671         return [{
 672             'id':       video_id.decode('utf-8'),
 673             'url':      video_url.decode('utf-8'),
 674             'uploader': video_uploader.decode('utf-8'),
 675             'upload_date':  None,
 676             'title':    video_title,
 677             'ext':      video_extension.decode('utf-8'),
 678         }]
 679
 680
 681 class DailymotionIE(InfoExtractor):
 682     """Information Extractor for Dailymotion"""
 683
 684     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 685     IE_NAME = u'dailymotion'
 686
 687     def __init__(self, downloader=None):
 688         InfoExtractor.__init__(self, downloader)
 689
 690     def report_extraction(self, video_id):
 691         """Report information extraction."""
 692         self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
 693
 694     def _real_extract(self, url):
 695         # Extract id and simplified title from URL
 696         mobj = re.match(self._VALID_URL, url)
 697         if mobj is None:
 698             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 699             return
 700
 701         video_id = mobj.group(1).split('_')[0].split('?')[0]
 702
 703         video_extension = 'mp4'
 704
 705         # Retrieve video webpage to extract further information
 706         request = compat_urllib_request.Request(url)
 707         request.add_header('Cookie', 'family_filter=off')
 708         webpage = self._download_webpage(request, video_id)
 709
 710         # Extract URL, uploader and title from webpage
 711         self.report_extraction(video_id)
 712         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 713         if mobj is None:
 714             self._downloader.trouble(u'ERROR: unable to extract media URL')
 715             return
 716         flashvars = compat_urllib_parse.unquote(mobj.group(1))
 717
 718         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 719             if key in flashvars:
 720                 max_quality = key
 721                 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
 722                 break
 723         else:
 724             self._downloader.trouble(u'ERROR: unable to extract video URL')
 725             return
 726
 727         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 728         if mobj is None:
 729             self._downloader.trouble(u'ERROR: unable to extract video URL')
 730             return
 731
 732         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
 733
 734         # TODO: support choosing qualities
 735
 736         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 737         if mobj is None:
 738             self._downloader.trouble(u'ERROR: unable to extract title')
 739             return
 740         video_title = unescapeHTML(mobj.group('title'))
 741
 742         video_uploader = None
 743         mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
 744         if mobj is None:
 745             # lookin for official user
 746             mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
 747             if mobj_official is None:
 748                 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
 749             else:
 750                 video_uploader = mobj_official.group(1)
 751         else:
 752             video_uploader = mobj.group(1)
 753
 754         video_upload_date = None
 755         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 756         if mobj is not None:
 757             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 758
 759         return [{
 760             'id':       video_id,
 761             'url':      video_url,
 762             'uploader': video_uploader,
 763             'upload_date':  video_upload_date,
 764             'title':    video_title,
 765             'ext':      video_extension,
 766         }]
 767
 768
 769 class PhotobucketIE(InfoExtractor):
 770     """Information extractor for photobucket.com."""
 771
 772     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 773     IE_NAME = u'photobucket'
 774
 775     def __init__(self, downloader=None):
 776         InfoExtractor.__init__(self, downloader)
 777
 778     def report_download_webpage(self, video_id):
 779         """Report webpage download."""
 780         self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
 781
 782     def report_extraction(self, video_id):
 783         """Report information extraction."""
 784         self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
 785
 786     def _real_extract(self, url):
 787         # Extract id from URL
 788         mobj = re.match(self._VALID_URL, url)
 789         if mobj is None:
 790             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 791             return
 792
 793         video_id = mobj.group(1)
 794
 795         video_extension = 'flv'
 796
 797         # Retrieve video webpage to extract further information
 798         request = compat_urllib_request.Request(url)
 799         try:
 800             self.report_download_webpage(video_id)
 801             webpage = compat_urllib_request.urlopen(request).read()
 802         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 803             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 804             return
 805
 806         # Extract URL, uploader, and title from webpage
 807         self.report_extraction(video_id)
 808         mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 809         if mobj is None:
 810             self._downloader.trouble(u'ERROR: unable to extract media URL')
 811             return
 812         mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 813
 814         video_url = mediaURL
 815
 816         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 817         if mobj is None:
 818             self._downloader.trouble(u'ERROR: unable to extract title')
 819             return
 820         video_title = mobj.group(1).decode('utf-8')
 821
 822         video_uploader = mobj.group(2).decode('utf-8')
 823
 824         return [{
 825             'id':       video_id.decode('utf-8'),
 826             'url':      video_url.decode('utf-8'),
 827             'uploader': video_uploader,
 828             'upload_date':  None,
 829             'title':    video_title,
 830             'ext':      video_extension.decode('utf-8'),
 831         }]
 832
 833
 834 class YahooIE(InfoExtractor):
 835     """Information extractor for video.yahoo.com."""
 836
 837     _WORKING = False
 838     # _VALID_URL matches all Yahoo! Video URLs
 839     # _VPAGE_URL matches only the extractable '/watch/' URLs
 840     _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
 841     _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
 842     IE_NAME = u'video.yahoo'
 843
 844     def __init__(self, downloader=None):
 845         InfoExtractor.__init__(self, downloader)
 846
 847     def report_download_webpage(self, video_id):
 848         """Report webpage download."""
 849         self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
 850
 851     def report_extraction(self, video_id):
 852         """Report information extraction."""
 853         self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
 854
 855     def _real_extract(self, url, new_video=True):
 856         # Extract ID from URL
 857         mobj = re.match(self._VALID_URL, url)
 858         if mobj is None:
 859             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 860             return
 861
 862         video_id = mobj.group(2)
 863         video_extension = 'flv'
 864
 865         # Rewrite valid but non-extractable URLs as
 866         # extractable English language /watch/ URLs
 867         if re.match(self._VPAGE_URL, url) is None:
 868             request = compat_urllib_request.Request(url)
 869             try:
 870                 webpage = compat_urllib_request.urlopen(request).read()
 871             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 872                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 873                 return
 874
 875             mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
 876             if mobj is None:
 877                 self._downloader.trouble(u'ERROR: Unable to extract id field')
 878                 return
 879             yahoo_id = mobj.group(1)
 880
 881             mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
 882             if mobj is None:
 883                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
 884                 return
 885             yahoo_vid = mobj.group(1)
 886
 887             url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
 888             return self._real_extract(url, new_video=False)
 889
 890         # Retrieve video webpage to extract further information
 891         request = compat_urllib_request.Request(url)
 892         try:
 893             self.report_download_webpage(video_id)
 894             webpage = compat_urllib_request.urlopen(request).read()
 895         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 896             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 897             return
 898
 899         # Extract uploader and title from webpage
 900         self.report_extraction(video_id)
 901         mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
 902         if mobj is None:
 903             self._downloader.trouble(u'ERROR: unable to extract video title')
 904             return
 905         video_title = mobj.group(1).decode('utf-8')
 906
 907         mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
 908         if mobj is None:
 909             self._downloader.trouble(u'ERROR: unable to extract video uploader')
 910             return
 911         video_uploader = mobj.group(1).decode('utf-8')
 912
 913         # Extract video thumbnail
 914         mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
 915         if mobj is None:
 916             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 917             return
 918         video_thumbnail = mobj.group(1).decode('utf-8')
 919
 920         # Extract video description
 921         mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
 922         if mobj is None:
 923             self._downloader.trouble(u'ERROR: unable to extract video description')
 924             return
 925         video_description = mobj.group(1).decode('utf-8')
 926         if not video_description:
 927             video_description = 'No description available.'
 928
 929         # Extract video height and width
 930         mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
 931         if mobj is None:
 932             self._downloader.trouble(u'ERROR: unable to extract video height')
 933             return
 934         yv_video_height = mobj.group(1)
 935
 936         mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
 937         if mobj is None:
 938             self._downloader.trouble(u'ERROR: unable to extract video width')
 939             return
 940         yv_video_width = mobj.group(1)
 941
 942         # Retrieve video playlist to extract media URL
 943         # I'm not completely sure what all these options are, but we
 944         # seem to need most of them, otherwise the server sends a 401.
 945         yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
 946         yv_bitrate = '700'  # according to Wikipedia this is hard-coded
 947         request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
 948                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
 949                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
 950         try:
 951             self.report_download_webpage(video_id)
 952             webpage = compat_urllib_request.urlopen(request).read()
 953         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 954             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 955             return
 956
 957         # Extract media URL from playlist XML
 958         mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
 959         if mobj is None:
 960             self._downloader.trouble(u'ERROR: Unable to extract media URL')
 961             return
 962         video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
 963         video_url = unescapeHTML(video_url)
 964
 965         return [{
 966             'id':       video_id.decode('utf-8'),
 967             'url':      video_url,
 968             'uploader': video_uploader,
 969             'upload_date':  None,
 970             'title':    video_title,
 971             'ext':      video_extension.decode('utf-8'),
 972             'thumbnail':    video_thumbnail.decode('utf-8'),
 973             'description':  video_description,
 974         }]
 975
 976
 977 class VimeoIE(InfoExtractor):
 978     """Information extractor for vimeo.com."""
 979
 980     # _VALID_URL matches Vimeo URLs
 981     _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
 982     IE_NAME = u'vimeo'
 983
 984     def __init__(self, downloader=None):
 985         InfoExtractor.__init__(self, downloader)
 986
 987     def report_download_webpage(self, video_id):
 988         """Report webpage download."""
 989         self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
 990
 991     def report_extraction(self, video_id):
 992         """Report information extraction."""
 993         self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
 994
 995     def _real_extract(self, url, new_video=True):
 996         # Extract ID from URL
 997         mobj = re.match(self._VALID_URL, url)
 998         if mobj is None:
 999             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1000             return
1001
1002         video_id = mobj.group(1)
1003
1004         # Retrieve video webpage to extract further information
1005         request = compat_urllib_request.Request(url, None, std_headers)
1006         try:
1007             self.report_download_webpage(video_id)
1008             webpage_bytes = compat_urllib_request.urlopen(request).read()
1009             webpage = webpage_bytes.decode('utf-8')
1010         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1011             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1012             return
1013
1014         # Now we begin extracting as much information as we can from what we
1015         # retrieved. First we extract the information common to all extractors,
1016         # and latter we extract those that are Vimeo specific.
1017         self.report_extraction(video_id)
1018
1019         # Extract the config JSON
1020         try:
1021             config = webpage.split(' = {config:')[1].split(',assets:')[0]
1022             config = json.loads(config)
1023         except:
1024             self._downloader.trouble(u'ERROR: unable to extract info section')
1025             return
1026
1027         # Extract title
1028         video_title = config["video"]["title"]
1029
1030         # Extract uploader and uploader_id
1031         video_uploader = config["video"]["owner"]["name"]
1032         video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1033
1034         # Extract video thumbnail
1035         video_thumbnail = config["video"]["thumbnail"]
1036
1037         # Extract video description
1038         video_description = get_element_by_attribute("itemprop", "description", webpage)
1039         if video_description: video_description = clean_html(video_description)
1040         else: video_description = ''
1041
1042         # Extract upload date
1043         video_upload_date = None
1044         mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1045         if mobj is not None:
1046             video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1047
1048         # Vimeo specific: extract request signature and timestamp
1049         sig = config['request']['signature']
1050         timestamp = config['request']['timestamp']
1051
1052         # Vimeo specific: extract video codec and quality information
1053         # First consider quality, then codecs, then take everything
1054         # TODO bind to format param
1055         codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1056         files = { 'hd': [], 'sd': [], 'other': []}
1057         for codec_name, codec_extension in codecs:
1058             if codec_name in config["video"]["files"]:
1059                 if 'hd' in config["video"]["files"][codec_name]:
1060                     files['hd'].append((codec_name, codec_extension, 'hd'))
1061                 elif 'sd' in config["video"]["files"][codec_name]:
1062                     files['sd'].append((codec_name, codec_extension, 'sd'))
1063                 else:
1064                     files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1065
1066         for quality in ('hd', 'sd', 'other'):
1067             if len(files[quality]) > 0:
1068                 video_quality = files[quality][0][2]
1069                 video_codec = files[quality][0][0]
1070                 video_extension = files[quality][0][1]
1071                 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1072                 break
1073         else:
1074             self._downloader.trouble(u'ERROR: no known codec found')
1075             return
1076
1077         video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1078                     %(video_id, sig, timestamp, video_quality, video_codec.upper())
1079
1080         return [{
1081             'id':       video_id,
1082             'url':      video_url,
1083             'uploader': video_uploader,
1084             'uploader_id': video_uploader_id,
1085             'upload_date':  video_upload_date,
1086             'title':    video_title,
1087             'ext':      video_extension,
1088             'thumbnail':    video_thumbnail,
1089             'description':  video_description,
1090         }]
1091
1092
1093 class ArteTvIE(InfoExtractor):
1094     """arte.tv information extractor."""
1095
1096     _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1097     _LIVE_URL = r'index-[0-9]+\.html$'
1098
1099     IE_NAME = u'arte.tv'
1100
1101     def __init__(self, downloader=None):
1102         InfoExtractor.__init__(self, downloader)
1103
1104     def report_download_webpage(self, video_id):
1105         """Report webpage download."""
1106         self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1107
1108     def report_extraction(self, video_id):
1109         """Report information extraction."""
1110         self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1111
1112     def fetch_webpage(self, url):
1113         request = compat_urllib_request.Request(url)
1114         try:
1115             self.report_download_webpage(url)
1116             webpage = compat_urllib_request.urlopen(request).read()
1117         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1118             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1119             return
1120         except ValueError as err:
1121             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1122             return
1123         return webpage
1124
1125     def grep_webpage(self, url, regex, regexFlags, matchTuples):
1126         page = self.fetch_webpage(url)
1127         mobj = re.search(regex, page, regexFlags)
1128         info = {}
1129
1130         if mobj is None:
1131             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1132             return
1133
1134         for (i, key, err) in matchTuples:
1135             if mobj.group(i) is None:
1136                 self._downloader.trouble(err)
1137                 return
1138             else:
1139                 info[key] = mobj.group(i)
1140
1141         return info
1142
1143     def extractLiveStream(self, url):
1144         video_lang = url.split('/')[-4]
1145         info = self.grep_webpage(
1146             url,
1147             r'src="(.*?/videothek_js.*?\.js)',
1148             0,
1149             [
1150                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1151             ]
1152         )
1153         http_host = url.split('/')[2]
1154         next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1155         info = self.grep_webpage(
1156             next_url,
1157             r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1158                 '(http://.*?\.swf).*?' +
1159                 '(rtmp://.*?)\'',
1160             re.DOTALL,
1161             [
1162                 (1, 'path',   u'ERROR: could not extract video path: %s' % url),
1163                 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1164                 (3, 'url',    u'ERROR: could not extract video url: %s' % url)
1165             ]
1166         )
1167         video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1168
1169     def extractPlus7Stream(self, url):
1170         video_lang = url.split('/')[-3]
1171         info = self.grep_webpage(
1172             url,
1173             r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1174             0,
1175             [
1176                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1177             ]
1178         )
1179         next_url = compat_urllib_parse.unquote(info.get('url'))
1180         info = self.grep_webpage(
1181             next_url,
1182             r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1183             0,
1184             [
1185                 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1186             ]
1187         )
1188         next_url = compat_urllib_parse.unquote(info.get('url'))
1189
1190         info = self.grep_webpage(
1191             next_url,
1192             r'<video id="(.*?)".*?>.*?' +
1193                 '<name>(.*?)</name>.*?' +
1194                 '<dateVideo>(.*?)</dateVideo>.*?' +
1195                 '<url quality="hd">(.*?)</url>',
1196             re.DOTALL,
1197             [
1198                 (1, 'id',    u'ERROR: could not extract video id: %s' % url),
1199                 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1200                 (3, 'date',  u'ERROR: could not extract video date: %s' % url),
1201                 (4, 'url',   u'ERROR: could not extract video url: %s' % url)
1202             ]
1203         )
1204
1205         return {
1206             'id':           info.get('id'),
1207             'url':          compat_urllib_parse.unquote(info.get('url')),
1208             'uploader':     u'arte.tv',
1209             'upload_date':  info.get('date'),
1210             'title':        info.get('title').decode('utf-8'),
1211             'ext':          u'mp4',
1212             'format':       u'NA',
1213             'player_url':   None,
1214         }
1215
1216     def _real_extract(self, url):
1217         video_id = url.split('/')[-1]
1218         self.report_extraction(video_id)
1219
1220         if re.search(self._LIVE_URL, video_id) is not None:
1221             self.extractLiveStream(url)
1222             return
1223         else:
1224             info = self.extractPlus7Stream(url)
1225
1226         return [info]
1227
1228
1229 class GenericIE(InfoExtractor):
1230     """Generic last-resort information extractor."""
1231
1232     _VALID_URL = r'.*'
1233     IE_NAME = u'generic'
1234
1235     def __init__(self, downloader=None):
1236         InfoExtractor.__init__(self, downloader)
1237
1238     def report_download_webpage(self, video_id):
1239         """Report webpage download."""
1240         self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1241         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1242
1243     def report_extraction(self, video_id):
1244         """Report information extraction."""
1245         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1246
1247     def report_following_redirect(self, new_url):
1248         """Report information extraction."""
1249         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1250
1251     def _test_redirect(self, url):
1252         """Check if it is a redirect, like url shorteners, in case restart chain."""
1253         class HeadRequest(compat_urllib_request.Request):
1254             def get_method(self):
1255                 return "HEAD"
1256
1257         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1258             """
1259             Subclass the HTTPRedirectHandler to make it use our
1260             HeadRequest also on the redirected URL
1261             """
1262             def redirect_request(self, req, fp, code, msg, headers, newurl):
1263                 if code in (301, 302, 303, 307):
1264                     newurl = newurl.replace(' ', '%20')
1265                     newheaders = dict((k,v) for k,v in req.headers.items()
1266                                       if k.lower() not in ("content-length", "content-type"))
1267                     return HeadRequest(newurl,
1268                                        headers=newheaders,
1269                                        origin_req_host=req.get_origin_req_host(),
1270                                        unverifiable=True)
1271                 else:
1272                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1273
1274         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1275             """
1276             Fallback to GET if HEAD is not allowed (405 HTTP error)
1277             """
1278             def http_error_405(self, req, fp, code, msg, headers):
1279                 fp.read()
1280                 fp.close()
1281
1282                 newheaders = dict((k,v) for k,v in req.headers.items()
1283                                   if k.lower() not in ("content-length", "content-type"))
1284                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1285                                                  headers=newheaders,
1286                                                  origin_req_host=req.get_origin_req_host(),
1287                                                  unverifiable=True))
1288
1289         # Build our opener
1290         opener = compat_urllib_request.OpenerDirector()
1291         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1292                         HTTPMethodFallback, HEADRedirectHandler,
1293                         compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1294             opener.add_handler(handler())
1295
1296         response = opener.open(HeadRequest(url))
1297         new_url = response.geturl()
1298
1299         if url == new_url:
1300             return False
1301
1302         self.report_following_redirect(new_url)
1303         self._downloader.download([new_url])
1304         return True
1305
1306     def _real_extract(self, url):
1307         if self._test_redirect(url): return
1308
1309         video_id = url.split('/')[-1]
1310         request = compat_urllib_request.Request(url)
1311         try:
1312             self.report_download_webpage(video_id)
1313             webpage = compat_urllib_request.urlopen(request).read()
1314         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1315             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1316             return
1317         except ValueError as err:
1318             # since this is the last-resort InfoExtractor, if
1319             # this error is thrown, it'll be thrown here
1320             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1321             return
1322
1323         self.report_extraction(video_id)
1324         # Start with something easy: JW Player in SWFObject
1325         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1326         if mobj is None:
1327             # Broaden the search a little bit
1328             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1329         if mobj is None:
1330             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1331             return
1332
1333         # It's possible that one of the regexes
1334         # matched, but returned an empty group:
1335         if mobj.group(1) is None:
1336             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1337             return
1338
1339         video_url = compat_urllib_parse.unquote(mobj.group(1))
1340         video_id = os.path.basename(video_url)
1341
1342         # here's a fun little line of code for you:
1343         video_extension = os.path.splitext(video_id)[1][1:]
1344         video_id = os.path.splitext(video_id)[0]
1345
1346         # it's tempting to parse this further, but you would
1347         # have to take into account all the variations like
1348         #   Video Title - Site Name
1349         #   Site Name | Video Title
1350         #   Video Title - Tagline | Site Name
1351         # and so on and so forth; it's just not practical
1352         mobj = re.search(r'<title>(.*)</title>', webpage)
1353         if mobj is None:
1354             self._downloader.trouble(u'ERROR: unable to extract title')
1355             return
1356         video_title = mobj.group(1)
1357
1358         # video uploader is domain name
1359         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1360         if mobj is None:
1361             self._downloader.trouble(u'ERROR: unable to extract title')
1362             return
1363         video_uploader = mobj.group(1)
1364
1365         return [{
1366             'id':       video_id,
1367             'url':      video_url,
1368             'uploader': video_uploader,
1369             'upload_date':  None,
1370             'title':    video_title,
1371             'ext':      video_extension,
1372         }]
1373
1374
1375 class YoutubeSearchIE(InfoExtractor):
1376     """Information Extractor for YouTube search queries."""
1377     _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1378     _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1379     _max_youtube_results = 1000
1380     IE_NAME = u'youtube:search'
1381
1382     def __init__(self, downloader=None):
1383         InfoExtractor.__init__(self, downloader)
1384
1385     def report_download_page(self, query, pagenum):
1386         """Report attempt to download search page with given number."""
1387         query = query.decode(preferredencoding())
1388         self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1389
1390     def _real_extract(self, query):
1391         mobj = re.match(self._VALID_URL, query)
1392         if mobj is None:
1393             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1394             return
1395
1396         prefix, query = query.split(':')
1397         prefix = prefix[8:]
1398         query = query.encode('utf-8')
1399         if prefix == '':
1400             self._download_n_results(query, 1)
1401             return
1402         elif prefix == 'all':
1403             self._download_n_results(query, self._max_youtube_results)
1404             return
1405         else:
1406             try:
1407                 n = int(prefix)
1408                 if n <= 0:
1409                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1410                     return
1411                 elif n > self._max_youtube_results:
1412                     self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1413                     n = self._max_youtube_results
1414                 self._download_n_results(query, n)
1415                 return
1416             except ValueError: # parsing prefix as integer fails
1417                 self._download_n_results(query, 1)
1418                 return
1419
1420     def _download_n_results(self, query, n):
1421         """Downloads a specified number of results for a query"""
1422
1423         video_ids = []
1424         pagenum = 0
1425         limit = n
1426
1427         while (50 * pagenum) < limit:
1428             self.report_download_page(query, pagenum+1)
1429             result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1430             request = compat_urllib_request.Request(result_url)
1431             try:
1432                 data = compat_urllib_request.urlopen(request).read()
1433             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1434                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1435                 return
1436             api_response = json.loads(data)['data']
1437
1438             new_ids = list(video['id'] for video in api_response['items'])
1439             video_ids += new_ids
1440
1441             limit = min(n, api_response['totalItems'])
1442             pagenum += 1
1443
1444         if len(video_ids) > n:
1445             video_ids = video_ids[:n]
1446         for id in video_ids:
1447             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1448         return
1449
1450
1451 class GoogleSearchIE(InfoExtractor):
1452     """Information Extractor for Google Video search queries."""
1453     _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1454     _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1455     _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1456     _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1457     _max_google_results = 1000
1458     IE_NAME = u'video.google:search'
1459
1460     def __init__(self, downloader=None):
1461         InfoExtractor.__init__(self, downloader)
1462
1463     def report_download_page(self, query, pagenum):
1464         """Report attempt to download playlist page with given number."""
1465         query = query.decode(preferredencoding())
1466         self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1467
1468     def _real_extract(self, query):
1469         mobj = re.match(self._VALID_URL, query)
1470         if mobj is None:
1471             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1472             return
1473
1474         prefix, query = query.split(':')
1475         prefix = prefix[8:]
1476         query = query.encode('utf-8')
1477         if prefix == '':
1478             self._download_n_results(query, 1)
1479             return
1480         elif prefix == 'all':
1481             self._download_n_results(query, self._max_google_results)
1482             return
1483         else:
1484             try:
1485                 n = int(prefix)
1486                 if n <= 0:
1487                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1488                     return
1489                 elif n > self._max_google_results:
1490                     self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1491                     n = self._max_google_results
1492                 self._download_n_results(query, n)
1493                 return
1494             except ValueError: # parsing prefix as integer fails
1495                 self._download_n_results(query, 1)
1496                 return
1497
1498     def _download_n_results(self, query, n):
1499         """Downloads a specified number of results for a query"""
1500
1501         video_ids = []
1502         pagenum = 0
1503
1504         while True:
1505             self.report_download_page(query, pagenum)
1506             result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1507             request = compat_urllib_request.Request(result_url)
1508             try:
1509                 page = compat_urllib_request.urlopen(request).read()
1510             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1511                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1512                 return
1513
1514             # Extract video identifiers
1515             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1516                 video_id = mobj.group(1)
1517                 if video_id not in video_ids:
1518                     video_ids.append(video_id)
1519                     if len(video_ids) == n:
1520                         # Specified n videos reached
1521                         for id in video_ids:
1522                             self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1523                         return
1524
1525             if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1526                 for id in video_ids:
1527                     self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1528                 return
1529
1530             pagenum = pagenum + 1
1531
1532
1533 class YahooSearchIE(InfoExtractor):
1534     """Information Extractor for Yahoo! Video search queries."""
1535
1536     _WORKING = False
1537     _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1538     _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1539     _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1540     _MORE_PAGES_INDICATOR = r'\s*Next'
1541     _max_yahoo_results = 1000
1542     IE_NAME = u'video.yahoo:search'
1543
1544     def __init__(self, downloader=None):
1545         InfoExtractor.__init__(self, downloader)
1546
1547     def report_download_page(self, query, pagenum):
1548         """Report attempt to download playlist page with given number."""
1549         query = query.decode(preferredencoding())
1550         self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1551
1552     def _real_extract(self, query):
1553         mobj = re.match(self._VALID_URL, query)
1554         if mobj is None:
1555             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1556             return
1557
1558         prefix, query = query.split(':')
1559         prefix = prefix[8:]
1560         query = query.encode('utf-8')
1561         if prefix == '':
1562             self._download_n_results(query, 1)
1563             return
1564         elif prefix == 'all':
1565             self._download_n_results(query, self._max_yahoo_results)
1566             return
1567         else:
1568             try:
1569                 n = int(prefix)
1570                 if n <= 0:
1571                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1572                     return
1573                 elif n > self._max_yahoo_results:
1574                     self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1575                     n = self._max_yahoo_results
1576                 self._download_n_results(query, n)
1577                 return
1578             except ValueError: # parsing prefix as integer fails
1579                 self._download_n_results(query, 1)
1580                 return
1581
1582     def _download_n_results(self, query, n):
1583         """Downloads a specified number of results for a query"""
1584
1585         video_ids = []
1586         already_seen = set()
1587         pagenum = 1
1588
1589         while True:
1590             self.report_download_page(query, pagenum)
1591             result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1592             request = compat_urllib_request.Request(result_url)
1593             try:
1594                 page = compat_urllib_request.urlopen(request).read()
1595             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1596                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1597                 return
1598
1599             # Extract video identifiers
1600             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1601                 video_id = mobj.group(1)
1602                 if video_id not in already_seen:
1603                     video_ids.append(video_id)
1604                     already_seen.add(video_id)
1605                     if len(video_ids) == n:
1606                         # Specified n videos reached
1607                         for id in video_ids:
1608                             self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1609                         return
1610
1611             if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1612                 for id in video_ids:
1613                     self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1614                 return
1615
1616             pagenum = pagenum + 1
1617
1618
1619 class YoutubePlaylistIE(InfoExtractor):
1620     """Information Extractor for YouTube playlists."""
1621
1622     _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
1623     _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1624     _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
1625     _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1626     IE_NAME = u'youtube:playlist'
1627
1628     def __init__(self, downloader=None):
1629         InfoExtractor.__init__(self, downloader)
1630
1631     def report_download_page(self, playlist_id, pagenum):
1632         """Report attempt to download playlist page with given number."""
1633         self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1634
1635     def _real_extract(self, url):
1636         # Extract playlist id
1637         mobj = re.match(self._VALID_URL, url)
1638         if mobj is None:
1639             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1640             return
1641
1642         # Single video case
1643         if mobj.group(3) is not None:
1644             self._downloader.download([mobj.group(3)])
1645             return
1646
1647         # Download playlist pages
1648         # prefix is 'p' as default for playlists but there are other types that need extra care
1649         playlist_prefix = mobj.group(1)
1650         if playlist_prefix == 'a':
1651             playlist_access = 'artist'
1652         else:
1653             playlist_prefix = 'p'
1654             playlist_access = 'view_play_list'
1655         playlist_id = mobj.group(2)
1656         video_ids = []
1657         pagenum = 1
1658
1659         while True:
1660             self.report_download_page(playlist_id, pagenum)
1661             url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1662             request = compat_urllib_request.Request(url)
1663             try:
1664                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1665             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1666                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1667                 return
1668
1669             # Extract video identifiers
1670             ids_in_page = []
1671             for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1672                 if mobj.group(1) not in ids_in_page:
1673                     ids_in_page.append(mobj.group(1))
1674             video_ids.extend(ids_in_page)
1675
1676             if self._MORE_PAGES_INDICATOR not in page:
1677                 break
1678             pagenum = pagenum + 1
1679
1680         total = len(video_ids)
1681
1682         playliststart = self._downloader.params.get('playliststart', 1) - 1
1683         playlistend = self._downloader.params.get('playlistend', -1)
1684         if playlistend == -1:
1685             video_ids = video_ids[playliststart:]
1686         else:
1687             video_ids = video_ids[playliststart:playlistend]
1688
1689         if len(video_ids) == total:
1690             self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1691         else:
1692             self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))
1693
1694         for id in video_ids:
1695             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1696         return
1697
1698
1699 class YoutubeChannelIE(InfoExtractor):
1700     """Information Extractor for YouTube channels."""
1701
1702     _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1703     _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1704     _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1705     IE_NAME = u'youtube:channel'
1706
1707     def report_download_page(self, channel_id, pagenum):
1708         """Report attempt to download channel page with given number."""
1709         self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1710
1711     def _real_extract(self, url):
1712         # Extract channel id
1713         mobj = re.match(self._VALID_URL, url)
1714         if mobj is None:
1715             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1716             return
1717
1718         # Download channel pages
1719         channel_id = mobj.group(1)
1720         video_ids = []
1721         pagenum = 1
1722
1723         while True:
1724             self.report_download_page(channel_id, pagenum)
1725             url = self._TEMPLATE_URL % (channel_id, pagenum)
1726             request = compat_urllib_request.Request(url)
1727             try:
1728                 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1729             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1730                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1731                 return
1732
1733             # Extract video identifiers
1734             ids_in_page = []
1735             for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1736                 if mobj.group(1) not in ids_in_page:
1737                     ids_in_page.append(mobj.group(1))
1738             video_ids.extend(ids_in_page)
1739
1740             if self._MORE_PAGES_INDICATOR not in page:
1741                 break
1742             pagenum = pagenum + 1
1743
1744         self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1745
1746         for id in video_ids:
1747             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1748         return
1749
1750
1751 class YoutubeUserIE(InfoExtractor):
1752     """Information Extractor for YouTube users."""
1753
1754     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1755     _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1756     _GDATA_PAGE_SIZE = 50
1757     _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1758     _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1759     IE_NAME = u'youtube:user'
1760
1761     def __init__(self, downloader=None):
1762         InfoExtractor.__init__(self, downloader)
1763
1764     def report_download_page(self, username, start_index):
1765         """Report attempt to download user page."""
1766         self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1767                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1768
1769     def _real_extract(self, url):
1770         # Extract username
1771         mobj = re.match(self._VALID_URL, url)
1772         if mobj is None:
1773             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1774             return
1775
1776         username = mobj.group(1)
1777
1778         # Download video ids using YouTube Data API. Result size per
1779         # query is limited (currently to 50 videos) so we need to query
1780         # page by page until there are no video ids - it means we got
1781         # all of them.
1782
1783         video_ids = []
1784         pagenum = 0
1785
1786         while True:
1787             start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1788             self.report_download_page(username, start_index)
1789
1790             request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1791
1792             try:
1793                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1794             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1795                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1796                 return
1797
1798             # Extract video identifiers
1799             ids_in_page = []
1800
1801             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1802                 if mobj.group(1) not in ids_in_page:
1803                     ids_in_page.append(mobj.group(1))
1804
1805             video_ids.extend(ids_in_page)
1806
1807             # A little optimization - if current page is not
1808             # "full", ie. does not contain PAGE_SIZE video ids then
1809             # we can assume that this page is the last one - there
1810             # are no more ids on further pages - no need to query
1811             # again.
1812
1813             if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1814                 break
1815
1816             pagenum += 1
1817
1818         all_ids_count = len(video_ids)
1819         playliststart = self._downloader.params.get('playliststart', 1) - 1
1820         playlistend = self._downloader.params.get('playlistend', -1)
1821
1822         if playlistend == -1:
1823             video_ids = video_ids[playliststart:]
1824         else:
1825             video_ids = video_ids[playliststart:playlistend]
1826
1827         self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1828                 (username, all_ids_count, len(video_ids)))
1829
1830         for video_id in video_ids:
1831             self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1832
1833
1834 class BlipTVUserIE(InfoExtractor):
1835     """Information Extractor for blip.tv users."""
1836
1837     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1838     _PAGE_SIZE = 12
1839     IE_NAME = u'blip.tv:user'
1840
1841     def __init__(self, downloader=None):
1842         InfoExtractor.__init__(self, downloader)
1843
1844     def report_download_page(self, username, pagenum):
1845         """Report attempt to download user page."""
1846         self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1847                 (self.IE_NAME, username, pagenum))
1848
1849     def _real_extract(self, url):
1850         # Extract username
1851         mobj = re.match(self._VALID_URL, url)
1852         if mobj is None:
1853             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1854             return
1855
1856         username = mobj.group(1)
1857
1858         page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1859
1860         request = compat_urllib_request.Request(url)
1861
1862         try:
1863             page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1864             mobj = re.search(r'data-users-id="([^"]+)"', page)
1865             page_base = page_base % mobj.group(1)
1866         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1867             self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1868             return
1869
1870
1871         # Download video ids using BlipTV Ajax calls. Result size per
1872         # query is limited (currently to 12 videos) so we need to query
1873         # page by page until there are no video ids - it means we got
1874         # all of them.
1875
1876         video_ids = []
1877         pagenum = 1
1878
1879         while True:
1880             self.report_download_page(username, pagenum)
1881
1882             request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )
1883
1884             try:
1885                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1886             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1887                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1888                 return
1889
1890             # Extract video identifiers
1891             ids_in_page = []
1892
1893             for mobj in re.finditer(r'href="/([^"]+)"', page):
1894                 if mobj.group(1) not in ids_in_page:
1895                     ids_in_page.append(unescapeHTML(mobj.group(1)))
1896
1897             video_ids.extend(ids_in_page)
1898
1899             # A little optimization - if current page is not
1900             # "full", ie. does not contain PAGE_SIZE video ids then
1901             # we can assume that this page is the last one - there
1902             # are no more ids on further pages - no need to query
1903             # again.
1904
1905             if len(ids_in_page) < self._PAGE_SIZE:
1906                 break
1907
1908             pagenum += 1
1909
1910         all_ids_count = len(video_ids)
1911         playliststart = self._downloader.params.get('playliststart', 1) - 1
1912         playlistend = self._downloader.params.get('playlistend', -1)
1913
1914         if playlistend == -1:
1915             video_ids = video_ids[playliststart:]
1916         else:
1917             video_ids = video_ids[playliststart:playlistend]
1918
1919         self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1920                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1921
1922         for video_id in video_ids:
1923             self._downloader.download([u'http://blip.tv/'+video_id])
1924
1925
1926 class DepositFilesIE(InfoExtractor):
1927     """Information extractor for depositfiles.com"""
1928
1929     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1930
1931     def report_download_webpage(self, file_id):
1932         """Report webpage download."""
1933         self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1934
1935     def report_extraction(self, file_id):
1936         """Report information extraction."""
1937         self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1938
1939     def _real_extract(self, url):
1940         file_id = url.split('/')[-1]
1941         # Rebuild url in english locale
1942         url = 'http://depositfiles.com/en/files/' + file_id
1943
1944         # Retrieve file webpage with 'Free download' button pressed
1945         free_download_indication = { 'gateway_result' : '1' }
1946         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1947         try:
1948             self.report_download_webpage(file_id)
1949             webpage = compat_urllib_request.urlopen(request).read()
1950         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1951             self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1952             return
1953
1954         # Search for the real file URL
1955         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1956         if (mobj is None) or (mobj.group(1) is None):
1957             # Try to figure out reason of the error.
1958             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1959             if (mobj is not None) and (mobj.group(1) is not None):
1960                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1961                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1962             else:
1963                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1964             return
1965
1966         file_url = mobj.group(1)
1967         file_extension = os.path.splitext(file_url)[1][1:]
1968
1969         # Search for file title
1970         mobj = re.search(r'<b title="(.*?)">', webpage)
1971         if mobj is None:
1972             self._downloader.trouble(u'ERROR: unable to extract title')
1973             return
1974         file_title = mobj.group(1).decode('utf-8')
1975
1976         return [{
1977             'id':       file_id.decode('utf-8'),
1978             'url':      file_url.decode('utf-8'),
1979             'uploader': None,
1980             'upload_date':  None,
1981             'title':    file_title,
1982             'ext':      file_extension.decode('utf-8'),
1983         }]
1984
1985
1986 class FacebookIE(InfoExtractor):
1987     """Information Extractor for Facebook"""
1988
1989     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1990     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1991     _NETRC_MACHINE = 'facebook'
1992     IE_NAME = u'facebook'
1993
1994     def report_login(self):
1995         """Report attempt to log in."""
1996         self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
1997
1998     def _real_initialize(self):
1999         if self._downloader is None:
2000             return
2001
2002         useremail = None
2003         password = None
2004         downloader_params = self._downloader.params
2005
2006         # Attempt to use provided username and password or .netrc data
2007         if downloader_params.get('username', None) is not None:
2008             useremail = downloader_params['username']
2009             password = downloader_params['password']
2010         elif downloader_params.get('usenetrc', False):
2011             try:
2012                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2013                 if info is not None:
2014                     useremail = info[0]
2015                     password = info[2]
2016                 else:
2017                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2018             except (IOError, netrc.NetrcParseError) as err:
2019                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2020                 return
2021
2022         if useremail is None:
2023             return
2024
2025         # Log in
2026         login_form = {
2027             'email': useremail,
2028             'pass': password,
2029             'login': 'Log+In'
2030             }
2031         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2032         try:
2033             self.report_login()
2034             login_results = compat_urllib_request.urlopen(request).read()
2035             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2036                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2037                 return
2038         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2039             self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2040             return
2041
2042     def _real_extract(self, url):
2043         mobj = re.match(self._VALID_URL, url)
2044         if mobj is None:
2045             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2046             return
2047         video_id = mobj.group('ID')
2048
2049         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2050         webpage = self._download_webpage(url, video_id)
2051
2052         BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
2053         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2054         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2055         if not m:
2056             raise ExtractorError(u'Cannot parse data')
2057         data = dict(json.loads(m.group(1)))
2058         params_raw = compat_urllib_parse.unquote(data['params'])
2059         params = json.loads(params_raw)
2060         video_url = params['hd_src']
2061         video_duration = int(params['video_duration'])
2062
2063         m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2064         if not m:
2065             raise ExtractorError(u'Cannot find title in webpage')
2066         video_title = unescapeHTML(m.group(1))
2067
2068         info = {
2069             'id': video_id,
2070             'title': video_title,
2071             'url': video_url,
2072             'ext': 'mp4',
2073             'duration': video_duration,
2074             'thumbnail': params['thumbnail_src'],
2075         }
2076         return [info]
2077
2078
2079 class BlipTVIE(InfoExtractor):
2080     """Information extractor for blip.tv"""
2081
2082     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2083     _URL_EXT = r'^.*\.([a-z0-9]+)$'
2084     IE_NAME = u'blip.tv'
2085
2086     def report_extraction(self, file_id):
2087         """Report information extraction."""
2088         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2089
2090     def report_direct_download(self, title):
2091         """Report information extraction."""
2092         self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2093
2094     def _real_extract(self, url):
2095         mobj = re.match(self._VALID_URL, url)
2096         if mobj is None:
2097             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2098             return
2099
2100         if '?' in url:
2101             cchar = '&'
2102         else:
2103             cchar = '?'
2104         json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2105         request = compat_urllib_request.Request(json_url)
2106         request.add_header('User-Agent', 'iTunes/10.6.1')
2107         self.report_extraction(mobj.group(1))
2108         info = None
2109         try:
2110             urlh = compat_urllib_request.urlopen(request)
2111             if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2112                 basename = url.split('/')[-1]
2113                 title,ext = os.path.splitext(basename)
2114                 title = title.decode('UTF-8')
2115                 ext = ext.replace('.', '')
2116                 self.report_direct_download(title)
2117                 info = {
2118                     'id': title,
2119                     'url': url,
2120                     'uploader': None,
2121                     'upload_date': None,
2122                     'title': title,
2123                     'ext': ext,
2124                     'urlhandle': urlh
2125                 }
2126         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2127             raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2128         if info is None: # Regular URL
2129             try:
2130                 json_code_bytes = urlh.read()
2131                 json_code = json_code_bytes.decode('utf-8')
2132             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2133                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2134                 return
2135
2136             try:
2137                 json_data = json.loads(json_code)
2138                 if 'Post' in json_data:
2139                     data = json_data['Post']
2140                 else:
2141                     data = json_data
2142
2143                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2144                 video_url = data['media']['url']
2145                 umobj = re.match(self._URL_EXT, video_url)
2146                 if umobj is None:
2147                     raise ValueError('Can not determine filename extension')
2148                 ext = umobj.group(1)
2149
2150                 info = {
2151                     'id': data['item_id'],
2152                     'url': video_url,
2153                     'uploader': data['display_name'],
2154                     'upload_date': upload_date,
2155                     'title': data['title'],
2156                     'ext': ext,
2157                     'format': data['media']['mimeType'],
2158                     'thumbnail': data['thumbnailUrl'],
2159                     'description': data['description'],
2160                     'player_url': data['embedUrl'],
2161                     'user_agent': 'iTunes/10.6.1',
2162                 }
2163             except (ValueError,KeyError) as err:
2164                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2165                 return
2166
2167         return [info]
2168
2169
2170 class MyVideoIE(InfoExtractor):
2171     """Information Extractor for myvideo.de."""
2172
2173     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2174     IE_NAME = u'myvideo'
2175
2176     def __init__(self, downloader=None):
2177         InfoExtractor.__init__(self, downloader)
2178
2179     def report_extraction(self, video_id):
2180         """Report information extraction."""
2181         self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2182
2183     def _real_extract(self,url):
2184         mobj = re.match(self._VALID_URL, url)
2185         if mobj is None:
2186             self._download.trouble(u'ERROR: invalid URL: %s' % url)
2187             return
2188
2189         video_id = mobj.group(1)
2190
2191         # Get video webpage
2192         webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2193         webpage = self._download_webpage(webpage_url, video_id)
2194
2195         self.report_extraction(video_id)
2196         mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2197                  webpage)
2198         if mobj is None:
2199             self._downloader.trouble(u'ERROR: unable to extract media URL')
2200             return
2201         video_url = mobj.group(1) + ('/%s.flv' % video_id)
2202
2203         mobj = re.search('<title>([^<]+)</title>', webpage)
2204         if mobj is None:
2205             self._downloader.trouble(u'ERROR: unable to extract title')
2206             return
2207
2208         video_title = mobj.group(1)
2209
2210         return [{
2211             'id':       video_id,
2212             'url':      video_url,
2213             'uploader': None,
2214             'upload_date':  None,
2215             'title':    video_title,
2216             'ext':      u'flv',
2217         }]
2218
2219 class ComedyCentralIE(InfoExtractor):
2220     """Information extractor for The Daily Show and Colbert Report """
2221
2222     # urls can be abbreviations like :thedailyshow or :colbert
2223     # urls for episodes like:
2224     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2225     #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2226     #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2227     _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2228                       |(https?://)?(www\.)?
2229                           (?P<showname>thedailyshow|colbertnation)\.com/
2230                          (full-episodes/(?P<episode>.*)|
2231                           (?P<clip>
2232                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2233                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2234                      $"""
2235
2236     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2237
2238     _video_extensions = {
2239         '3500': 'mp4',
2240         '2200': 'mp4',
2241         '1700': 'mp4',
2242         '1200': 'mp4',
2243         '750': 'mp4',
2244         '400': 'mp4',
2245     }
2246     _video_dimensions = {
2247         '3500': '1280x720',
2248         '2200': '960x540',
2249         '1700': '768x432',
2250         '1200': '640x360',
2251         '750': '512x288',
2252         '400': '384x216',
2253     }
2254
2255     def suitable(self, url):
2256         """Receives a URL and returns True if suitable for this IE."""
2257         return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2258
2259     def report_extraction(self, episode_id):
2260         self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2261
2262     def report_config_download(self, episode_id, media_id):
2263         self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
2264
2265     def report_index_download(self, episode_id):
2266         self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2267
2268     def _print_formats(self, formats):
2269         print('Available formats:')
2270         for x in formats:
2271             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2272
2273
2274     def _real_extract(self, url):
2275         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2276         if mobj is None:
2277             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2278             return
2279
2280         if mobj.group('shortname'):
2281             if mobj.group('shortname') in ('tds', 'thedailyshow'):
2282                 url = u'http://www.thedailyshow.com/full-episodes/'
2283             else:
2284                 url = u'http://www.colbertnation.com/full-episodes/'
2285             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2286             assert mobj is not None
2287
2288         if mobj.group('clip'):
2289             if mobj.group('showname') == 'thedailyshow':
2290                 epTitle = mobj.group('tdstitle')
2291             else:
2292                 epTitle = mobj.group('cntitle')
2293             dlNewest = False
2294         else:
2295             dlNewest = not mobj.group('episode')
2296             if dlNewest:
2297                 epTitle = mobj.group('showname')
2298             else:
2299                 epTitle = mobj.group('episode')
2300
2301         req = compat_urllib_request.Request(url)
2302         self.report_extraction(epTitle)
2303         try:
2304             htmlHandle = compat_urllib_request.urlopen(req)
2305             html = htmlHandle.read()
2306             webpage = html.decode('utf-8')
2307         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2308             self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2309             return
2310         if dlNewest:
2311             url = htmlHandle.geturl()
2312             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2313             if mobj is None:
2314                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2315                 return
2316             if mobj.group('episode') == '':
2317                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2318                 return
2319             epTitle = mobj.group('episode')
2320
2321         mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2322
2323         if len(mMovieParams) == 0:
2324             # The Colbert Report embeds the information in a without
2325             # a URL prefix; so extract the alternate reference
2326             # and then add the URL prefix manually.
2327
2328             altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2329             if len(altMovieParams) == 0:
2330                 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2331                 return
2332             else:
2333                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2334
2335         uri = mMovieParams[0][1]
2336         indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2337         self.report_index_download(epTitle)
2338         try:
2339             indexXml = compat_urllib_request.urlopen(indexUrl).read()
2340         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2341             self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
2342             return
2343
2344         results = []
2345
2346         idoc = xml.etree.ElementTree.fromstring(indexXml)
2347         itemEls = idoc.findall('.//item')
2348         for partNum,itemEl in enumerate(itemEls):
2349             mediaId = itemEl.findall('./guid')[0].text
2350             shortMediaId = mediaId.split(':')[-1]
2351             showId = mediaId.split(':')[-2].replace('.com', '')
2352             officialTitle = itemEl.findall('./title')[0].text
2353             officialDate = itemEl.findall('./pubDate')[0].text
2354
2355             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2356                         compat_urllib_parse.urlencode({'uri': mediaId}))
2357             configReq = compat_urllib_request.Request(configUrl)
2358             self.report_config_download(epTitle, shortMediaId)
2359             try:
2360                 configXml = compat_urllib_request.urlopen(configReq).read()
2361             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2362                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2363                 return
2364
2365             cdoc = xml.etree.ElementTree.fromstring(configXml)
2366             turls = []
2367             for rendition in cdoc.findall('.//rendition'):
2368                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2369                 turls.append(finfo)
2370
2371             if len(turls) == 0:
2372                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2373                 continue
2374
2375             if self._downloader.params.get('listformats', None):
2376                 self._print_formats([i[0] for i in turls])
2377                 return
2378
2379             # For now, just pick the highest bitrate
2380             format,rtmp_video_url = turls[-1]
2381
2382             # Get the format arg from the arg stream
2383             req_format = self._downloader.params.get('format', None)
2384
2385             # Select format if we can find one
2386             for f,v in turls:
2387                 if f == req_format:
2388                     format, rtmp_video_url = f, v
2389                     break
2390
2391             m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2392             if not m:
2393                 raise ExtractorError(u'Cannot transform RTMP url')
2394             base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2395             video_url = base + m.group('finalid')
2396
2397             effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2398             info = {
2399                 'id': shortMediaId,
2400                 'url': video_url,
2401                 'uploader': showId,
2402                 'upload_date': officialDate,
2403                 'title': effTitle,
2404                 'ext': 'mp4',
2405                 'format': format,
2406                 'thumbnail': None,
2407                 'description': officialTitle,
2408             }
2409             results.append(info)
2410
2411         return results
2412
2413
2414 class EscapistIE(InfoExtractor):
2415     """Information extractor for The Escapist """
2416
2417     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2418     IE_NAME = u'escapist'
2419
2420     def report_extraction(self, showName):
2421         self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2422
2423     def report_config_download(self, showName):
2424         self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2425
2426     def _real_extract(self, url):
2427         mobj = re.match(self._VALID_URL, url)
2428         if mobj is None:
2429             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2430             return
2431         showName = mobj.group('showname')
2432         videoId = mobj.group('episode')
2433
2434         self.report_extraction(showName)
2435         try:
2436             webPage = compat_urllib_request.urlopen(url)
2437             webPageBytes = webPage.read()
2438             m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2439             webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2440         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2441             self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
2442             return
2443
2444         descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2445         description = unescapeHTML(descMatch.group(1))
2446         imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2447         imgUrl = unescapeHTML(imgMatch.group(1))
2448         playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2449         playerUrl = unescapeHTML(playerUrlMatch.group(1))
2450         configUrlMatch = re.search('config=(.*)$', playerUrl)
2451         configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2452
2453         self.report_config_download(showName)
2454         try:
2455             configJSON = compat_urllib_request.urlopen(configUrl)
2456             m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2457             configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2458         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2459             self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2460             return
2461
2462         # Technically, it's JavaScript, not JSON
2463         configJSON = configJSON.replace("'", '"')
2464
2465         try:
2466             config = json.loads(configJSON)
2467         except (ValueError,) as err:
2468             self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
2469             return
2470
2471         playlist = config['playlist']
2472         videoUrl = playlist[1]['url']
2473
2474         info = {
2475             'id': videoId,
2476             'url': videoUrl,
2477             'uploader': showName,
2478             'upload_date': None,
2479             'title': showName,
2480             'ext': 'flv',
2481             'thumbnail': imgUrl,
2482             'description': description,
2483             'player_url': playerUrl,
2484         }
2485
2486         return [info]
2487
2488 class CollegeHumorIE(InfoExtractor):
2489     """Information extractor for collegehumor.com"""
2490
2491     _WORKING = False
2492     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2493     IE_NAME = u'collegehumor'
2494
2495     def report_manifest(self, video_id):
2496         """Report information extraction."""
2497         self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2498
2499     def report_extraction(self, video_id):
2500         """Report information extraction."""
2501         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2502
2503     def _real_extract(self, url):
2504         mobj = re.match(self._VALID_URL, url)
2505         if mobj is None:
2506             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2507             return
2508         video_id = mobj.group('videoid')
2509
2510         info = {
2511             'id': video_id,
2512             'uploader': None,
2513             'upload_date': None,
2514         }
2515
2516         self.report_extraction(video_id)
2517         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2518         try:
2519             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2520         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2521             self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2522             return
2523
2524         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2525         try:
2526             videoNode = mdoc.findall('./video')[0]
2527             info['description'] = videoNode.findall('./description')[0].text
2528             info['title'] = videoNode.findall('./caption')[0].text
2529             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2530             manifest_url = videoNode.findall('./file')[0].text
2531         except IndexError:
2532             self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2533             return
2534
2535         manifest_url += '?hdcore=2.10.3'
2536         self.report_manifest(video_id)
2537         try:
2538             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2539         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2540             self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2541             return
2542
2543         adoc = xml.etree.ElementTree.fromstring(manifestXml)
2544         try:
2545             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2546             node_id = media_node.attrib['url']
2547             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2548         except IndexError as err:
2549             self._downloader.trouble(u'\nERROR: Invalid manifest file')
2550             return
2551
2552         url_pr = compat_urllib_parse_urlparse(manifest_url)
2553         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2554
2555         info['url'] = url
2556         info['ext'] = 'f4f'
2557         return [info]
2558
2559
2560 class XVideosIE(InfoExtractor):
2561     """Information extractor for xvideos.com"""
2562
2563     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2564     IE_NAME = u'xvideos'
2565
2566     def report_extraction(self, video_id):
2567         """Report information extraction."""
2568         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2569
2570     def _real_extract(self, url):
2571         mobj = re.match(self._VALID_URL, url)
2572         if mobj is None:
2573             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2574             return
2575         video_id = mobj.group(1)
2576
2577         webpage = self._download_webpage(url, video_id)
2578
2579         self.report_extraction(video_id)
2580
2581
2582         # Extract video URL
2583         mobj = re.search(r'flv_url=(.+?)&', webpage)
2584         if mobj is None:
2585             self._downloader.trouble(u'ERROR: unable to extract video url')
2586             return
2587         video_url = compat_urllib_parse.unquote(mobj.group(1))
2588
2589
2590         # Extract title
2591         mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2592         if mobj is None:
2593             self._downloader.trouble(u'ERROR: unable to extract video title')
2594             return
2595         video_title = mobj.group(1)
2596
2597
2598         # Extract video thumbnail
2599         mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2600         if mobj is None:
2601             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2602             return
2603         video_thumbnail = mobj.group(0)
2604
2605         info = {
2606             'id': video_id,
2607             'url': video_url,
2608             'uploader': None,
2609             'upload_date': None,
2610             'title': video_title,
2611             'ext': 'flv',
2612             'thumbnail': video_thumbnail,
2613             'description': None,
2614         }
2615
2616         return [info]
2617
2618
2619 class SoundcloudIE(InfoExtractor):
2620     """Information extractor for soundcloud.com
2621        To access the media, the uid of the song and a stream token
2622        must be extracted from the page source and the script must make
2623        a request to media.soundcloud.com/crossdomain.xml. Then
2624        the media can be grabbed by requesting from an url composed
2625        of the stream token and uid
2626      """
2627
2628     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2629     IE_NAME = u'soundcloud'
2630
2631     def __init__(self, downloader=None):
2632         InfoExtractor.__init__(self, downloader)
2633
2634     def report_resolve(self, video_id):
2635         """Report information extraction."""
2636         self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2637
2638     def report_extraction(self, video_id):
2639         """Report information extraction."""
2640         self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2641
2642     def _real_extract(self, url):
2643         mobj = re.match(self._VALID_URL, url)
2644         if mobj is None:
2645             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2646             return
2647
2648         # extract uploader (which is in the url)
2649         uploader = mobj.group(1)
2650         # extract simple title (uploader + slug of song title)
2651         slug_title =  mobj.group(2)
2652         simple_title = uploader + u'-' + slug_title
2653
2654         self.report_resolve('%s/%s' % (uploader, slug_title))
2655
2656         url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2657         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2658         request = compat_urllib_request.Request(resolv_url)
2659         try:
2660             info_json_bytes = compat_urllib_request.urlopen(request).read()
2661             info_json = info_json_bytes.decode('utf-8')
2662         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2663             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2664             return
2665
2666         info = json.loads(info_json)
2667         video_id = info['id']
2668         self.report_extraction('%s/%s' % (uploader, slug_title))
2669
2670         streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2671         request = compat_urllib_request.Request(streams_url)
2672         try:
2673             stream_json_bytes = compat_urllib_request.urlopen(request).read()
2674             stream_json = stream_json_bytes.decode('utf-8')
2675         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2676             self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2677             return
2678
2679         streams = json.loads(stream_json)
2680         mediaURL = streams['http_mp3_128_url']
2681
2682         return [{
2683             'id':       info['id'],
2684             'url':      mediaURL,
2685             'uploader': info['user']['username'],
2686             'upload_date':  info['created_at'],
2687             'title':    info['title'],
2688             'ext':      u'mp3',
2689             'description': info['description'],
2690         }]
2691
2692
2693 class InfoQIE(InfoExtractor):
2694     """Information extractor for infoq.com"""
2695     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2696
2697     def report_extraction(self, video_id):
2698         """Report information extraction."""
2699         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2700
2701     def _real_extract(self, url):
2702         mobj = re.match(self._VALID_URL, url)
2703         if mobj is None:
2704             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2705             return
2706
2707         webpage = self._download_webpage(url, video_id=url)
2708         self.report_extraction(url)
2709
2710         # Extract video URL
2711         mobj = re.search(r"jsclassref='([^']*)'", webpage)
2712         if mobj is None:
2713             self._downloader.trouble(u'ERROR: unable to extract video url')
2714             return
2715         real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2716         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2717
2718         # Extract title
2719         mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2720         if mobj is None:
2721             self._downloader.trouble(u'ERROR: unable to extract video title')
2722             return
2723         video_title = mobj.group(1)
2724
2725         # Extract description
2726         video_description = u'No description available.'
2727         mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2728         if mobj is not None:
2729             video_description = mobj.group(1)
2730
2731         video_filename = video_url.split('/')[-1]
2732         video_id, extension = video_filename.split('.')
2733
2734         info = {
2735             'id': video_id,
2736             'url': video_url,
2737             'uploader': None,
2738             'upload_date': None,
2739             'title': video_title,
2740             'ext': extension, # Extension is always(?) mp4, but seems to be flv
2741             'thumbnail': None,
2742             'description': video_description,
2743         }
2744
2745         return [info]
2746
2747 class MixcloudIE(InfoExtractor):
2748     """Information extractor for www.mixcloud.com"""
2749
2750     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2751     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2752     IE_NAME = u'mixcloud'
2753
2754     def __init__(self, downloader=None):
2755         InfoExtractor.__init__(self, downloader)
2756
2757     def report_download_json(self, file_id):
2758         """Report JSON download."""
2759         self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2760
2761     def report_extraction(self, file_id):
2762         """Report information extraction."""
2763         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2764
2765     def get_urls(self, jsonData, fmt, bitrate='best'):
2766         """Get urls from 'audio_formats' section in json"""
2767         file_url = None
2768         try:
2769             bitrate_list = jsonData[fmt]
2770             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2771                 bitrate = max(bitrate_list) # select highest
2772
2773             url_list = jsonData[fmt][bitrate]
2774         except TypeError: # we have no bitrate info.
2775             url_list = jsonData[fmt]
2776         return url_list
2777
2778     def check_urls(self, url_list):
2779         """Returns 1st active url from list"""
2780         for url in url_list:
2781             try:
2782                 compat_urllib_request.urlopen(url)
2783                 return url
2784             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2785                 url = None
2786
2787         return None
2788
2789     def _print_formats(self, formats):
2790         print('Available formats:')
2791         for fmt in formats.keys():
2792             for b in formats[fmt]:
2793                 try:
2794                     ext = formats[fmt][b][0]
2795                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2796                 except TypeError: # we have no bitrate info
2797                     ext = formats[fmt][0]
2798                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2799                     break
2800
2801     def _real_extract(self, url):
2802         mobj = re.match(self._VALID_URL, url)
2803         if mobj is None:
2804             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2805             return
2806         # extract uploader & filename from url
2807         uploader = mobj.group(1).decode('utf-8')
2808         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2809
2810         # construct API request
2811         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2812         # retrieve .json file with links to files
2813         request = compat_urllib_request.Request(file_url)
2814         try:
2815             self.report_download_json(file_url)
2816             jsonData = compat_urllib_request.urlopen(request).read()
2817         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2818             self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2819             return
2820
2821         # parse JSON
2822         json_data = json.loads(jsonData)
2823         player_url = json_data['player_swf_url']
2824         formats = dict(json_data['audio_formats'])
2825
2826         req_format = self._downloader.params.get('format', None)
2827         bitrate = None
2828
2829         if self._downloader.params.get('listformats', None):
2830             self._print_formats(formats)
2831             return
2832
2833         if req_format is None or req_format == 'best':
2834             for format_param in formats.keys():
2835                 url_list = self.get_urls(formats, format_param)
2836                 # check urls
2837                 file_url = self.check_urls(url_list)
2838                 if file_url is not None:
2839                     break # got it!
2840         else:
2841             if req_format not in formats:
2842                 self._downloader.trouble(u'ERROR: format is not available')
2843                 return
2844
2845             url_list = self.get_urls(formats, req_format)
2846             file_url = self.check_urls(url_list)
2847             format_param = req_format
2848
2849         return [{
2850             'id': file_id.decode('utf-8'),
2851             'url': file_url.decode('utf-8'),
2852             'uploader': uploader.decode('utf-8'),
2853             'upload_date': None,
2854             'title': json_data['name'],
2855             'ext': file_url.split('.')[-1].decode('utf-8'),
2856             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2857             'thumbnail': json_data['thumbnail_url'],
2858             'description': json_data['description'],
2859             'player_url': player_url.decode('utf-8'),
2860         }]
2861
2862 class StanfordOpenClassroomIE(InfoExtractor):
2863     """Information extractor for Stanford's Open ClassRoom"""
2864
2865     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2866     IE_NAME = u'stanfordoc'
2867
2868     def report_download_webpage(self, objid):
2869         """Report information extraction."""
2870         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2871
2872     def report_extraction(self, video_id):
2873         """Report information extraction."""
2874         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2875
2876     def _real_extract(self, url):
2877         mobj = re.match(self._VALID_URL, url)
2878         if mobj is None:
2879             raise ExtractorError(u'Invalid URL: %s' % url)
2880
2881         if mobj.group('course') and mobj.group('video'): # A specific video
2882             course = mobj.group('course')
2883             video = mobj.group('video')
2884             info = {
2885                 'id': course + '_' + video,
2886                 'uploader': None,
2887                 'upload_date': None,
2888             }
2889
2890             self.report_extraction(info['id'])
2891             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2892             xmlUrl = baseUrl + video + '.xml'
2893             try:
2894                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2895             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2896                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2897                 return
2898             mdoc = xml.etree.ElementTree.fromstring(metaXml)
2899             try:
2900                 info['title'] = mdoc.findall('./title')[0].text
2901                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2902             except IndexError:
2903                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2904                 return
2905             info['ext'] = info['url'].rpartition('.')[2]
2906             return [info]
2907         elif mobj.group('course'): # A course page
2908             course = mobj.group('course')
2909             info = {
2910                 'id': course,
2911                 'type': 'playlist',
2912                 'uploader': None,
2913                 'upload_date': None,
2914             }
2915
2916             coursepage = self._download_webpage(url, info['id'],
2917                                         note='Downloading course info page',
2918                                         errnote='Unable to download course info page')
2919
2920             m = re.search('<h1>([^<]+)</h1>', coursepage)
2921             if m:
2922                 info['title'] = unescapeHTML(m.group(1))
2923             else:
2924                 info['title'] = info['id']
2925
2926             m = re.search('<description>([^<]+)</description>', coursepage)
2927             if m:
2928                 info['description'] = unescapeHTML(m.group(1))
2929
2930             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2931             info['list'] = [
2932                 {
2933                     'type': 'reference',
2934                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2935                 }
2936                     for vpage in links]
2937             results = []
2938             for entry in info['list']:
2939                 assert entry['type'] == 'reference'
2940                 results += self.extract(entry['url'])
2941             return results
2942         else: # Root page
2943             info = {
2944                 'id': 'Stanford OpenClassroom',
2945                 'type': 'playlist',
2946                 'uploader': None,
2947                 'upload_date': None,
2948             }
2949
2950             self.report_download_webpage(info['id'])
2951             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2952             try:
2953                 rootpage = compat_urllib_request.urlopen(rootURL).read()
2954             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2955                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
2956                 return
2957
2958             info['title'] = info['id']
2959
2960             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2961             info['list'] = [
2962                 {
2963                     'type': 'reference',
2964                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2965                 }
2966                     for cpage in links]
2967
2968             results = []
2969             for entry in info['list']:
2970                 assert entry['type'] == 'reference'
2971                 results += self.extract(entry['url'])
2972             return results
2973
2974 class MTVIE(InfoExtractor):
2975     """Information extractor for MTV.com"""
2976
2977     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2978     IE_NAME = u'mtv'
2979
2980     def report_extraction(self, video_id):
2981         """Report information extraction."""
2982         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2983
2984     def _real_extract(self, url):
2985         mobj = re.match(self._VALID_URL, url)
2986         if mobj is None:
2987             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2988             return
2989         if not mobj.group('proto'):
2990             url = 'http://' + url
2991         video_id = mobj.group('videoid')
2992
2993         webpage = self._download_webpage(url, video_id)
2994
2995         mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2996         if mobj is None:
2997             self._downloader.trouble(u'ERROR: unable to extract song name')
2998             return
2999         song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3000         mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3001         if mobj is None:
3002             self._downloader.trouble(u'ERROR: unable to extract performer')
3003             return
3004         performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3005         video_title = performer + ' - ' + song_name
3006
3007         mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3008         if mobj is None:
3009             self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3010             return
3011         mtvn_uri = mobj.group(1)
3012
3013         mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3014         if mobj is None:
3015             self._downloader.trouble(u'ERROR: unable to extract content id')
3016             return
3017         content_id = mobj.group(1)
3018
3019         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3020         self.report_extraction(video_id)
3021         request = compat_urllib_request.Request(videogen_url)
3022         try:
3023             metadataXml = compat_urllib_request.urlopen(request).read()
3024         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3025             self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3026             return
3027
3028         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3029         renditions = mdoc.findall('.//rendition')
3030
3031         # For now, always pick the highest quality.
3032         rendition = renditions[-1]
3033
3034         try:
3035             _,_,ext = rendition.attrib['type'].partition('/')
3036             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3037             video_url = rendition.find('./src').text
3038         except KeyError:
3039             self._downloader.trouble('Invalid rendition field.')
3040             return
3041
3042         info = {
3043             'id': video_id,
3044             'url': video_url,
3045             'uploader': performer,
3046             'upload_date': None,
3047             'title': video_title,
3048             'ext': ext,
3049             'format': format,
3050         }
3051
3052         return [info]
3053
3054
3055 class YoukuIE(InfoExtractor):
3056     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3057
3058     def report_download_webpage(self, file_id):
3059         """Report webpage download."""
3060         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3061
3062     def report_extraction(self, file_id):
3063         """Report information extraction."""
3064         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3065
3066     def _gen_sid(self):
3067         nowTime = int(time.time() * 1000)
3068         random1 = random.randint(1000,1998)
3069         random2 = random.randint(1000,9999)
3070
3071         return "%d%d%d" %(nowTime,random1,random2)
3072
3073     def _get_file_ID_mix_string(self, seed):
3074         mixed = []
3075         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3076         seed = float(seed)
3077         for i in range(len(source)):
3078             seed  =  (seed * 211 + 30031 ) % 65536
3079             index  =  math.floor(seed / 65536 * len(source) )
3080             mixed.append(source[int(index)])
3081             source.remove(source[int(index)])
3082         #return ''.join(mixed)
3083         return mixed
3084
3085     def _get_file_id(self, fileId, seed):
3086         mixed = self._get_file_ID_mix_string(seed)
3087         ids = fileId.split('*')
3088         realId = []
3089         for ch in ids:
3090             if ch:
3091                 realId.append(mixed[int(ch)])
3092         return ''.join(realId)
3093
3094     def _real_extract(self, url):
3095         mobj = re.match(self._VALID_URL, url)
3096         if mobj is None:
3097             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3098             return
3099         video_id = mobj.group('ID')
3100
3101         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3102
3103         request = compat_urllib_request.Request(info_url, None, std_headers)
3104         try:
3105             self.report_download_webpage(video_id)
3106             jsondata = compat_urllib_request.urlopen(request).read()
3107         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3108             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3109             return
3110
3111         self.report_extraction(video_id)
3112         try:
3113             jsonstr = jsondata.decode('utf-8')
3114             config = json.loads(jsonstr)
3115
3116             video_title =  config['data'][0]['title']
3117             seed = config['data'][0]['seed']
3118
3119             format = self._downloader.params.get('format', None)
3120             supported_format = list(config['data'][0]['streamfileids'].keys())
3121
3122             if format is None or format == 'best':
3123                 if 'hd2' in supported_format:
3124                     format = 'hd2'
3125                 else:
3126                     format = 'flv'
3127                 ext = u'flv'
3128             elif format == 'worst':
3129                 format = 'mp4'
3130                 ext = u'mp4'
3131             else:
3132                 format = 'flv'
3133                 ext = u'flv'
3134
3135
3136             fileid = config['data'][0]['streamfileids'][format]
3137             keys = [s['k'] for s in config['data'][0]['segs'][format]]
3138         except (UnicodeDecodeError, ValueError, KeyError):
3139             self._downloader.trouble(u'ERROR: unable to extract info section')
3140             return
3141
3142         files_info=[]
3143         sid = self._gen_sid()
3144         fileid = self._get_file_id(fileid, seed)
3145
3146         #column 8,9 of fileid represent the segment number
3147         #fileid[7:9] should be changed
3148         for index, key in enumerate(keys):
3149
3150             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3151             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3152
3153             info = {
3154                 'id': '%s_part%02d' % (video_id, index),
3155                 'url': download_url,
3156                 'uploader': None,
3157                 'upload_date': None,
3158                 'title': video_title,
3159                 'ext': ext,
3160             }
3161             files_info.append(info)
3162
3163         return files_info
3164
3165
3166 class XNXXIE(InfoExtractor):
3167     """Information extractor for xnxx.com"""
3168
3169     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3170     IE_NAME = u'xnxx'
3171     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3172     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3173     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3174
3175     def report_webpage(self, video_id):
3176         """Report information extraction"""
3177         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3178
3179     def report_extraction(self, video_id):
3180         """Report information extraction"""
3181         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3182
3183     def _real_extract(self, url):
3184         mobj = re.match(self._VALID_URL, url)
3185         if mobj is None:
3186             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3187             return
3188         video_id = mobj.group(1)
3189
3190         self.report_webpage(video_id)
3191
3192         # Get webpage content
3193         try:
3194             webpage_bytes = compat_urllib_request.urlopen(url).read()
3195             webpage = webpage_bytes.decode('utf-8')
3196         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3197             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3198             return
3199
3200         result = re.search(self.VIDEO_URL_RE, webpage)
3201         if result is None:
3202             self._downloader.trouble(u'ERROR: unable to extract video url')
3203             return
3204         video_url = compat_urllib_parse.unquote(result.group(1))
3205
3206         result = re.search(self.VIDEO_TITLE_RE, webpage)
3207         if result is None:
3208             self._downloader.trouble(u'ERROR: unable to extract video title')
3209             return
3210         video_title = result.group(1)
3211
3212         result = re.search(self.VIDEO_THUMB_RE, webpage)
3213         if result is None:
3214             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3215             return
3216         video_thumbnail = result.group(1)
3217
3218         return [{
3219             'id': video_id,
3220             'url': video_url,
3221             'uploader': None,
3222             'upload_date': None,
3223             'title': video_title,
3224             'ext': 'flv',
3225             'thumbnail': video_thumbnail,
3226             'description': None,
3227         }]
3228
3229
3230 class GooglePlusIE(InfoExtractor):
3231     """Information extractor for plus.google.com."""
3232
3233     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3234     IE_NAME = u'plus.google'
3235
3236     def __init__(self, downloader=None):
3237         InfoExtractor.__init__(self, downloader)
3238
3239     def report_extract_entry(self, url):
3240         """Report downloading extry"""
3241         self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3242
3243     def report_date(self, upload_date):
3244         """Report downloading extry"""
3245         self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3246
3247     def report_uploader(self, uploader):
3248         """Report downloading extry"""
3249         self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3250
3251     def report_title(self, video_title):
3252         """Report downloading extry"""
3253         self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3254
3255     def report_extract_vid_page(self, video_page):
3256         """Report information extraction."""
3257         self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3258
3259     def _real_extract(self, url):
3260         # Extract id from URL
3261         mobj = re.match(self._VALID_URL, url)
3262         if mobj is None:
3263             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3264             return
3265
3266         post_url = mobj.group(0)
3267         video_id = mobj.group(1)
3268
3269         video_extension = 'flv'
3270
3271         # Step 1, Retrieve post webpage to extract further information
3272         self.report_extract_entry(post_url)
3273         request = compat_urllib_request.Request(post_url)
3274         try:
3275             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3276         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3277             self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3278             return
3279
3280         # Extract update date
3281         upload_date = None
3282         pattern = 'title="Timestamp">(.*?)</a>'
3283         mobj = re.search(pattern, webpage)
3284         if mobj:
3285             upload_date = mobj.group(1)
3286             # Convert timestring to a format suitable for filename
3287             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3288             upload_date = upload_date.strftime('%Y%m%d')
3289         self.report_date(upload_date)
3290
3291         # Extract uploader
3292         uploader = None
3293         pattern = r'rel\="author".*?>(.*?)</a>'
3294         mobj = re.search(pattern, webpage)
3295         if mobj:
3296             uploader = mobj.group(1)
3297         self.report_uploader(uploader)
3298
3299         # Extract title
3300         # Get the first line for title
3301         video_title = u'NA'
3302         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3303         mobj = re.search(pattern, webpage)
3304         if mobj:
3305             video_title = mobj.group(1)
3306         self.report_title(video_title)
3307
3308         # Step 2, Stimulate clicking the image box to launch video
3309         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3310         mobj = re.search(pattern, webpage)
3311         if mobj is None:
3312             self._downloader.trouble(u'ERROR: unable to extract video page URL')
3313
3314         video_page = mobj.group(1)
3315         request = compat_urllib_request.Request(video_page)
3316         try:
3317             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3318         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3319             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3320             return
3321         self.report_extract_vid_page(video_page)
3322
3323
3324         # Extract video links on video page
3325         """Extract video links of all sizes"""
3326         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3327         mobj = re.findall(pattern, webpage)
3328         if len(mobj) == 0:
3329             self._downloader.trouble(u'ERROR: unable to extract video links')
3330
3331         # Sort in resolution
3332         links = sorted(mobj)
3333
3334         # Choose the lowest of the sort, i.e. highest resolution
3335         video_url = links[-1]
3336         # Only get the url. The resolution part in the tuple has no use anymore
3337         video_url = video_url[-1]
3338         # Treat escaped \u0026 style hex
3339         try:
3340             video_url = video_url.decode("unicode_escape")
3341         except AttributeError: # Python 3
3342             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3343
3344
3345         return [{
3346             'id':       video_id,
3347             'url':      video_url,
3348             'uploader': uploader,
3349             'upload_date':  upload_date,
3350             'title':    video_title,
3351             'ext':      video_extension,
3352         }]
3353
3354 class NBAIE(InfoExtractor):
3355     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3356     IE_NAME = u'nba'
3357
3358     def _real_extract(self, url):
3359         mobj = re.match(self._VALID_URL, url)
3360         if mobj is None:
3361             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3362             return
3363
3364         video_id = mobj.group(1)
3365         if video_id.endswith('/index.html'):
3366             video_id = video_id[:-len('/index.html')]
3367
3368         webpage = self._download_webpage(url, video_id)
3369
3370         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3371         def _findProp(rexp, default=None):
3372             m = re.search(rexp, webpage)
3373             if m:
3374                 return unescapeHTML(m.group(1))
3375             else:
3376                 return default
3377
3378         shortened_video_id = video_id.rpartition('/')[2]
3379         title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3380         info = {
3381             'id': shortened_video_id,
3382             'url': video_url,
3383             'ext': 'mp4',
3384             'title': title,
3385             'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3386             'description': _findProp(r'<div class="description">(.*?)</h1>'),
3387         }
3388         return [info]
3389
3390 class JustinTVIE(InfoExtractor):
3391     """Information extractor for justin.tv and twitch.tv"""
3392     # TODO: One broadcast may be split into multiple videos. The key
3393     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3394     # starts at 1 and increases. Can we treat all parts as one video?
3395
3396     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3397         ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3398     _JUSTIN_PAGE_LIMIT = 100
3399     IE_NAME = u'justin.tv'
3400
3401     def report_extraction(self, file_id):
3402         """Report information extraction."""
3403         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3404
3405     def report_download_page(self, channel, offset):
3406         """Report attempt to download a single page of videos."""
3407         self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3408                 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3409
3410     # Return count of items, list of *valid* items
3411     def _parse_page(self, url):
3412         try:
3413             urlh = compat_urllib_request.urlopen(url)
3414             webpage_bytes = urlh.read()
3415             webpage = webpage_bytes.decode('utf-8', 'ignore')
3416         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3417             self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3418             return
3419
3420         response = json.loads(webpage)
3421         if type(response) != list:
3422             error_text = response.get('error', 'unknown error')
3423             self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
3424             return
3425         info = []
3426         for clip in response:
3427             video_url = clip['video_file_url']
3428             if video_url:
3429                 video_extension = os.path.splitext(video_url)[1][1:]
3430                 video_date = re.sub('-', '', clip['start_time'][:10])
3431                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3432                 video_id = clip['id']
3433                 video_title = clip.get('title', video_id)
3434                 info.append({
3435                     'id': video_id,
3436                     'url': video_url,
3437                     'title': video_title,
3438                     'uploader': clip.get('channel_name', video_uploader_id),
3439                     'uploader_id': video_uploader_id,
3440                     'upload_date': video_date,
3441                     'ext': video_extension,
3442                 })
3443         return (len(response), info)
3444
3445     def _real_extract(self, url):
3446         mobj = re.match(self._VALID_URL, url)
3447         if mobj is None:
3448             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3449             return
3450
3451         api = 'http://api.justin.tv'
3452         video_id = mobj.group(mobj.lastindex)
3453         paged = False
3454         if mobj.lastindex == 1:
3455             paged = True
3456             api += '/channel/archives/%s.json'
3457         else:
3458             api += '/broadcast/by_archive/%s.json'
3459         api = api % (video_id,)
3460
3461         self.report_extraction(video_id)
3462
3463         info = []
3464         offset = 0
3465         limit = self._JUSTIN_PAGE_LIMIT
3466         while True:
3467             if paged:
3468                 self.report_download_page(video_id, offset)
3469             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3470             page_count, page_info = self._parse_page(page_url)
3471             info.extend(page_info)
3472             if not paged or page_count != limit:
3473                 break
3474             offset += limit
3475         return info
3476
3477 class FunnyOrDieIE(InfoExtractor):
3478     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3479
3480     def _real_extract(self, url):
3481         mobj = re.match(self._VALID_URL, url)
3482         if mobj is None:
3483             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3484             return
3485
3486         video_id = mobj.group('id')
3487         webpage = self._download_webpage(url, video_id)
3488
3489         m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3490         if not m:
3491             self._downloader.trouble(u'ERROR: unable to find video information')
3492         video_url = unescapeHTML(m.group('url'))
3493
3494         m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3495         if not m:
3496             self._downloader.trouble(u'Cannot find video title')
3497         title = unescapeHTML(m.group('title'))
3498
3499         m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3500         if m:
3501             desc = unescapeHTML(m.group('desc'))
3502         else:
3503             desc = None
3504
3505         info = {
3506             'id': video_id,
3507             'url': video_url,
3508             'ext': 'mp4',
3509             'title': title,
3510             'description': desc,
3511         }
3512         return [info]
3513
3514 class TweetReelIE(InfoExtractor):
3515     _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3516
3517     def _real_extract(self, url):
3518         mobj = re.match(self._VALID_URL, url)
3519         if mobj is None:
3520             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3521             return
3522
3523         video_id = mobj.group('id')
3524         webpage = self._download_webpage(url, video_id)
3525
3526         m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
3527         if not m:
3528             self._downloader.trouble(u'ERROR: Cannot find status ID')
3529         status_id = m.group(1)
3530
3531         m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3532         if not m:
3533             self._downloader.trouble(u'WARNING: Cannot find description')
3534         desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3535
3536         m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3537         if not m:
3538             self._downloader.trouble(u'ERROR: Cannot find uploader')
3539         uploader = unescapeHTML(m.group('uploader'))
3540         uploader_id = unescapeHTML(m.group('uploader_id'))
3541
3542         m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3543         if not m:
3544             self._downloader.trouble(u'ERROR: Cannot find upload date')
3545         upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
3546
3547         title = desc
3548         video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
3549
3550         info = {
3551             'id': video_id,
3552             'url': video_url,
3553             'ext': 'mov',
3554             'title': title,
3555             'description': desc,
3556             'uploader': uploader,
3557             'uploader_id': uploader_id,
3558             'internal_id': status_id,
3559             'upload_date': upload_date
3560         }
3561         return [info]
3562
3563 class SteamIE(InfoExtractor):
3564     _VALID_URL = r"""http://store.steampowered.com/
3565                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3566                 (?P<gameID>\d+)/?
3567                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3568                 """
3569
3570     def suitable(self, url):
3571         """Receives a URL and returns True if suitable for this IE."""
3572         return re.match(self._VALID_URL, url, re.VERBOSE) is not None
3573
3574     def _real_extract(self, url):
3575         m = re.match(self._VALID_URL, url, re.VERBOSE)
3576         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3577         gameID = m.group('gameID')
3578         videourl = 'http://store.steampowered.com/video/%s/' % gameID
3579         webpage = self._download_webpage(videourl, gameID)
3580         mweb = re.finditer(urlRE, webpage)
3581         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3582         titles = re.finditer(namesRE, webpage)
3583         videos = []
3584         for vid,vtitle in zip(mweb,titles):
3585             video_id = vid.group('videoID')
3586             title = vtitle.group('videoName')
3587             video_url = vid.group('videoURL')
3588             if not video_url:
3589                 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3590             info = {
3591                 'id':video_id,
3592                 'url':video_url,
3593                 'ext': 'flv',
3594                 'title': unescapeHTML(title)
3595                   }
3596             videos.append(info)
3597         return videos
3598
3599 class UstreamIE(InfoExtractor):
3600     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3601     IE_NAME = u'ustream'
3602
3603     def _real_extract(self, url):
3604         m = re.match(self._VALID_URL, url)
3605         video_id = m.group('videoID')
3606         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3607         webpage = self._download_webpage(url, video_id)
3608         m = re.search(r'data-title="(?P<title>.+)"',webpage)
3609         title = m.group('title')
3610         m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3611         uploader = m.group('uploader')
3612         info = {
3613                 'id':video_id,
3614                 'url':video_url,
3615                 'ext': 'flv',
3616                 'title': title,
3617                 'uploader': uploader
3618                   }
3619         return [info]
3620
3621 class RBMARadioIE(InfoExtractor):
3622     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3623
3624     def _real_extract(self, url):
3625         m = re.match(self._VALID_URL, url)
3626         video_id = m.group('videoID')
3627
3628         webpage = self._download_webpage(url, video_id)
3629         m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3630         if not m:
3631             raise ExtractorError(u'Cannot find metadata')
3632         json_data = m.group(1)
3633
3634         try:
3635             data = json.loads(json_data)
3636         except ValueError as e:
3637             raise ExtractorError(u'Invalid JSON: ' + str(e))
3638
3639         video_url = data['akamai_url'] + '&cbr=256'
3640         url_parts = compat_urllib_parse_urlparse(video_url)
3641         video_ext = url_parts.path.rpartition('.')[2]
3642         info = {
3643                 'id': video_id,
3644                 'url': video_url,
3645                 'ext': video_ext,
3646                 'title': data['title'],
3647                 'description': data.get('teaser_text'),
3648                 'location': data.get('country_of_origin'),
3649                 'uploader': data.get('host', {}).get('name'),
3650                 'uploader_id': data.get('host', {}).get('slug'),
3651                 'thumbnail': data.get('image', {}).get('large_url_2x'),
3652                 'duration': data.get('duration'),
3653         }
3654         return [info]
3655
3656
3657 class YouPornIE(InfoExtractor):
3658     """Information extractor for youporn.com."""
3659     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3660
3661     def _print_formats(self, formats):
3662         """Print all available formats"""
3663         print(u'Available formats:')
3664         print(u'ext\t\tformat')
3665         print(u'---------------------------------')
3666         for format in formats:
3667             print(u'%s\t\t%s'  % (format['ext'], format['format']))
3668
3669     def _specific(self, req_format, formats):
3670         for x in formats:
3671             if(x["format"]==req_format):
3672                 return x
3673         return None
3674
3675     def _real_extract(self, url):
3676         mobj = re.match(self._VALID_URL, url)
3677         if mobj is None:
3678             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3679             return
3680
3681         video_id = mobj.group('videoid')
3682
3683         req = compat_urllib_request.Request(url)
3684         req.add_header('Cookie', 'age_verified=1')
3685         webpage = self._download_webpage(req, video_id)
3686
3687         # Get the video title
3688         result = re.search(r'videoTitleArea">(?P<title>.*)</h1>', webpage)
3689         if result is None:
3690             raise ExtractorError(u'ERROR: unable to extract video title')
3691         video_title = result.group('title').strip()
3692
3693         # Get the video date
3694         result = re.search(r'Date:</b>(?P<date>.*)</li>', webpage)
3695         if result is None:
3696             self._downloader.to_stderr(u'WARNING: unable to extract video date')
3697             upload_date = None
3698         else:
3699             upload_date = result.group('date').strip()
3700
3701         # Get the video uploader
3702         result = re.search(r'Submitted:</b>(?P<uploader>.*)</li>', webpage)
3703         if result is None:
3704             self._downloader.to_stderr(u'ERROR: unable to extract uploader')
3705             video_uploader = None
3706         else:
3707             video_uploader = result.group('uploader').strip()
3708             video_uploader = clean_html( video_uploader )
3709
3710         # Get all of the formats available
3711         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3712         result = re.search(DOWNLOAD_LIST_RE, webpage)
3713         if result is None:
3714             raise ExtractorError(u'Unable to extract download list')
3715         download_list_html = result.group('download_list').strip()
3716
3717         # Get all of the links from the page
3718         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3719         links = re.findall(LINK_RE, download_list_html)
3720         if(len(links) == 0):
3721             raise ExtractorError(u'ERROR: no known formats available for video')
3722
3723         self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3724
3725         formats = []
3726         for link in links:
3727
3728             # A link looks like this:
3729             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3730             # A path looks like this:
3731             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3732             video_url = unescapeHTML( link )
3733             path = compat_urllib_parse_urlparse( video_url ).path
3734             extension = os.path.splitext( path )[1][1:]
3735             format = path.split('/')[4].split('_')[:2]
3736             size = format[0]
3737             bitrate = format[1]
3738             format = "-".join( format )
3739             title = u'%s-%s-%s' % (video_title, size, bitrate)
3740
3741             formats.append({
3742                 'id': video_id,
3743                 'url': video_url,
3744                 'uploader': video_uploader,
3745                 'upload_date': upload_date,
3746                 'title': title,
3747                 'ext': extension,
3748                 'format': format,
3749                 'thumbnail': None,
3750                 'description': None,
3751                 'player_url': None
3752             })
3753
3754         if self._downloader.params.get('listformats', None):
3755             self._print_formats(formats)
3756             return
3757
3758         req_format = self._downloader.params.get('format', None)
3759         self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
3760
3761         if req_format is None or req_format == 'best':
3762             return [formats[0]]
3763         elif req_format == 'worst':
3764             return [formats[-1]]
3765         elif req_format in ('-1', 'all'):
3766             return formats
3767         else:
3768             format = self._specific( req_format, formats )
3769             if result is None:
3770                 self._downloader.trouble(u'ERROR: requested format not available')
3771                 return
3772             return [format]
3773
3774
3775
3776 class PornotubeIE(InfoExtractor):
3777     """Information extractor for pornotube.com."""
3778     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3779
3780     def _real_extract(self, url):
3781         mobj = re.match(self._VALID_URL, url)
3782         if mobj is None:
3783             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3784             return
3785
3786         video_id = mobj.group('videoid')
3787         video_title = mobj.group('title')
3788
3789         # Get webpage content
3790         webpage = self._download_webpage(url, video_id)
3791
3792         # Get the video URL
3793         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3794         result = re.search(VIDEO_URL_RE, webpage)
3795         if result is None:
3796             self._downloader.trouble(u'ERROR: unable to extract video url')
3797             return
3798         video_url = compat_urllib_parse.unquote(result.group('url'))
3799
3800         #Get the uploaded date
3801         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3802         result = re.search(VIDEO_UPLOADED_RE, webpage)
3803         if result is None:
3804             self._downloader.trouble(u'ERROR: unable to extract video title')
3805             return
3806         upload_date = result.group('date')
3807
3808         info = {'id': video_id,
3809                 'url': video_url,
3810                 'uploader': None,
3811                 'upload_date': upload_date,
3812                 'title': video_title,
3813                 'ext': 'flv',
3814                 'format': 'flv'}
3815
3816         return [info]
3817
3818 class YouJizzIE(InfoExtractor):
3819     """Information extractor for youjizz.com."""
3820     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3821
3822     def _real_extract(self, url):
3823         mobj = re.match(self._VALID_URL, url)
3824         if mobj is None:
3825             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3826             return
3827
3828         video_id = mobj.group('videoid')
3829
3830         # Get webpage content
3831         webpage = self._download_webpage(url, video_id)
3832
3833         # Get the video title
3834         result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3835         if result is None:
3836             raise ExtractorError(u'ERROR: unable to extract video title')
3837         video_title = result.group('title').strip()
3838
3839         # Get the embed page
3840         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3841         if result is None:
3842             raise ExtractorError(u'ERROR: unable to extract embed page')
3843
3844         embed_page_url = result.group(0).strip()
3845         video_id = result.group('videoid')
3846
3847         webpage = self._download_webpage(embed_page_url, video_id)
3848
3849         # Get the video URL
3850         result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3851         if result is None:
3852             raise ExtractorError(u'ERROR: unable to extract video url')
3853         video_url = result.group('source')
3854
3855         info = {'id': video_id,
3856                 'url': video_url,
3857                 'title': video_title,
3858                 'ext': 'flv',
3859                 'format': 'flv',
3860                 'player_url': embed_page_url}
3861
3862         return [info]
3863
3864 class EightTracksIE(InfoExtractor):
3865     IE_NAME = '8tracks'
3866     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3867
3868     def _real_extract(self, url):
3869         mobj = re.match(self._VALID_URL, url)
3870         if mobj is None:
3871             raise ExtractorError(u'Invalid URL: %s' % url)
3872         playlist_id = mobj.group('id')
3873
3874         webpage = self._download_webpage(url, playlist_id)
3875
3876         m = re.search(r"new TRAX.Mix\((.*?)\);\n*\s*TRAX.initSearchAutocomplete\('#search'\);", webpage, flags=re.DOTALL)
3877         if not m:
3878             raise ExtractorError(u'Cannot find trax information')
3879         json_like = m.group(1)
3880         data = json.loads(json_like)
3881
3882         session = str(random.randint(0, 1000000000))
3883         mix_id = data['id']
3884         track_count = data['tracks_count']
3885         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3886         next_url = first_url
3887         res = []
3888         for i in itertools.count():
3889             api_json = self._download_webpage(next_url, playlist_id,
3890                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3891                 errnote=u'Failed to download song information')
3892             api_data = json.loads(api_json)
3893             track_data = api_data[u'set']['track']
3894             info = {
3895                 'id': track_data['id'],
3896                 'url': track_data['track_file_stream_url'],
3897                 'title': track_data['performer'] + u' - ' + track_data['name'],
3898                 'raw_title': track_data['name'],
3899                 'uploader_id': data['user']['login'],
3900                 'ext': 'm4a',
3901             }
3902             res.append(info)
3903             if api_data['set']['at_last_track']:
3904                 break
3905             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3906         return res
3907
3908 def gen_extractors():
3909     """ Return a list of an instance of every supported extractor.
3910     The order does matter; the first extractor matched is the one handling the URL.
3911     """
3912     return [
3913         YoutubePlaylistIE(),
3914         YoutubeChannelIE(),
3915         YoutubeUserIE(),
3916         YoutubeSearchIE(),
3917         YoutubeIE(),
3918         MetacafeIE(),
3919         DailymotionIE(),
3920         GoogleSearchIE(),
3921         PhotobucketIE(),
3922         YahooIE(),
3923         YahooSearchIE(),
3924         DepositFilesIE(),
3925         FacebookIE(),
3926         BlipTVUserIE(),
3927         BlipTVIE(),
3928         VimeoIE(),
3929         MyVideoIE(),
3930         ComedyCentralIE(),
3931         EscapistIE(),
3932         CollegeHumorIE(),
3933         XVideosIE(),
3934         SoundcloudIE(),
3935         InfoQIE(),
3936         MixcloudIE(),
3937         StanfordOpenClassroomIE(),
3938         MTVIE(),
3939         YoukuIE(),
3940         XNXXIE(),
3941         YouJizzIE(),
3942         PornotubeIE(),
3943         YouPornIE(),
3944         GooglePlusIE(),
3945         ArteTvIE(),
3946         NBAIE(),
3947         JustinTVIE(),
3948         FunnyOrDieIE(),
3949         TweetReelIE(),
3950         SteamIE(),
3951         UstreamIE(),
3952         RBMARadioIE(),
3953         EightTracksIE(),
3954         GenericIE()
3955     ]
3956
3957