Fix Stanford (Closes #653)
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18
19 from .utils import *
20
21
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor ("IE") turns a URL into a *list* of
    dictionaries describing the video(s) behind it; the FileDownloader
    then acts on that information, typically by downloading the video.

    Mandatory fields in every result dictionary:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional fields:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All of these fields should be Unicode strings.

    Subclasses should define a _VALID_URL regexp and override
    _real_initialize() and _real_extract(); they should usually also be
    added to the list of extractors.  Broken extractors should set
    _WORKING = False so that users are warned and the tests skip them.
    """

    # Class-level defaults; __init__ resets _ready per instance.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching *downloader*."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True when *url* matches this IE's _VALID_URL pattern."""
        return bool(re.match(self._VALID_URL, url))

    def working(self):
        """Return whether this IE is known to be functional."""
        return self._WORKING

    def initialize(self):
        """Run one-time setup (authentication, etc.) exactly once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then return the list of info dicts for *url*."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach *downloader* as the FileDownloader used for output."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the name from the class: e.g. "YoutubeIE" -> "Youtube".
        class_name = type(self).__name__
        return class_name[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Open *url_or_request*, logging *note*, and return the response handle.

        Raises ExtractorError (with the original traceback attached) on any
        network-level failure.
        """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as exc:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(exc)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Fetch *url_or_request* and return the page body decoded as text."""
        handle = self._request_webpage(url_or_request, video_id, note, errnote)
        return handle.read().decode('utf-8', 'replace')
128
129
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com.

    Matches single-video watch/embed URLs (and naked 11-character IDs);
    playlist-style URLs are explicitly rejected by _VALID_URL.
    """

    # Verbose-mode regex: group 1 is the (optional) URL prefix,
    # group 2 is the video ID itself (see _extract_id).
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Forces the English/US site so the scraping regexes below match.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Captures the original target of an age-verification style redirect.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    # Same itags, but free (WebM) formats are ranked above equivalent MP4s.
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; 'flv' is the fallback for unknown itags.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> dimensions; NOTE: stored as "heightxwidth" (e.g. '22' is 1280x720).
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    # Plain attribute deliberately shadows the base-class IE_NAME property.
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is a commented pattern.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text."""
        srt = ''
        # Groups: 1=start, 2=whole optional dur attribute, 3=dur value, 4=caption text.
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'
            start = float(start)
            end = start + float(dur)
            # Format seconds as the SRT timestamp "HH:MM:SS,mmm".
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Fetch closed captions for *video_id*.

        Returns a (warning, srt) pair: exactly one of the two is None.
        Warnings are returned as strings rather than raised so the caller
        can treat missing subtitles as non-fatal.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map language code -> track name from the listing document.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language preference: explicit option > English > first available.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
        try:
            srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print each available itag with its extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language and, when credentials are available, log in
        and confirm age.  All failures are reported as warnings/errors via
        the downloader rather than raised."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language (must happen even without credentials so the
        # English-only scraping regexes keep working).
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':     '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID from *url*, or report an error and return None."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # Group 2 is the ID; group 1 is the optional URL prefix.
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Extract the info dictionaries for a single YouTube video URL.

        Downloads the watch page and get_video_info document, then builds
        one result dict per selected format.  Errors are reported through
        self._downloader.trouble() followed by a bare return.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the JSON-style backslash escapes in the URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants; the first response
        # containing a 'token' wins.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (non-fatal: scraped from the watch page, not video_info)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalise separators to spaces before trying the date formats.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    pass
                # NOTE(review): bare except hides parse errors and the loop
                # does not break after a successful parse; it only works
                # because the remaining formats fail harmlessly on the
                # already-converted YYYYMMDD string — consider except
                # ValueError + break.

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions (optional; failures degrade to a warning)
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes every surviving entry also carries a
            # 'sig' key; a stream without one would raise KeyError here —
            # confirm against current get_video_info responses.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        # Build one result dict per (format, url) pair selected above.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
547
548
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com.

    Downloads the watch page and scrapes the media URL (plus the gdaKey /
    flashvars fallbacks), title and uploader.  YouTube-hosted videos are
    delegated back to the downloader so YoutubeIE handles them.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the family-filter opt-out so
        age-restricted videos are reachable."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the single info dict for a metacafe watch URL.

        Errors are reported through self._downloader.trouble() followed by
        a bare return (matching the other extractors in this file).
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Hand the embedded YouTube ID back to the downloader.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            # Decode the page once here: the regexes below use str patterns,
            # and matching them against a bytes page breaks on Python 3.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Direct mediaURL parameter found in the page.
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the JSON-ish mediaData blob inside flashvars.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # webpage was decoded above, so the groups are already text; the
        # previous per-value .decode('utf-8') calls would fail on Python 3.
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
674
675
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the media URL, title, uploader and date from a video page.

        Returns a single-entry list of info dictionaries, or None after
        reporting the problem through self._downloader.trouble().
        """
        # The video id is the URL path component up to the first '_' or '?'.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter disabled so restricted
        # videos are still served.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        flashvars_match = re.search(r'\s*var flashvars = (.*)', webpage)
        if flashvars_match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(flashvars_match.group(1))

        # Quality keys in descending preference order; pick the best one
        # that appears in the flashvars blob.
        max_quality = None
        for quality_key in ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url'):
            if quality_key in flashvars:
                max_quality = quality_key
                self._downloader.to_screen(u'[dailymotion] Using %s' % quality_key)
                break
        if max_quality is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        url_match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        # The URL is JSON-escaped inside the flashvars ('\/' -> '/').
        video_url = compat_urllib_parse.unquote(url_match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        title_match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(title_match.group('title'))

        # Prefer the plain owner span; fall back to the official-user markup.
        video_uploader = None
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = official_match.group(1)

        # The page shows DD-MM-YYYY; the info dict wants YYYYMMDD.
        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
762
763
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv URL, title and uploader from a photobucket page.

        Returns a single-entry list of info dictionaries, or None after
        reporting the problem through self._downloader.trouble().
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information.
        # urlopen().read() returns bytes on Python 3, so decode once here;
        # the previous code regex-searched the raw bytes and then called
        # .decode('utf-8') on str objects, both of which break on Python 3.
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        # Title and uploader come from the same <title> element.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
827
828
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from a Yahoo! Video page.

        Non-/watch/ URLs are first rewritten to a canonical /watch/ URL
        and re-extracted (new_video=False guards a single recursion).
        Returns a single-entry list of info dictionaries, or None after
        reporting the problem through self._downloader.trouble().
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # BUGFIX: group(1) is the literal 'people'/'profile' path segment;
        # the uploader name is the anchor text captured by group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
970
971
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs.
    # BUGFIX: the dot after (?:www|player) was unescaped and so matched
    # any character; escape it to match only a literal '.'.
    _VALID_URL = r'(?:https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from the player config JSON embedded in the page.

        Returns a single-entry list of info dictionaries, or None after
        reporting the problem through self._downloader.trouble().
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON (the blob between ' = {config:' and ',assets:').
        # BUGFIX: the bare 'except:' also swallowed KeyboardInterrupt and
        # SystemExit; only the failures this parse can actually produce are
        # caught now (IndexError from the split, ValueError from json.loads).
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1086
1087
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    # Matches French/German video listing pages on videos.arte.tv.
    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in 'index-<n>.html'.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return its content, or None on failure
        (after reporting through self._downloader.trouble)."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url and match regex against it.

        matchTuples is a list of (group_index, key, error_message); each
        named group is stored under key in the returned dict. Returns None
        (after reporting) if the regex or any group fails to match.

        NOTE(review): if fetch_webpage failed, page is None and re.search
        will raise TypeError here rather than report cleanly — confirm.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream JS to locate the rtmp URL.

        NOTE(review): the computed video_url is never returned or stored,
        so live streams produce no downloadable result — _real_extract
        simply returns after calling this. Looks intentional-but-unfinished;
        confirm before relying on live support.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an Arte+7 page through two intermediate XML documents
        down to the hd-quality media URL; returns an info dictionary."""
        video_lang = url.split('/')[-3]
        # Step 1: the page embeds a videoref XML URL in the player params.
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Step 2: the videoref lists one <video> per language; pick ours.
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: the per-language document carries id, title, date and
        # the actual hd media URL.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            # NOTE(review): .decode('utf-8') assumes title is a byte string;
            # on Python 3 regex groups are str and this would fail — confirm.
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        # Dispatch on URL shape: live pages get the (currently no-op)
        # live handler, everything else goes through Arte+7 resolution.
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1222
1223
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    # Accepts anything; this IE must be tried last.
    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain.

        Issues a HEAD request and, if the final URL differs, hands the new
        URL back to the downloader (restarting IE selection) and returns
        True; returns False when url is not a redirect.
        """
        class HeadRequest(compat_urllib_request.Request):
            # Force the HEAD verb so only headers are transferred.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers; a HEAD has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # Re-issue as a plain (GET) Request through the same opener.
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        # Restart the whole download chain with the resolved URL.
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        """Scrape a direct media URL out of an arbitrary page.

        Tries JW-Player-style flashvars first, then a looser
        file=/source= pattern. Returns a single-entry list of info
        dictionaries, or None after reporting via trouble().
        """
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1368
1369
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles pseudo-URLs of the form ``ytsearchN:terms`` (first N results),
    ``ytsearch:terms`` (first result only) and ``ytsearchall:terms`` (up to
    _max_youtube_results), querying the GData API page by page and handing
    each found video URL to the downloader.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # NOTE(review): the query was encoded as UTF-8 in _real_extract but
        # is decoded here with the locale's preferred encoding; these only
        # agree on UTF-8 locales -- confirm before relying on non-ASCII
        # queries on other locales.
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split only on the FIRST ':' so that search terms containing a
        # colon (e.g. "ytsearch5:foo: bar") do not break the unpacking.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            # Keep the int() call alone inside the try block: previously
            # _download_n_results was also wrapped, so a ValueError raised
            # during download was misread as a malformed prefix.
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_youtube_results:
                self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            self._download_n_results(query, n)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query."""

        video_ids = []
        pagenum = 0
        limit = n

        # 50 results per API page; stop once we have requested enough pages
        # to cover min(n, totalItems).
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API reports the true number of hits; never request more
            # pages than the service can actually deliver.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1444
1445
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles ``gvsearchN:terms``, ``gvsearch:terms`` and ``gvsearchall:terms``
    pseudo-URLs by scraping the Google Video result pages and enqueueing
    each found video with the downloader.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split only on the FIRST ':' so that search terms containing a
        # colon do not raise a ValueError on unpacking.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            # Only int(prefix) belongs in the try: previously a ValueError
            # from _download_n_results was swallowed as a bad prefix.
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_google_results:
                self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            self._download_n_results(query, n)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query."""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers; stop as soon as n distinct ids
            # have been collected.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            # No "next page" link means the result set is exhausted.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1526
1527
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles ``yvsearchN:terms``, ``yvsearch:terms`` and ``yvsearchall:terms``
    pseudo-URLs. Marked _WORKING = False: the site-side markup this scraper
    depends on is known to be out of date.
    """

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split only on the FIRST ':' so that search terms containing a
        # colon do not raise a ValueError on unpacking.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            # Only int(prefix) belongs in the try: previously a ValueError
            # raised during download was misread as a malformed prefix.
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_yahoo_results:
                self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query."""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers; a set gives O(1) dedup while the
            # list preserves first-seen order.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No "next page" link means the result set is exhausted.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1612
1613
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Accepts playlist, course, artist and user-page URLs (see _VALID_URL),
    downloads every listing page, collects the video ids and enqueues each
    one with the downloader, honouring the 'playliststart'/'playlistend'
    downloader parameters.
    """

    # group(1): query-parameter name ('p', 'a' or 'list') that selects the
    #           listing type below; group(2): the playlist/artist id;
    #           group(3): an optional trailing video id (single-video case)
    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    # filled with: playlist_access, playlist_prefix, playlist_id, page number
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    # %s is the playlist id: matches watch links whose 'list=' parameter
    # refers to this playlist
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    # literal marker that only appears while further listing pages exist
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case: the URL carried a trailing video id, so hand
        # it straight back to the downloader (presumably the YouTube
        # extractor accepts bare video ids -- TODO confirm).
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            # artist pages use a different endpoint and query parameter
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            # NOTE: rebinds the 'url' parameter to the listing-page URL
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers (deduplicated within one page only;
            # duplicates across pages are kept)
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            # stop once the "next page" marker no longer appears
            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # Apply the requested slice: playliststart is 1-based (converted to
        # a 0-based index); playlistend of -1 means "to the end", otherwise
        # it is used directly as the (exclusive) slice end.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1692
1693
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    Walks the paginated channel listing, gathers every video id and hands
    each watch-page URL to the downloader.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # literal marker that only appears while further listing pages exist
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Pull the channel id out of the URL; anything else is an error.
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return
        channel_id = match.group(1)

        # Fetch listing pages until the "next page" marker disappears.
        collected = []
        page_no = 1
        while True:
            self.report_download_page(channel_id, page_no)
            listing_url = self._TEMPLATE_URL % (channel_id, page_no)
            req = compat_urllib_request.Request(listing_url)
            try:
                page = compat_urllib_request.urlopen(req).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Record each watch-link id once per page, in first-seen order.
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)
            collected.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            page_no += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(collected)))

        for vid in collected:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % vid])
        return
1744
1745
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Queries the GData uploads feed of a user window by window, collects the
    video ids and enqueues the selected range of watch-page URLs.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Identify the user from the URL or "ytuser:" pseudo-URL.
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return
        username = match.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE entries, so
        # request consecutive windows until one comes back short.
        collected = []
        page_no = 0
        while True:
            first_index = page_no * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, first_index)

            req = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, first_index))

            try:
                feed = compat_urllib_request.urlopen(req).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # De-duplicate ids within this window, keeping their order.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, feed):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)

            collected.extend(page_ids)

            # A short window means the feed is exhausted - no further
            # queries are needed.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            page_no += 1

        all_ids_count = len(collected)

        # Apply the 1-based playliststart / playlistend selection
        # (playlistend == -1 means "to the end").
        first_wanted = self._downloader.params.get('playliststart', 1) - 1
        last_wanted = self._downloader.params.get('playlistend', -1)

        if last_wanted == -1:
            wanted = collected[first_wanted:]
        else:
            wanted = collected[first_wanted:last_wanted]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(wanted)))

        for video_id in wanted:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1827
1828
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves the numeric user id from the profile page, pages through the
    Ajax episode listing and enqueues each found video page URL.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Parse the numeric user id out of the profile page. Previously this
        # lived inside the try above, so a page without the attribute caused
        # an uncaught AttributeError instead of a clean error message.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract blip.tv user id from %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # compat_str for consistency with the other extractors
                # (plain str() may misbehave on py2 unicode messages)
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, deduplicated within a page.
            # Unescape BEFORE the membership test so the comparison and the
            # stored value agree (previously the raw href was compared
            # against already-unescaped entries).
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Apply the 1-based playliststart / playlistend selection
        # (playlistend == -1 means "to the end").
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1919
1920
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com.

    Simulates pressing the 'Free download' button and scrapes the resulting
    page for the real file URL and title.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed.
        # The POST body must be bytes for urllib.request on Python 3.
        free_download_indication = { 'gateway_result' : '1' }
        post_data = compat_urllib_parse.urlencode(free_download_indication).encode('utf-8')
        request = compat_urllib_request.Request(url, post_data)
        try:
            self.report_download_webpage(file_id)
            # Decode to text immediately, like the other extractors in this
            # file, so the str regexes below work on Python 2 and 3 alike.
            # 'replace' keeps a stray non-UTF-8 byte from aborting the whole
            # extraction.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # collapse the message onto one line for display
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # webpage is already text, so no further .decode() calls are needed
        # (the old ones crashed on Python 3 where str has no decode()).
        file_title = mobj.group(1)

        return [{
            'id':       file_id,
            'url':      file_url,
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension,
        }]
1979
1980
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook.

    Optionally logs in (credentials from options or .netrc), then parses the
    player bootstrap data of a video page for the HD source URL and metadata.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in with credentials from options or .netrc, if available."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials: proceed anonymously.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        # POST data must be bytes for urllib.request on Python 3.
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
        try:
            self.report_login()
            # Decode the response so the str regex below works on Python 3
            # as well (bytes cannot be searched with a str pattern).
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # A login form in the response means the login was rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract id, title, HD source URL and metadata of a video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The flashvars JSON sits between these two literal snippets of the
        # player bootstrap script.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        # The captured JSON is an array of [name, value] pairs.
        data = dict(json.loads(m.group(1)))
        video_url = compat_urllib_parse.unquote(data['hd_src'])
        video_duration = int(data['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': data['thumbnail_src'],
        }
        return [info]
2070
2071
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv

    Requests the page again with blip.tv's JSON skin; if the server instead
    answers with a video Content-Type, the URL is treated as a direct
    download and handed over as-is (with the open handle attached).
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Extracts the filename extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL points straight at a media file."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video information from a blip.tv URL.

        Returns a one-element list containing the info dictionary, or None
        after reporting trouble.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-skin query, joining with '?' or '&' as needed.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different payloads per client; impersonate iTunes.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): str has no .decode on Python 3 -- this path
                # presumably predates the 2/3 transition; confirm on py3.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Hand the already-open handle to the downloader.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # e.g. '12-31-12 11:20PM' -> '20121231'
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        return [info]
2161
2162
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the flv media URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed: was `self._download.trouble` (nonexistent attribute),
            # which raised AttributeError instead of reporting the bad URL.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link exposes the movie directory; the flv lives
        # alongside it as <dir>/<video_id>.flv.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2211
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates, lowest quality last in the config XML.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per bitrate (used for --list-formats output).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display resolution per bitrate (used for --list-formats output).
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report download of the per-media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the episode's MRSS index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print available formats with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one info dict per part of the episode/clip at `url`.

        Flow: resolve shortname URLs, follow the full-episodes redirect,
        find the mtvnservices media URI in the page, fetch the MRSS index,
        then one config XML per part, and rewrite the RTMP URL to HTTP.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # ':tds' / ':colbert' style abbreviations map to the newest full
        # episode of the respective show.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # A bare full-episodes URL means "download the newest episode";
            # the server redirects it to a specific one.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Re-parse the URL we were redirected to; it must now name a
            # concrete episode.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # The second capture group is the bare mgid-style URI.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        # Each <item> in the MRSS index is one part of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp-url) pairs from the config XML.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp URL to the equivalent progressive-HTTP one.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2405
2406
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract video info from an escapistmagazine.com view page.

        Reads the OpenGraph meta tags for description/thumbnail/player URL,
        then fetches the player's config (JS-style JSON) to get the media
        URL. Returns a one-element list with the info dict, or None after
        reporting trouble.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Decode using the charset from the Content-Type header,
            # defaulting to utf-8 if none is declared.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # NOTE(review): these meta-tag searches are unchecked -- a missing
        # tag raises AttributeError on .group(1) rather than a clean error.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The config URL is passed URL-encoded in the player URL's query.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        # The second playlist entry holds the actual media URL.
        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2480
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Marked broken; kept for reference until the site format is re-examined.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report download of the f4m XML manifest."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract video info via the moogaloop metadata XML and the
        Adobe HDS (f4m) manifest, composing an f4f fragment URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # Step 1: metadata XML gives title/description/thumbnail and the
        # manifest URL.
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        # Step 2: the f4m manifest names the media node and canonical id.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Compose the first-fragment URL on the manifest's host.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2551
2552
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Pull the flv URL, title and thumbnail out of an xvideos page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The media URL is URL-encoded inside the player flashvars.
        match = re.search(r'flv_url=(.+?)&', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title comes from the <title> tag, minus the site suffix.
        match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        # The whole matched URL (group 0) is the thumbnail address.
        match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2610
2611
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        # No extractor-specific state; defer entirely to the base class.
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report resolution of the URL to a track id via the API."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report retrieval of the stream definitions."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract track info from a soundcloud.com permalink.

        Two API round-trips: resolve.json turns the permalink into track
        metadata (including the numeric id), then the streams endpoint
        yields the direct mp3 URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title =  mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # NOTE: the client_id below is hard-coded and may stop working if
        # soundcloud revokes it.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2684
2685
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Decode the base64 media reference on an InfoQ page into an
        rtmpe stream URL, and scrape title/description from the page."""
        if re.match(self._VALID_URL, url) is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The media path is base64-encoded in the jsclassref attribute.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # The description meta tag is optional; keep a placeholder if absent.
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Derive an id and extension from the final path component.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2739
2740 class MixcloudIE(InfoExtractor):
2741     """Information extractor for www.mixcloud.com"""
2742
2743     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2744     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2745     IE_NAME = u'mixcloud'
2746
    def __init__(self, downloader=None):
        # No extractor-specific state; defer entirely to the base class.
        InfoExtractor.__init__(self, downloader)
2749
    def report_download_json(self, file_id):
        """Report JSON download.  (file_id is accepted but not displayed.)"""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2753
    def report_extraction(self, file_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2757
2758     def get_urls(self, jsonData, fmt, bitrate='best'):
2759         """Get urls from 'audio_formats' section in json"""
2760         file_url = None
2761         try:
2762             bitrate_list = jsonData[fmt]
2763             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2764                 bitrate = max(bitrate_list) # select highest
2765
2766             url_list = jsonData[fmt][bitrate]
2767         except TypeError: # we have no bitrate info.
2768             url_list = jsonData[fmt]
2769         return url_list
2770
2771     def check_urls(self, url_list):
2772         """Returns 1st active url from list"""
2773         for url in url_list:
2774             try:
2775                 compat_urllib_request.urlopen(url)
2776                 return url
2777             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2778                 url = None
2779
2780         return None
2781
2782     def _print_formats(self, formats):
2783         print('Available formats:')
2784         for fmt in formats.keys():
2785             for b in formats[fmt]:
2786                 try:
2787                     ext = formats[fmt][b][0]
2788                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2789                 except TypeError: # we have no bitrate info
2790                     ext = formats[fmt][0]
2791                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2792                     break
2793
2794     def _real_extract(self, url):
2795         mobj = re.match(self._VALID_URL, url)
2796         if mobj is None:
2797             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2798             return
2799         # extract uploader & filename from url
2800         uploader = mobj.group(1).decode('utf-8')
2801         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2802
2803         # construct API request
2804         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2805         # retrieve .json file with links to files
2806         request = compat_urllib_request.Request(file_url)
2807         try:
2808             self.report_download_json(file_url)
2809             jsonData = compat_urllib_request.urlopen(request).read()
2810         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2811             self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2812             return
2813
2814         # parse JSON
2815         json_data = json.loads(jsonData)
2816         player_url = json_data['player_swf_url']
2817         formats = dict(json_data['audio_formats'])
2818
2819         req_format = self._downloader.params.get('format', None)
2820         bitrate = None
2821
2822         if self._downloader.params.get('listformats', None):
2823             self._print_formats(formats)
2824             return
2825
2826         if req_format is None or req_format == 'best':
2827             for format_param in formats.keys():
2828                 url_list = self.get_urls(formats, format_param)
2829                 # check urls
2830                 file_url = self.check_urls(url_list)
2831                 if file_url is not None:
2832                     break # got it!
2833         else:
2834             if req_format not in formats:
2835                 self._downloader.trouble(u'ERROR: format is not available')
2836                 return
2837
2838             url_list = self.get_urls(formats, req_format)
2839             file_url = self.check_urls(url_list)
2840             format_param = req_format
2841
2842         return [{
2843             'id': file_id.decode('utf-8'),
2844             'url': file_url.decode('utf-8'),
2845             'uploader': uploader.decode('utf-8'),
2846             'upload_date': None,
2847             'title': json_data['name'],
2848             'ext': file_url.split('.')[-1].decode('utf-8'),
2849             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2850             'thumbnail': json_data['thumbnail_url'],
2851             'description': json_data['description'],
2852             'player_url': player_url.decode('utf-8'),
2853         }]
2854
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # NOTE: dots in the hostname and .php are escaped so that the pattern
    # matches only the literal characters.
    _VALID_URL = r'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall(r'<a href="(VideoPage\.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            # BUGFIX: the root page was fetched as raw bytes with urlopen(),
            # and the str regexes below failed against it on Python 3; use
            # the shared helper (as the course branch already does), which
            # returns decoded text and handles errors uniformly.
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            rootpage = self._download_webpage(rootURL, info['id'],
                errnote=u'Unable to download course info page')

            info['title'] = info['id']

            links = orderedSet(re.findall(r'<a href="(CoursePage\.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
2966
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # BUGFIX: the webpage is already decoded text, so .decode('iso-8859-1')
        # on the matched groups raised AttributeError on Python 3.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble(u'ERROR: Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3046
3047
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves a video as multiple flv/mp4 segments; one info dict is
    returned per segment. The real segment file id must be decrypted from
    the obfuscated 'streamfileids' value using the per-video 'seed'.
    """
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Generate a pseudo-unique session id: ms timestamp + two random ints."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the substitution alphabet for this video's seed.

        Deterministically shuffles the character set below with a linear
        congruential generator (seed = (seed*211 + 30031) % 65536); each
        step picks and removes one character. Returned as a list so
        _get_file_id can index into it.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decrypt the obfuscated file id.

        fileId is a '*'-separated list of indices into the shuffled
        alphabet produced by _get_file_ID_mix_string; map each index back
        to its character and join the result.
        """
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # JSON endpoint describing all streams/segments of the video.
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the requested format to a Youku stream name: best -> hd2
            # when available (container still flv), 'worst' -> mp4, anything
            # else falls back to plain flv.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            # Splice the segment number (2 hex digits) into positions 8-9
            # of the decrypted file id; 'key' authorizes the download.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3157
3158
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report that the video webpage is being downloaded."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that information is being extracted from the page."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        # Fetch the page and decode it as UTF-8.
        try:
            raw_page = compat_urllib_request.urlopen(url).read()
            webpage = raw_page.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # Direct flv URL (percent-encoded in the page).
        url_match = re.search(self.VIDEO_URL_RE, webpage)
        if not url_match:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # Title comes from the <title> tag, minus the site suffix.
        title_match = re.search(self.VIDEO_TITLE_RE, webpage)
        if not title_match:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        thumb_match = re.search(self.VIDEO_THUMB_RE, webpage)
        if not thumb_match:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }
        return [info]
3221
3222
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # BUGFIX: without this return, mobj.group(1) below raised
            # AttributeError on None.
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # BUGFIX: without this return, links[-1] below raised IndexError
            # on an empty list.
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3346
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # First (HTML-unescaped) group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUGFIX: key was misspelled 'uploader_date'; the documented
            # optional field the downloader recognizes is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3382
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        # BUGFIX: the error paths used to return None, which crashed the
        # tuple unpacking in _real_extract; return an empty page instead.
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            return (0, [])

        response = json.loads(webpage)
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: archives are paginated.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3469
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: unable to find video information')
            # BUGFIX: without this return, m.group('url') raised
            # AttributeError on None.
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            self._downloader.trouble(u'Cannot find video title')
            # BUGFIX: same crash as above when the title is missing.
            return
        title = unescapeHTML(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3506
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com video statuses."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find status ID')
            # return was missing: falling through crashed on m.group(1) below
            return
        status_id = m.group(1)

        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not m:
            # Description is non-essential: warn and continue instead of
            # crashing on m.group(1) with a None match object.
            self._downloader.trouble(u'WARNING: Cannot find description')
            desc = None
        else:
            # Strip embedded links before unescaping the tweet text.
            desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find uploader')
            return
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find upload date')
            return
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        # The tweet text doubles as the video title.
        title = desc
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mov',
            'title': title,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }
        return [info]
3555         
class SteamIE(InfoExtractor):
    """Information extractor for trailers on the Steam store."""
    _VALID_URL = r"""http://store.steampowered.com/ 
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        # Matches the javascript movie descriptors embedded in the page.
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        # Always fetch the dedicated video page for the game.
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        videos = []
        for vid,vtitle in zip(mweb,titles):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            if not video_url:
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
                # Skip this entry: appending an info dict with url=None
                # would break the downloader later on.
                continue
            info = {
                'id':video_id,
                'url':video_url,
                'ext': 'flv',
                'title': unescapeHTML(title)
                  }
            videos.append(info)
        return videos
3591
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The stream itself is served from the CDN under the recording id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        # Pull title and uploader (channel id) out of the page markup.
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader_m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage)
        uploader = uploader_m.group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3613
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        # Show metadata is embedded as JSON in an inline <script> block.
        json_m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if json_m is None:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(json_m.group(1))
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON: ' + str(err))

        # Request the 256 kbps variant from the Akamai stream URL.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3648
3649
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of formats whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The age check is bypassed with a cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'videoTitleArea">(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (non-fatal: warn and continue without it)
        result = re.search(r'Date:</b>(?P<date>.*)</li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'WARNING: unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader (non-fatal as well)
        result = re.search(r'Submitted:</b>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'ERROR: unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path component encodes "<size>_<bitrate>_<id>".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Bug fix: previously tested `result is None` (a stale match
            # object from the download-list search), so an unavailable
            # requested format returned [None] instead of erroring out.
            if format is None:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            return [format]
3766
3767         
3768
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Both the id and the title are taken straight from the URL.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # Bug fix: the old message claimed "unable to extract video
            # title" although this step extracts the upload date.
            self._downloader.trouble(u'ERROR: unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3810
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Fetch the landing page first.
        webpage = self._download_webpage(url, mobj.group('videoid'))

        # The video title lives in the page <title> element.
        title_m = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_m is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_m.group('title').strip()

        # Locate the embed page, which carries the actual stream URL.
        embed_m = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_m is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_m.group(0).strip()
        video_id = embed_m.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flash player receives the file URL via addVariable().
        source_m = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_m is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{'id': video_id,
                 'url': source_m.group('source'),
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3856
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one entry per track)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as the JSON argument of new TRAX.Mix(...).
        mix_m = re.search(r"new TRAX.Mix\((.*?)\);\n*\s*TRAX.initSearchAutocomplete\('#search'\);", webpage, flags=re.DOTALL)
        if not mix_m:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_m.group(1))

        # A random session id is required by the play/next API endpoints.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        res = []
        track_number = 0
        # Walk the mix one track at a time until the API flags the last one.
        while True:
            track_number += 1
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            res.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
3900
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # NOTE: GenericIE must stay last — it is the catch-all fallback.
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        JustinTVIE,
        FunnyOrDieIE,
        TweetReelIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        GenericIE,
    )
    return [klass() for klass in extractor_classes]
3949
3950