Merge remote-tracking branch 'dcoppa/master'
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18
19 from .utils import *
20
21
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor (IE) takes a URL and produces the metadata
    needed to download the video(s) it points to: the real media URL, the
    title, the uploader, and so on.  The result is a list of dictionaries
    that is handed to the FileDownloader, which may then download the media
    to the file system, print metadata, or do something else with it.

    Mandatory keys of each result dictionary:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional keys:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses should define a _VALID_URL regular expression and override
    _real_initialize() and/or _real_extract(); _real_extract() must return
    a *list* of dictionaries shaped as above.  They should usually also be
    added to the list of extractors.  Broken extractors should set _WORKING
    to False so users are warned and their tests are skipped.
    """

    # Class-level defaults; instances shadow them in __init__/set_downloader.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True if this IE can handle the given URL."""
        return bool(re.match(self._VALID_URL, url))

    def working(self):
        """Return whether this extractor is known to work (_WORKING)."""
        return self._WORKING

    def initialize(self):
        """Run one-time setup (authentication, etc.); subsequent calls no-op."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if necessary, then return the list of info dicts for url."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the FileDownloader this IE reports progress/errors to."""
        self._downloader = downloader

    def _real_initialize(self):
        """Actual initialization process; subclasses override as needed."""
        pass

    def _real_extract(self, url):
        """Actual extraction process; subclasses override."""
        pass

    @property
    def IE_NAME(self):
        # Derived from the class name by stripping the trailing "IE".
        return self.__class__.__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Open url_or_request, reporting progress, and return the response handle.

        Raises ExtractorError (with the original traceback) on network failure.
        """
        message = u'Downloading video webpage' if note is None else note
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, message))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as exc:
            message = u'Unable to download webpage' if errnote is None else errnote
            raise ExtractorError(u'%s: %s' % (message, compat_str(exc)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Fetch the page and return its body decoded as UTF-8 text."""
        handle = self._request_webpage(url_or_request, video_id, note, errnote)
        return handle.read().decode('utf-8', 'replace')
128
129
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HxW" dimensions (height first), used for --get-format output
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the base-class check
        # cannot be reused as-is.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption duration when none given
            start = float(start)
            end = start + float(dur)
            # SRT timestamps: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Download closed captions for video_id.

        Returns a (warning, srt) pair: on success warning is None and srt
        holds the .srt contents; on failure srt is None and warning holds
        the message to show the user.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)  # lang_code -> track name
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language preference: user choice, then English, then whatever exists
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if srt_lang not in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
        try:
            srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print each available format as "itag : extension [dimensions]"."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language and optionally log in / confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':     '/',
                'action_login': 'Log In',
                'username': username,
                'password': password,
                }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form reappearing in the response means the login failed
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the 11-character video ID extracted from url, or None."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # group(1) is the URL prefix, group(2) the video ID itself
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info; try several 'el' values until one yields a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except ValueError:
                    # This expression didn't match the page's date format;
                    # try the next one.  (Was a bare except, which also
                    # swallowed KeyboardInterrupt and SystemExit.)
                    pass
                else:
                    break  # parsed successfully; don't mangle the result

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # Map itag -> download URL.  Not every entry carries a 'sig'
            # field; previously this raised KeyError for such entries.
            url_map = {}
            for ud in url_data:
                fmt_url = ud['url'][0]
                if 'sig' in ud:
                    fmt_url += '&signature=' + ud['sig'][0]
                url_map[ud['itag'][0]] = fmt_url

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
547
548
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and disable the family filter."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube; if so, delegate to the YouTube IE
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            # Decode immediately: on Python 3 the str regexes below would
            # otherwise fail against a bytes object.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # All values below are already text (the page was decoded above);
        # the old .decode('utf-8') calls crashed on Python 3 str objects.
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
674
675
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract id, media URL, title, uploader and upload date for *url*."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The id is the path component up to the first '_' or '?'.
        video_id = m.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Request the page with the family filter disabled so that
        # age-restricted videos are served as well.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        m = re.search(r'\s*var flashvars = (.*)', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(m.group(1))

        # Pick the first available quality, scanning from best to worst.
        max_quality = None
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        if max_quality is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        m = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        # The URL is percent-encoded and uses escaped slashes.
        video_url = compat_urllib_parse.unquote(m.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        m = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(m.group('title'))

        # Try the regular owner markup first, then the "official user"
        # markup; warn (but continue) when neither is present.
        video_uploader = None
        owner = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner is not None:
            video_uploader = owner.group(1)
        else:
            official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official is not None:
                video_uploader = official.group(1)
            else:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # The page shows DD-MM-YYYY; convert to the YYYYMMDD convention.
        video_upload_date = None
        m = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if m is not None:
            video_upload_date = m.group(3) + m.group(2) + m.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
762
763
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com.

    Extracts the direct .flv media URL, title and uploader from a
    Photobucket page whose URL carries a ``current=<name>.flv`` query
    parameter.
    """

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # Decode immediately: on Python 3 urlopen().read() returns
            # bytes, and the str regexes below would raise TypeError.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # Regex groups of a text pattern are already text; calling
        # .decode() on them would fail on Python 3 (str has no decode).
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
827
828
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com.

    Non-/watch/ URLs are first rewritten to the extractable English
    /watch/ form, then the page and a playlist XML are scraped for the
    media URL and metadata.  Marked not working (_WORKING = False).
    """

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            # Recurse exactly once on the rewritten /watch/ URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # Group 1 is the '(people|profile)' alternation; the uploader
        # name is group 2.
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
970
971
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Parses the embedded config JSON of a video page and builds a
    play_redirect URL for the best available codec/quality.
    """

    # _VALID_URL matches Vimeo URLs (the subdomain dot was previously
    # unescaped, matching any character).
    _VALID_URL = r'(?:https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page's script block.
        # Catch only what the slicing/parsing can raise: IndexError when
        # the markers are absent, ValueError from json.loads (a bare
        # except here used to swallow even KeyboardInterrupt).
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Prefer HD, then SD, then whatever else was advertised.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1086
1087
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Scrapes videos.arte.tv pages by chasing a chain of intermediate
    URLs with regexes (see grep_webpage).  Live-stream pages are
    detected by _LIVE_URL but their extraction is not completed (see
    extractLiveStream).
    """

    # Matches French and German video pages on videos.arte.tv.
    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live pages end in e.g. 'index-123.html'.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its raw body, or None on failure."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and return a dict built from
        matchTuples, each a (group_index, key, error_message) triple.

        Returns None (after reporting the given error) when the page
        does not match or a required group is empty.
        """
        # NOTE(review): if fetch_webpage failed, page is None and
        # re.search will raise TypeError rather than report cleanly.
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream page to its player/rtmp information.

        NOTE(review): the computed video_url is discarded and nothing is
        returned — live-stream extraction appears unfinished; callers
        get no result for live URLs.
        """
        # Language code is the 4th-from-last path component of the URL.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # The JS file contains the SWF player and rtmp stream locations.
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an Arte+7 page through two intermediate XML documents
        and return the final info dict (HD quality URL)."""
        # Language code is the 3rd-from-last path component here.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # The videoref file lists one <video> element per language.
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # The per-language document carries id, title, date and the
        # quality-tagged media URLs; only "hd" is extracted.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live URLs take the (unfinished) live path and yield nothing.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1222
1223
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Follows URL-shortener redirects, then scrapes the page for an
    embedded video URL (JW Player flashvars or file=/source= params).
    """

    # Matches anything; this IE must be tried last.
    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so we only fetch headers.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Some servers emit unencoded spaces in Location.
                    newurl = newurl.replace(' ', '%20')
                    # Body-related headers don't apply to the new HEAD request.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # Re-issue as a plain (GET) request through the same opener.
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL after following redirects => not a shortener.
        if url == new_url:
            return False

        # Restart the extraction chain with the resolved URL.
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        # strip the extension off for the id, keep it for 'ext'
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1368
1369
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the 'ytsearch<N>:' prefix and download the requested results.

        The prefix may be empty (first result only), 'all' (up to
        _max_youtube_results) or a positive integer.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the FIRST colon only; search terms may contain ':' themselves
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            # Keep the try narrow: only the prefix parse may legitimately fail
            # with ValueError; do not swallow errors from the download itself.
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_youtube_results:
                self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            self._download_n_results(query, n)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never ask for more pages than the API reports results for
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1444
1445
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the 'gvsearch<N>:' prefix and download the requested results.

        The prefix may be empty (first result only), 'all' (up to
        _max_google_results) or a positive integer.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the FIRST colon only; search terms may contain ':' themselves
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            # Keep the try narrow: only the prefix parse may legitimately fail
            # with ValueError; do not swallow errors from the download itself.
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_google_results:
                self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            self._download_n_results(query, n)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers (deduplicated, in first-seen order)
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        break

            # Stop when we have the n requested ids or there is no next page;
            # in either case hand everything collected to the downloader.
            if len(video_ids) == n or re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for video_id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])
                return

            pagenum = pagenum + 1
1526
1527
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the 'yvsearch<N>:' prefix and download the requested results.

        The prefix may be empty (first result only), 'all' (up to
        _max_yahoo_results) or a positive integer.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the FIRST colon only; search terms may contain ':' themselves
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            # Keep the try narrow: only the prefix parse may legitimately fail
            # with ValueError; do not swallow errors from the download itself.
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_yahoo_results:
                self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers (deduplicated via already_seen)
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        break

            # Stop when we have the n requested ids or there is no next page;
            # in either case hand everything collected to the downloader.
            if len(video_ids) == n or re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for video_id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % video_id])
                return

            pagenum = pagenum + 1
1612
1613
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Queue every video of the playlist for download."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # A trailing video id means the URL refers to one video, not a list
        single_video = mobj.group(3)
        if single_video is not None:
            self._downloader.download([single_video])
            return

        # 'a' (artist) playlists use a dedicated endpoint; every other kind
        # is fetched through the regular view_play_list page with prefix 'p'
        if mobj.group(1) == 'a':
            playlist_prefix, playlist_access = 'a', 'artist'
        else:
            playlist_prefix, playlist_access = 'p', 'view_play_list'
        playlist_id = mobj.group(2)

        video_ids = []
        pagenum = 0
        more_pages = True

        while more_pages:
            pagenum += 1
            self.report_download_page(playlist_id, pagenum)
            page_url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect this page's video ids, first occurrence wins
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)
            video_ids.extend(page_ids)

            more_pages = self._MORE_PAGES_INDICATOR in page

        total = len(video_ids)

        # Honor --playlist-start / --playlist-end (1-based start, -1 = no end)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        video_ids = video_ids[playliststart:] if playlistend == -1 else video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for vid in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % vid])
        return
1692
1693
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Queue every video of the channel for download."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 0

        # Walk the paginated video listing until the "Next" link disappears
        while True:
            pagenum += 1
            self.report_download_page(channel_id, pagenum)
            page_url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect this page's video ids, first occurrence wins
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for vid in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % vid])
        return
1744
1745
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Queue all uploads of the given user for download."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE entries, so we
        # fetch consecutive pages until one comes back short of a full page.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect this page's video ids, first occurrence wins
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)
            video_ids.extend(page_ids)

            # A page holding fewer than _GDATA_PAGE_SIZE ids must be the last
            # one, so there is no point in querying further.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Honor --playlist-start / --playlist-end (1-based start, -1 = no end)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        video_ids = video_ids[playliststart:] if playlistend == -1 else video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1827
1828
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Queue all videos of a blip.tv user for download."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # The numeric user id needed by the Ajax API is embedded in the page.
        # Check the match explicitly: using it unchecked used to crash with an
        # AttributeError that the network-error handler above does not catch.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract user id from webpage')
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # compat_str(err) for consistency with the rest of the file
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers. Unescape BEFORE the membership test so
            # deduplication compares like with like (the old code checked the
            # raw href against the already-unescaped list).
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Honor --playlist-start / --playlist-end (1-based start, -1 = no end)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1919
1920
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct file URL and title from a depositfiles page."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # raw string: '\s' must reach the regex engine, not be treated
                # as a (now-invalid) string escape sequence
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
1979
1980
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in with credentials from --username/--password or .netrc (optional)."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Anonymous extraction is allowed; only log in when we have credentials
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means authentication failed
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract title, stream URL and metadata of a Facebook video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The flashvars blob sits between these two exact script fragments
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD stream but fall back to SD instead of crashing with a
        # bare KeyError when the video has no HD source
        video_url = params.get('hd_src') or params.get('sd_src')
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
2072
2073
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    # Group 1 captures the URL path; it is used as the display id for reporting.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the (lowercase alphanumeric) extension at the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL pointed directly at a media file."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video info either from a direct media URL (Content-Type
        video/*) or from blip.tv's JSON API (the same URL with skin=json)."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON API parameters, respecting an existing query string.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # iTunes user agent; presumably required to get this response — the
        # reason is not evident from this file.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): under Python 3 `title` is already str and has
                # no .decode() — confirm this path only runs on Python 2.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Pass the already-open response to the downloader so the
                    # body is not fetched twice.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # `urlh` is the response opened above; if urlopen failed we
                # never reach this point (ExtractorError was raised).
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The API sometimes wraps the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # NOTE(review): '%H' (24-hour) combined with '%p' (AM/PM) —
                # '%I' may have been intended; confirm against live datestamps.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        return [info]
2163
2164
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv media URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed: was `self._download.trouble`, which raised AttributeError
            # instead of reporting the invalid URL.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link reveals the media server path; the flv lives
        # next to it under <path>/<video_id>.flv.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2213
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known format ids (bitrates).
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Format id -> container extension (shown by _print_formats).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Format id -> frame size (informational, shown by _print_formats).
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a multi-line pattern, so re.VERBOSE is mandatory here.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report that extraction started for the given episode."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report download of the per-part configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the show index feed."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print each available format id with its extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Resolve a show/clip URL into one info dict per video part.

        Stages: (1) expand shortcuts and follow redirects to a concrete
        episode URL, (2) locate the mtvnservices uri in the page, (3) fetch
        the mrss index listing the parts, (4) fetch each part's config to
        choose a bitrate, (5) rewrite the rtmp URL onto an HTTP mirror.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('shortname'):
            # ":tds"-style shortcuts expand to the show's full-episodes page.
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty episode component means "newest episode"; the server
            # redirect (handled below) resolves it to a concrete one.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Re-parse the post-redirect URL to recover the episode component.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a data-mgid
            # attribute without a URL prefix; so extract the alternate
            # reference and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The mrss index enumerates the individual video parts for this uri.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # guid looks like '...:<show>.com:<id>'; split out both pieces.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # NOTE(review): this aborts ALL parts on a single failure and
                # discards results collected so far; 'continue' may have been
                # intended (compare the 'No videos found' branch below).
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # rtmp URLs are not directly downloadable; map the media path onto
            # the known HTTP mirror host instead.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2407
2408
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Announce the start of information extraction for a show."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Announce the download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Fetch the video page, follow its og:video player to the config
        blob, and return the video info from the config playlist."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            page_handle = compat_urllib_request.urlopen(url)
            page_bytes = page_handle.read()
            # Honour the charset declared in the Content-Type header, if any.
            charset_m = re.match(r'text/html; charset="?([^"]+)"?', page_handle.headers['Content-Type'])
            page = page_bytes.decode(charset_m.group(1) if charset_m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Description, thumbnail and player URL all live in <meta> tags.
        description = unescapeHTML(re.search('<meta name="description" content="([^"]*)"', page).group(1))
        imgUrl = unescapeHTML(re.search('<meta property="og:image" content="([^"]*)"', page).group(1))
        playerUrl = unescapeHTML(re.search('<meta property="og:video" content="([^"]*)"', page).group(1))
        # The player URL carries the config location in its query string.
        configUrl = compat_urllib_parse.unquote(re.search('config=(.*)$', playerUrl).group(1))

        self.report_config_download(showName)
        try:
            config_handle = compat_urllib_request.urlopen(configUrl)
            charset_m = re.match(r'text/html; charset="?([^"]+)"?', config_handle.headers['Content-Type'])
            configJSON = config_handle.read().decode(charset_m.group(1) if charset_m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        videoUrl = config['playlist'][1]['url']

        return [{
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }]
2482
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # _WORKING = False presumably disables this IE by default — see the
    # InfoExtractor base class for how the flag is consumed.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Two-stage extraction: a metadata XML document yields title,
        description, thumbnail and a manifest URL; the f4m manifest then
        supplies the pieces of the final fragment URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # The info dict is filled in incrementally across both stages.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        # hdcore parameter: presumably required by the manifest server — TODO
        # confirm; not evident from this file.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            # NOTE: this rebinds video_id from the manifest's <id> element;
            # info['id'] (set above from the URL) is deliberately unchanged —
            # only the fragment URL below uses the new value.
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Build the fragment URL on the same host as the manifest.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2553
2554
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Announce that extraction has started for the given video id."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the flv URL, title and thumbnail from an xvideos page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)


        # The media URL is percent-encoded inside a flv_url query parameter.
        match = re.search(r'flv_url=(.+?)&', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))


        # Title comes from the page <title>, stripped of the site suffix.
        match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)


        # The thumbnail is the whole matched image URL (group 0).
        match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2612
2613
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report that the page URL is being resolved to a track id."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that the stream definitions are being retrieved."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a SoundCloud page URL via the API and return track info.

        Two API calls are made: resolve.json turns the page URL into track
        metadata (including the numeric id), and the i1/tracks/<id>/streams
        endpoint yields the actual media URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract slug of song title (also in the url)
        # (removed dead local `simple_title`, which was computed but never used)
        slug_title = mobj.group(2)

        self.report_resolve('%s/%s' % (uploader, slug_title))

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2686
2687
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Announce that information extraction has started."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the rtmpe URL, title and description from an InfoQ page."""
        if re.match(self._VALID_URL, url) is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The stream id is stored base64-encoded in the page source.
        match = re.search(r"jsclassref='([^']*)'", webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(match.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        match = re.search(r'contentTitle = "(.*?)";', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        # Fall back to a placeholder when no meta description is present.
        video_description = u'No description available.'
        match = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if match is not None:
            video_description = match.group(1)

        # Derive id and extension from the final path component of the URL.
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2741
2742 class MixcloudIE(InfoExtractor):
2743     """Information extractor for www.mixcloud.com"""
2744
2745     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2746     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2747     IE_NAME = u'mixcloud'
2748
    def __init__(self, downloader=None):
        # Nothing Mixcloud-specific; defer to the base InfoExtractor.
        InfoExtractor.__init__(self, downloader)
2751
    def report_download_json(self, file_id):
        """Report JSON download. (file_id is accepted but currently unused.)"""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2755
2756     def report_extraction(self, file_id):
2757         """Report information extraction."""
2758         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2759
2760     def get_urls(self, jsonData, fmt, bitrate='best'):
2761         """Get urls from 'audio_formats' section in json"""
2762         file_url = None
2763         try:
2764             bitrate_list = jsonData[fmt]
2765             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2766                 bitrate = max(bitrate_list) # select highest
2767
2768             url_list = jsonData[fmt][bitrate]
2769         except TypeError: # we have no bitrate info.
2770             url_list = jsonData[fmt]
2771         return url_list
2772
2773     def check_urls(self, url_list):
2774         """Returns 1st active url from list"""
2775         for url in url_list:
2776             try:
2777                 compat_urllib_request.urlopen(url)
2778                 return url
2779             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2780                 url = None
2781
2782         return None
2783
2784     def _print_formats(self, formats):
2785         print('Available formats:')
2786         for fmt in formats.keys():
2787             for b in formats[fmt]:
2788                 try:
2789                     ext = formats[fmt][b][0]
2790                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2791                 except TypeError: # we have no bitrate info
2792                     ext = formats[fmt][0]
2793                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2794                     break
2795
2796     def _real_extract(self, url):
2797         mobj = re.match(self._VALID_URL, url)
2798         if mobj is None:
2799             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2800             return
2801         # extract uploader & filename from url
2802         uploader = mobj.group(1).decode('utf-8')
2803         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2804
2805         # construct API request
2806         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2807         # retrieve .json file with links to files
2808         request = compat_urllib_request.Request(file_url)
2809         try:
2810             self.report_download_json(file_url)
2811             jsonData = compat_urllib_request.urlopen(request).read()
2812         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2813             self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2814             return
2815
2816         # parse JSON
2817         json_data = json.loads(jsonData)
2818         player_url = json_data['player_swf_url']
2819         formats = dict(json_data['audio_formats'])
2820
2821         req_format = self._downloader.params.get('format', None)
2822         bitrate = None
2823
2824         if self._downloader.params.get('listformats', None):
2825             self._print_formats(formats)
2826             return
2827
2828         if req_format is None or req_format == 'best':
2829             for format_param in formats.keys():
2830                 url_list = self.get_urls(formats, format_param)
2831                 # check urls
2832                 file_url = self.check_urls(url_list)
2833                 if file_url is not None:
2834                     break # got it!
2835         else:
2836             if req_format not in formats:
2837                 self._downloader.trouble(u'ERROR: format is not available')
2838                 return
2839
2840             url_list = self.get_urls(formats, req_format)
2841             file_url = self.check_urls(url_list)
2842             format_param = req_format
2843
2844         return [{
2845             'id': file_id.decode('utf-8'),
2846             'url': file_url.decode('utf-8'),
2847             'uploader': uploader.decode('utf-8'),
2848             'upload_date': None,
2849             'title': json_data['name'],
2850             'ext': file_url.split('.')[-1].decode('utf-8'),
2851             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2852             'thumbnail': json_data['thumbnail_url'],
2853             'description': json_data['description'],
2854             'player_url': player_url.decode('utf-8'),
2855         }]
2856
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # The named groups decide which branch of _real_extract runs:
    # 'course' + 'video' -> one video; 'course' alone -> a course playlist;
    # neither -> the site root (playlist of all courses).
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract a single video, or expand a course/root page by
        recursively extracting every page it links to."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            # Per-video metadata lives in a sibling .xml file.
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            # Extension is whatever follows the last dot of the video file name.
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                # Fall back to the course id when no <h1> title is found.
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect each linked VideoPage and recurse into it via self.extract.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Collect each linked CoursePage and recurse; each course in turn
            # recurses into its videos (course-page branch above).
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
2968
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # webpage is already decoded text (see e.g. NBAIE which searches it
        # directly); the old .decode('iso-8859-1') calls raised
        # AttributeError on Python 3 and mis-handled non-ASCII on Python 2.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # Message fixed: previously read "unable to mtvn_uri".
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        # mediaGen endpoint returns an XML document listing renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            # MIME type like 'video/mp4' -> ext 'mp4'.
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3048
3049
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Build a pseudo-unique session id: millisecond timestamp plus
        two random 4-digit numbers, concatenated as a string."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Deterministically shuffle the alphabet below using a linear
        congruential generator keyed by the site-provided seed; returns the
        shuffled characters as a list."""
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # LCG step; constants must match the site's player exactly.
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated fileId: each numeric token indexes into
        the seed-shuffled alphabet from _get_file_ID_mix_string."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # JSON playlist endpoint for this video id.
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the user's format request onto the site's stream names.
            # NOTE(review): any explicit format other than 'best'/'worst'
            # falls into the final else and is coerced to 'flv' — looks
            # intentional as a fallback, but verify against callers.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            # One decryption key per video segment.
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3159
3160
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Announce that the page for video_id is being downloaded."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Announce that information for video_id is being extracted."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Return a single-entry info list for an xnxx.com video URL."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        # Fetch the page and decode it as UTF-8 text.
        try:
            page_bytes = compat_urllib_request.urlopen(url).read()
            webpage = page_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # Video URL is percent-encoded inside the player parameters.
        url_match = re.search(self.VIDEO_URL_RE, webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        title_match = re.search(self.VIDEO_TITLE_RE, webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        thumb_match = re.search(self.VIDEO_THUMB_RE, webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3223
3224
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # Bail out; previously execution fell through and crashed on
            # mobj.group(1) below.
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # Bail out; previously execution fell through and crashed on
            # links[-1] below.
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3348
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL is derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # First unescaped group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Key fixed: was misspelled 'uploader_date', which no consumer
            # of the info dict recognizes (the documented field is upload_date).
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3384
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page and return (item_count, info_list).

        On failure the error is reported and (0, []) is returned, so the
        caller's tuple unpacking keeps working when the downloader is set
        to ignore errors (previously this returned None and crashed).
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            return (0, [])

        response = json.loads(webpage)
        if type(response) != list:
            # API errors come back as a dict with an 'error' key.
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins 'YYYY-MM-DD'; strip dashes -> YYYYMMDD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: page through the channel's whole archive.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short (or errored) page means the archive is exhausted.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3471
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: unable to find video information')
            # Bail out; previously execution fell through and crashed on
            # m.group('url') below.
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            self._downloader.trouble(u'Cannot find video title')
            # Bail out; previously fell through to m.group('title') on None.
            return
        title = unescapeHTML(m.group('title'))

        # Description is optional.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3508
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find status ID')
            # Bug fix: previously fell through to m.group() on None,
            # raising AttributeError instead of failing cleanly.
            return
        status_id = m.group(1)

        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'WARNING: Cannot find description')
            # The tweet text also serves as the title below, so we cannot
            # continue without it (previously this crashed on m.group()).
            return
        desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find uploader')
            return
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find upload date')
            return
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        title = desc
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mov',
            'title': title,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }
        return [info]
3557         
class SteamIE(InfoExtractor):
    """Information extractor for trailers on the Steam store."""
    _VALID_URL = r"""http://store.steampowered.com/ 
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the re.VERBOSE flag.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        # Always extract from the game's video listing page.
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        videos = []
        for vid,vtitle in zip(mweb,titles):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            if not video_url:
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
                # Bug fix: previously an info dict with an empty URL was still
                # appended; skip this entry instead.
                continue
            info = {
                'id':video_id,
                'url':video_url,
                'ext': 'flv',
                'title': unescapeHTML(title)
                  }
            videos.append(info)
        return videos
3593
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The flv is served from a fixed CDN location derived from the id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id

        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3615
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        # The show metadata is embedded as JSON in an inline script tag.
        json_m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if json_m is None:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(json_m.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        # Derive the extension from the URL path.
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3650
3651
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry in formats whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Pre-set the age-verification cookie so the page is served in full.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'videoTitleArea">(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (non-fatal if absent)
        result = re.search(r'Date:</b>(?P<date>.*)</li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'WARNING: unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader (non-fatal if absent)
        result = re.search(r'Submitted:</b>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'ERROR: unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path segment looks like "480p_370k_...": size then bitrate.
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Bug fix: this previously tested the stale 'result' variable from
            # the regex searches above, so an unavailable requested format
            # silently returned [None] instead of reporting an error.
            if format is None:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            return [format]
3768
3769         
3770
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Both the id and the title are carried in the URL itself.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        webpage = self._download_webpage(url, video_id)

        # Locate the flv URL in the embedded player configuration.
        url_match = re.search(r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",', webpage)
        if url_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group('url'))

        # Extract the upload date.
        date_match = re.search(r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by', webpage)
        if date_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        upload_date = date_match.group('date')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
        }]
3812
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        webpage = self._download_webpage(url, video_id)

        # The title comes straight from the page's <title> tag.
        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # The actual stream lives on a separate embed page.
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The stream URL is passed to the flash player via addVariable.
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{
            'id': video_id,
            'url': source_match.group('source'),
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
3858
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JSON argument to the TRAX.Mix widget.
        trax_match = re.search(r"new TRAX.Mix\((.*?)\);\n*\s*TRAX.initSearchAutocomplete\('#search'\);", webpage, flags=re.DOTALL)
        if not trax_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(trax_match.group(1))

        # The play API hands out one track at a time per session.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        entries = []
        track_index = 0
        while True:
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_index+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
            track_index += 1
        return entries
3902
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # NOTE: keep the most specific extractors first; GenericIE is last so the
    # named extractors above it get the first chance to claim a URL.
    return [
        YoutubePlaylistIE(),
        YoutubeChannelIE(),
        YoutubeUserIE(),
        YoutubeSearchIE(),
        YoutubeIE(),
        MetacafeIE(),
        DailymotionIE(),
        GoogleSearchIE(),
        PhotobucketIE(),
        YahooIE(),
        YahooSearchIE(),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVUserIE(),
        BlipTVIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),
        YoukuIE(),
        XNXXIE(),
        YouJizzIE(),
        PornotubeIE(),
        YouPornIE(),
        GooglePlusIE(),
        ArteTvIE(),
        NBAIE(),
        JustinTVIE(),
        FunnyOrDieIE(),
        TweetReelIE(),
        SteamIE(),
        UstreamIE(),
        RBMARadioIE(),
        EightTracksIE(),
        GenericIE()
    ]
3951
3952