# Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 import base64
   2 import datetime
   3 import itertools
   4 import netrc
   5 import os
   6 import re
   7 import socket
   8 import time
   9 import email.utils
  10 import xml.etree.ElementTree
  11 import random
  12 import math
  13 import operator
  14 import hashlib
  15 import binascii
  16 import urllib
  17
  18 from .utils import *
  19 from .extractor.common import InfoExtractor, SearchInfoExtractor
  20
  21 from .extractor.ard import ARDIE
  22 from .extractor.arte import ArteTvIE
  23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
  24 from .extractor.comedycentral import ComedyCentralIE
  25 from .extractor.collegehumor import CollegeHumorIE
  26 from .extractor.dailymotion import DailymotionIE
  27 from .extractor.depositfiles import DepositFilesIE
  28 from .extractor.escapist import EscapistIE
  29 from .extractor.facebook import FacebookIE
  30 from .extractor.funnyordie import FunnyOrDieIE
  31 from .extractor.gametrailers import GametrailersIE
  32 from .extractor.generic import GenericIE
  33 from .extractor.googleplus import GooglePlusIE
  34 from .extractor.googlesearch import GoogleSearchIE
  35 from .extractor.infoq import InfoQIE
  36 from .extractor.justintv import JustinTVIE
  37 from .extractor.metacafe import MetacafeIE
  38 from .extractor.mixcloud import MixcloudIE
  39 from .extractor.mtv import MTVIE
  40 from .extractor.myvideo import MyVideoIE
  41 from .extractor.nba import NBAIE
  42 from .extractor.statigram import StatigramIE
  43 from .extractor.photobucket import PhotobucketIE
  44 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
  45 from .extractor.stanfordoc import StanfordOpenClassroomIE
  46 from .extractor.steam import SteamIE
  47 from .extractor.ted import TEDIE
  48 from .extractor.vimeo import VimeoIE
  49 from .extractor.worldstarhiphop import WorldStarHipHopIE
  50 from .extractor.xnxx import XNXXIE
  51 from .extractor.xvideos import XVideosIE
  52 from .extractor.yahoo import YahooIE, YahooSearchIE
  53 from .extractor.youku import YoukuIE
  54 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
  55 from .extractor.zdf import ZDFIE
  56
  57
  58
  59
  60
  61
  62
  63
  64
  65
  66
  67
  68
  69
  70
  71
  72 class UstreamIE(InfoExtractor):
  73     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
  74     IE_NAME = u'ustream'
  75
  76     def _real_extract(self, url):
  77         m = re.match(self._VALID_URL, url)
  78         video_id = m.group('videoID')
  79
  80         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
  81         webpage = self._download_webpage(url, video_id)
  82
  83         self.report_extraction(video_id)
  84
  85         video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
  86             webpage, u'title')
  87
  88         uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
  89             webpage, u'uploader', fatal=False, flags=re.DOTALL)
  90
  91         thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
  92             webpage, u'thumbnail', fatal=False)
  93
  94         info = {
  95                 'id': video_id,
  96                 'url': video_url,
  97                 'ext': 'flv',
  98                 'title': video_title,
  99                 'uploader': uploader,
 100                 'thumbnail': thumbnail,
 101                }
 102         return info
 103
 104
 105 class RBMARadioIE(InfoExtractor):
 106     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
 107
 108     def _real_extract(self, url):
 109         m = re.match(self._VALID_URL, url)
 110         video_id = m.group('videoID')
 111
 112         webpage = self._download_webpage(url, video_id)
 113
 114         json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
 115             webpage, u'json data', flags=re.MULTILINE)
 116
 117         try:
 118             data = json.loads(json_data)
 119         except ValueError as e:
 120             raise ExtractorError(u'Invalid JSON: ' + str(e))
 121
 122         video_url = data['akamai_url'] + '&cbr=256'
 123         url_parts = compat_urllib_parse_urlparse(video_url)
 124         video_ext = url_parts.path.rpartition('.')[2]
 125         info = {
 126                 'id': video_id,
 127                 'url': video_url,
 128                 'ext': video_ext,
 129                 'title': data['title'],
 130                 'description': data.get('teaser_text'),
 131                 'location': data.get('country_of_origin'),
 132                 'uploader': data.get('host', {}).get('name'),
 133                 'uploader_id': data.get('host', {}).get('slug'),
 134                 'thumbnail': data.get('image', {}).get('large_url_2x'),
 135                 'duration': data.get('duration'),
 136         }
 137         return [info]
 138
 139
 140 class YouPornIE(InfoExtractor):
 141     """Information extractor for youporn.com."""
 142     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
 143
 144     def _print_formats(self, formats):
 145         """Print all available formats"""
 146         print(u'Available formats:')
 147         print(u'ext\t\tformat')
 148         print(u'---------------------------------')
 149         for format in formats:
 150             print(u'%s\t\t%s'  % (format['ext'], format['format']))
 151
 152     def _specific(self, req_format, formats):
 153         for x in formats:
 154             if(x["format"]==req_format):
 155                 return x
 156         return None
 157
 158     def _real_extract(self, url):
 159         mobj = re.match(self._VALID_URL, url)
 160         if mobj is None:
 161             raise ExtractorError(u'Invalid URL: %s' % url)
 162         video_id = mobj.group('videoid')
 163
 164         req = compat_urllib_request.Request(url)
 165         req.add_header('Cookie', 'age_verified=1')
 166         webpage = self._download_webpage(req, video_id)
 167
 168         # Get JSON parameters
 169         json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
 170         try:
 171             params = json.loads(json_params)
 172         except:
 173             raise ExtractorError(u'Invalid JSON')
 174
 175         self.report_extraction(video_id)
 176         try:
 177             video_title = params['title']
 178             upload_date = unified_strdate(params['release_date_f'])
 179             video_description = params['description']
 180             video_uploader = params['submitted_by']
 181             thumbnail = params['thumbnails'][0]['image']
 182         except KeyError:
 183             raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
 184
 185         # Get all of the formats available
 186         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
 187         download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
 188             webpage, u'download list').strip()
 189
 190         # Get all of the links from the page
 191         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
 192         links = re.findall(LINK_RE, download_list_html)
 193         if(len(links) == 0):
 194             raise ExtractorError(u'ERROR: no known formats available for video')
 195
 196         self.to_screen(u'Links found: %d' % len(links))
 197
 198         formats = []
 199         for link in links:
 200
 201             # A link looks like this:
 202             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
 203             # A path looks like this:
 204             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
 205             video_url = unescapeHTML( link )
 206             path = compat_urllib_parse_urlparse( video_url ).path
 207             extension = os.path.splitext( path )[1][1:]
 208             format = path.split('/')[4].split('_')[:2]
 209             size = format[0]
 210             bitrate = format[1]
 211             format = "-".join( format )
 212             # title = u'%s-%s-%s' % (video_title, size, bitrate)
 213
 214             formats.append({
 215                 'id': video_id,
 216                 'url': video_url,
 217                 'uploader': video_uploader,
 218                 'upload_date': upload_date,
 219                 'title': video_title,
 220                 'ext': extension,
 221                 'format': format,
 222                 'thumbnail': thumbnail,
 223                 'description': video_description
 224             })
 225
 226         if self._downloader.params.get('listformats', None):
 227             self._print_formats(formats)
 228             return
 229
 230         req_format = self._downloader.params.get('format', None)
 231         self.to_screen(u'Format: %s' % req_format)
 232
 233         if req_format is None or req_format == 'best':
 234             return [formats[0]]
 235         elif req_format == 'worst':
 236             return [formats[-1]]
 237         elif req_format in ('-1', 'all'):
 238             return formats
 239         else:
 240             format = self._specific( req_format, formats )
 241             if result is None:
 242                 raise ExtractorError(u'Requested format not available')
 243             return [format]
 244
 245
 246
 247 class PornotubeIE(InfoExtractor):
 248     """Information extractor for pornotube.com."""
 249     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
 250
 251     def _real_extract(self, url):
 252         mobj = re.match(self._VALID_URL, url)
 253         if mobj is None:
 254             raise ExtractorError(u'Invalid URL: %s' % url)
 255
 256         video_id = mobj.group('videoid')
 257         video_title = mobj.group('title')
 258
 259         # Get webpage content
 260         webpage = self._download_webpage(url, video_id)
 261
 262         # Get the video URL
 263         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
 264         video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
 265         video_url = compat_urllib_parse.unquote(video_url)
 266
 267         #Get the uploaded date
 268         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
 269         upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
 270         if upload_date: upload_date = unified_strdate(upload_date)
 271
 272         info = {'id': video_id,
 273                 'url': video_url,
 274                 'uploader': None,
 275                 'upload_date': upload_date,
 276                 'title': video_title,
 277                 'ext': 'flv',
 278                 'format': 'flv'}
 279
 280         return [info]
 281
 282 class YouJizzIE(InfoExtractor):
 283     """Information extractor for youjizz.com."""
 284     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
 285
 286     def _real_extract(self, url):
 287         mobj = re.match(self._VALID_URL, url)
 288         if mobj is None:
 289             raise ExtractorError(u'Invalid URL: %s' % url)
 290
 291         video_id = mobj.group('videoid')
 292
 293         # Get webpage content
 294         webpage = self._download_webpage(url, video_id)
 295
 296         # Get the video title
 297         video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
 298             webpage, u'title').strip()
 299
 300         # Get the embed page
 301         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
 302         if result is None:
 303             raise ExtractorError(u'ERROR: unable to extract embed page')
 304
 305         embed_page_url = result.group(0).strip()
 306         video_id = result.group('videoid')
 307
 308         webpage = self._download_webpage(embed_page_url, video_id)
 309
 310         # Get the video URL
 311         video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
 312             webpage, u'video URL')
 313
 314         info = {'id': video_id,
 315                 'url': video_url,
 316                 'title': video_title,
 317                 'ext': 'flv',
 318                 'format': 'flv',
 319                 'player_url': embed_page_url}
 320
 321         return [info]
 322
 323 class EightTracksIE(InfoExtractor):
 324     IE_NAME = '8tracks'
 325     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
 326
 327     def _real_extract(self, url):
 328         mobj = re.match(self._VALID_URL, url)
 329         if mobj is None:
 330             raise ExtractorError(u'Invalid URL: %s' % url)
 331         playlist_id = mobj.group('id')
 332
 333         webpage = self._download_webpage(url, playlist_id)
 334
 335         json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
 336         data = json.loads(json_like)
 337
 338         session = str(random.randint(0, 1000000000))
 339         mix_id = data['id']
 340         track_count = data['tracks_count']
 341         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
 342         next_url = first_url
 343         res = []
 344         for i in itertools.count():
 345             api_json = self._download_webpage(next_url, playlist_id,
 346                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
 347                 errnote=u'Failed to download song information')
 348             api_data = json.loads(api_json)
 349             track_data = api_data[u'set']['track']
 350             info = {
 351                 'id': track_data['id'],
 352                 'url': track_data['track_file_stream_url'],
 353                 'title': track_data['performer'] + u' - ' + track_data['name'],
 354                 'raw_title': track_data['name'],
 355                 'uploader_id': data['user']['login'],
 356                 'ext': 'm4a',
 357             }
 358             res.append(info)
 359             if api_data['set']['at_last_track']:
 360                 break
 361             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
 362         return res
 363
 364 class KeekIE(InfoExtractor):
 365     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
 366     IE_NAME = u'keek'
 367
 368     def _real_extract(self, url):
 369         m = re.match(self._VALID_URL, url)
 370         video_id = m.group('videoID')
 371
 372         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
 373         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
 374         webpage = self._download_webpage(url, video_id)
 375
 376         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
 377             webpage, u'title')
 378
 379         uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
 380             webpage, u'uploader', fatal=False)
 381
 382         info = {
 383                 'id': video_id,
 384                 'url': video_url,
 385                 'ext': 'mp4',
 386                 'title': video_title,
 387                 'thumbnail': thumbnail,
 388                 'uploader': uploader
 389         }
 390         return [info]
 391
 392
 393 class MySpassIE(InfoExtractor):
 394     _VALID_URL = r'http://www.myspass.de/.*'
 395
 396     def _real_extract(self, url):
 397         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
 398
 399         # video id is the last path element of the URL
 400         # usually there is a trailing slash, so also try the second but last
 401         url_path = compat_urllib_parse_urlparse(url).path
 402         url_parent_path, video_id = os.path.split(url_path)
 403         if not video_id:
 404             _, video_id = os.path.split(url_parent_path)
 405
 406         # get metadata
 407         metadata_url = META_DATA_URL_TEMPLATE % video_id
 408         metadata_text = self._download_webpage(metadata_url, video_id)
 409         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
 410
 411         # extract values from metadata
 412         url_flv_el = metadata.find('url_flv')
 413         if url_flv_el is None:
 414             raise ExtractorError(u'Unable to extract download url')
 415         video_url = url_flv_el.text
 416         extension = os.path.splitext(video_url)[1][1:]
 417         title_el = metadata.find('title')
 418         if title_el is None:
 419             raise ExtractorError(u'Unable to extract title')
 420         title = title_el.text
 421         format_id_el = metadata.find('format_id')
 422         if format_id_el is None:
 423             format = ext
 424         else:
 425             format = format_id_el.text
 426         description_el = metadata.find('description')
 427         if description_el is not None:
 428             description = description_el.text
 429         else:
 430             description = None
 431         imagePreview_el = metadata.find('imagePreview')
 432         if imagePreview_el is not None:
 433             thumbnail = imagePreview_el.text
 434         else:
 435             thumbnail = None
 436         info = {
 437             'id': video_id,
 438             'url': video_url,
 439             'title': title,
 440             'ext': extension,
 441             'format': format,
 442             'thumbnail': thumbnail,
 443             'description': description
 444         }
 445         return [info]
 446
 447 class SpiegelIE(InfoExtractor):
 448     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
 449
 450     def _real_extract(self, url):
 451         m = re.match(self._VALID_URL, url)
 452         video_id = m.group('videoID')
 453
 454         webpage = self._download_webpage(url, video_id)
 455
 456         video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
 457             webpage, u'title')
 458
 459         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
 460         xml_code = self._download_webpage(xml_url, video_id,
 461                     note=u'Downloading XML', errnote=u'Failed to download XML')
 462
 463         idoc = xml.etree.ElementTree.fromstring(xml_code)
 464         last_type = idoc[-1]
 465         filename = last_type.findall('./filename')[0].text
 466         duration = float(last_type.findall('./duration')[0].text)
 467
 468         video_url = 'http://video2.spiegel.de/flash/' + filename
 469         video_ext = filename.rpartition('.')[2]
 470         info = {
 471             'id': video_id,
 472             'url': video_url,
 473             'ext': video_ext,
 474             'title': video_title,
 475             'duration': duration,
 476         }
 477         return [info]
 478
 479 class LiveLeakIE(InfoExtractor):
 480
 481     _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
 482     IE_NAME = u'liveleak'
 483
 484     def _real_extract(self, url):
 485         mobj = re.match(self._VALID_URL, url)
 486         if mobj is None:
 487             raise ExtractorError(u'Invalid URL: %s' % url)
 488
 489         video_id = mobj.group('video_id')
 490
 491         webpage = self._download_webpage(url, video_id)
 492
 493         video_url = self._search_regex(r'file: "(.*?)",',
 494             webpage, u'video URL')
 495
 496         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
 497             webpage, u'title').replace('LiveLeak.com -', '').strip()
 498
 499         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
 500             webpage, u'description', fatal=False)
 501
 502         video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
 503             webpage, u'uploader', fatal=False)
 504
 505         info = {
 506             'id':  video_id,
 507             'url': video_url,
 508             'ext': 'mp4',
 509             'title': video_title,
 510             'description': video_description,
 511             'uploader': video_uploader
 512         }
 513
 514         return [info]
 515
 516
 517
 518 class TumblrIE(InfoExtractor):
 519     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
 520
 521     def _real_extract(self, url):
 522         m_url = re.match(self._VALID_URL, url)
 523         video_id = m_url.group('id')
 524         blog = m_url.group('blog_name')
 525
 526         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
 527         webpage = self._download_webpage(url, video_id)
 528
 529         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
 530         video = re.search(re_video, webpage)
 531         if video is None:
 532            raise ExtractorError(u'Unable to extract video')
 533         video_url = video.group('video_url')
 534         ext = video.group('ext')
 535
 536         video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
 537             webpage, u'thumbnail', fatal=False)  # We pick the first poster
 538         if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
 539
 540         # The only place where you can get a title, it's not complete,
 541         # but searching in other places doesn't work for all videos
 542         video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
 543             webpage, u'title', flags=re.DOTALL)
 544
 545         return [{'id': video_id,
 546                  'url': video_url,
 547                  'title': video_title,
 548                  'thumbnail': video_thumbnail,
 549                  'ext': ext
 550                  }]
 551
 552 class BandcampIE(InfoExtractor):
 553     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
 554
 555     def _real_extract(self, url):
 556         mobj = re.match(self._VALID_URL, url)
 557         title = mobj.group('title')
 558         webpage = self._download_webpage(url, title)
 559         # We get the link to the free download page
 560         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
 561         if m_download is None:
 562             raise ExtractorError(u'No free songs found')
 563
 564         download_link = m_download.group(1)
 565         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
 566                        webpage, re.MULTILINE|re.DOTALL).group('id')
 567
 568         download_webpage = self._download_webpage(download_link, id,
 569                                                   'Downloading free downloads page')
 570         # We get the dictionary of the track from some javascrip code
 571         info = re.search(r'items: (.*?),$',
 572                          download_webpage, re.MULTILINE).group(1)
 573         info = json.loads(info)[0]
 574         # We pick mp3-320 for now, until format selection can be easily implemented.
 575         mp3_info = info[u'downloads'][u'mp3-320']
 576         # If we try to use this url it says the link has expired
 577         initial_url = mp3_info[u'url']
 578         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
 579         m_url = re.match(re_url, initial_url)
 580         #We build the url we will use to get the final track url
 581         # This url is build in Bandcamp in the script download_bunde_*.js
 582         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
 583         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
 584         # If we could correctly generate the .rand field the url would be
 585         #in the "download_url" key
 586         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
 587
 588         track_info = {'id':id,
 589                       'title' : info[u'title'],
 590                       'ext' :   'mp3',
 591                       'url' :   final_url,
 592                       'thumbnail' : info[u'thumb_url'],
 593                       'uploader' :  info[u'artist']
 594                       }
 595
 596         return [track_info]
 597
 598 class RedTubeIE(InfoExtractor):
 599     """Information Extractor for redtube"""
 600     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
 601
 602     def _real_extract(self,url):
 603         mobj = re.match(self._VALID_URL, url)
 604         if mobj is None:
 605             raise ExtractorError(u'Invalid URL: %s' % url)
 606
 607         video_id = mobj.group('id')
 608         video_extension = 'mp4'
 609         webpage = self._download_webpage(url, video_id)
 610
 611         self.report_extraction(video_id)
 612
 613         video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
 614             webpage, u'video URL')
 615
 616         video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
 617             webpage, u'title')
 618
 619         return [{
 620             'id':       video_id,
 621             'url':      video_url,
 622             'ext':      video_extension,
 623             'title':    video_title,
 624         }]
 625
 626 class InaIE(InfoExtractor):
 627     """Information Extractor for Ina.fr"""
 628     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
 629
 630     def _real_extract(self,url):
 631         mobj = re.match(self._VALID_URL, url)
 632
 633         video_id = mobj.group('id')
 634         mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
 635         video_extension = 'mp4'
 636         webpage = self._download_webpage(mrss_url, video_id)
 637
 638         self.report_extraction(video_id)
 639
 640         video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
 641             webpage, u'video URL')
 642
 643         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
 644             webpage, u'title')
 645
 646         return [{
 647             'id':       video_id,
 648             'url':      video_url,
 649             'ext':      video_extension,
 650             'title':    video_title,
 651         }]
 652
 653 class HowcastIE(InfoExtractor):
 654     """Information Extractor for Howcast.com"""
 655     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
 656
 657     def _real_extract(self, url):
 658         mobj = re.match(self._VALID_URL, url)
 659
 660         video_id = mobj.group('id')
 661         webpage_url = 'http://www.howcast.com/videos/' + video_id
 662         webpage = self._download_webpage(webpage_url, video_id)
 663
 664         self.report_extraction(video_id)
 665
 666         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
 667             webpage, u'video URL')
 668
 669         video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
 670             webpage, u'title')
 671
 672         video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
 673             webpage, u'description', fatal=False)
 674
 675         thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
 676             webpage, u'thumbnail', fatal=False)
 677
 678         return [{
 679             'id':       video_id,
 680             'url':      video_url,
 681             'ext':      'mp4',
 682             'title':    video_title,
 683             'description': video_description,
 684             'thumbnail': thumbnail,
 685         }]
 686
 687 class VineIE(InfoExtractor):
 688     """Information Extractor for Vine.co"""
 689     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
 690
 691     def _real_extract(self, url):
 692         mobj = re.match(self._VALID_URL, url)
 693
 694         video_id = mobj.group('id')
 695         webpage_url = 'https://vine.co/v/' + video_id
 696         webpage = self._download_webpage(webpage_url, video_id)
 697
 698         self.report_extraction(video_id)
 699
 700         video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
 701             webpage, u'video URL')
 702
 703         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
 704             webpage, u'title')
 705
 706         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
 707             webpage, u'thumbnail', fatal=False)
 708
 709         uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
 710             webpage, u'uploader', fatal=False, flags=re.DOTALL)
 711
 712         return [{
 713             'id':        video_id,
 714             'url':       video_url,
 715             'ext':       'mp4',
 716             'title':     video_title,
 717             'thumbnail': thumbnail,
 718             'uploader':  uploader,
 719         }]
 720
 721 class FlickrIE(InfoExtractor):
 722     """Information Extractor for Flickr videos"""
 723     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
 724
 725     def _real_extract(self, url):
 726         mobj = re.match(self._VALID_URL, url)
 727
 728         video_id = mobj.group('id')
 729         video_uploader_id = mobj.group('uploader_id')
 730         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
 731         webpage = self._download_webpage(webpage_url, video_id)
 732
 733         secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
 734
 735         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
 736         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
 737
 738         node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
 739             first_xml, u'node_id')
 740
 741         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
 742         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
 743
 744         self.report_extraction(video_id)
 745
 746         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
 747         if mobj is None:
 748             raise ExtractorError(u'Unable to extract video url')
 749         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
 750
 751         video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
 752             webpage, u'video title')
 753
 754         video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
 755             webpage, u'description', fatal=False)
 756
 757         thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
 758             webpage, u'thumbnail', fatal=False)
 759
 760         return [{
 761             'id':          video_id,
 762             'url':         video_url,
 763             'ext':         'mp4',
 764             'title':       video_title,
 765             'description': video_description,
 766             'thumbnail':   thumbnail,
 767             'uploader_id': video_uploader_id,
 768         }]
 769
 770 class TeamcocoIE(InfoExtractor):
 771     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
 772
 773     def _real_extract(self, url):
 774         mobj = re.match(self._VALID_URL, url)
 775         if mobj is None:
 776             raise ExtractorError(u'Invalid URL: %s' % url)
 777         url_title = mobj.group('url_title')
 778         webpage = self._download_webpage(url, url_title)
 779
 780         video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
 781             webpage, u'video id')
 782
 783         self.report_extraction(video_id)
 784
 785         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
 786             webpage, u'title')
 787
 788         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
 789             webpage, u'thumbnail', fatal=False)
 790
 791         video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
 792             webpage, u'description', fatal=False)
 793
 794         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
 795         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
 796
 797         video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
 798             data, u'video URL')
 799
 800         return [{
 801             'id':          video_id,
 802             'url':         video_url,
 803             'ext':         'mp4',
 804             'title':       video_title,
 805             'thumbnail':   thumbnail,
 806             'description': video_description,
 807         }]
 808
 809 class XHamsterIE(InfoExtractor):
 810     """Information Extractor for xHamster"""
 811     _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
 812
 813     def _real_extract(self,url):
 814         mobj = re.match(self._VALID_URL, url)
 815
 816         video_id = mobj.group('id')
 817         mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
 818         webpage = self._download_webpage(mrss_url, video_id)
 819
 820         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
 821         if mobj is None:
 822             raise ExtractorError(u'Unable to extract media URL')
 823         if len(mobj.group('server')) == 0:
 824             video_url = compat_urllib_parse.unquote(mobj.group('file'))
 825         else:
 826             video_url = mobj.group('server')+'/key='+mobj.group('file')
 827         video_extension = video_url.split('.')[-1]
 828
 829         video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
 830             webpage, u'title')
 831
 832         # Can't see the description anywhere in the UI
 833         # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
 834         #     webpage, u'description', fatal=False)
 835         # if video_description: video_description = unescapeHTML(video_description)
 836
 837         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
 838         if mobj:
 839             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
 840         else:
 841             video_upload_date = None
 842             self._downloader.report_warning(u'Unable to extract upload date')
 843
 844         video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
 845             webpage, u'uploader id', default=u'anonymous')
 846
 847         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
 848             webpage, u'thumbnail', fatal=False)
 849
 850         return [{
 851             'id':       video_id,
 852             'url':      video_url,
 853             'ext':      video_extension,
 854             'title':    video_title,
 855             # 'description': video_description,
 856             'upload_date': video_upload_date,
 857             'uploader_id': video_uploader_id,
 858             'thumbnail': video_thumbnail
 859         }]
 860
 861 class HypemIE(InfoExtractor):
 862     """Information Extractor for hypem"""
 863     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
 864
 865     def _real_extract(self, url):
 866         mobj = re.match(self._VALID_URL, url)
 867         if mobj is None:
 868             raise ExtractorError(u'Invalid URL: %s' % url)
 869         track_id = mobj.group(1)
 870
 871         data = { 'ax': 1, 'ts': time.time() }
 872         data_encoded = compat_urllib_parse.urlencode(data)
 873         complete_url = url + "?" + data_encoded
 874         request = compat_urllib_request.Request(complete_url)
 875         response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
 876         cookie = urlh.headers.get('Set-Cookie', '')
 877
 878         self.report_extraction(track_id)
 879
 880         html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
 881             response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
 882         try:
 883             track_list = json.loads(html_tracks)
 884             track = track_list[u'tracks'][0]
 885         except ValueError:
 886             raise ExtractorError(u'Hypemachine contained invalid JSON.')
 887
 888         key = track[u"key"]
 889         track_id = track[u"id"]
 890         artist = track[u"artist"]
 891         title = track[u"song"]
 892
 893         serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
 894         request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
 895         request.add_header('cookie', cookie)
 896         song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
 897         try:
 898             song_data = json.loads(song_data_json)
 899         except ValueError:
 900             raise ExtractorError(u'Hypemachine contained invalid JSON.')
 901         final_url = song_data[u"url"]
 902
 903         return [{
 904             'id':       track_id,
 905             'url':      final_url,
 906             'ext':      "mp3",
 907             'title':    title,
 908             'artist':   artist,
 909         }]
 910
 911 class Vbox7IE(InfoExtractor):
 912     """Information Extractor for Vbox7"""
 913     _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
 914
 915     def _real_extract(self,url):
 916         mobj = re.match(self._VALID_URL, url)
 917         if mobj is None:
 918             raise ExtractorError(u'Invalid URL: %s' % url)
 919         video_id = mobj.group(1)
 920
 921         redirect_page, urlh = self._download_webpage_handle(url, video_id)
 922         new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
 923         redirect_url = urlh.geturl() + new_location
 924         webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
 925
 926         title = self._html_search_regex(r'<title>(.*)</title>',
 927             webpage, u'title').split('/')[0].strip()
 928
 929         ext = "flv"
 930         info_url = "http://vbox7.com/play/magare.do"
 931         data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
 932         info_request = compat_urllib_request.Request(info_url, data)
 933         info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
 934         info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
 935         if info_response is None:
 936             raise ExtractorError(u'Unable to extract the media url')
 937         (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
 938
 939         return [{
 940             'id':        video_id,
 941             'url':       final_url,
 942             'ext':       ext,
 943             'title':     title,
 944             'thumbnail': thumbnail_url,
 945         }]
 946
 947
 948 def gen_extractors():
 949     """ Return a list of an instance of every supported extractor.
 950     The order does matter; the first extractor matched is the one handling the URL.
 951     """
 952     return [
 953         YoutubePlaylistIE(),
 954         YoutubeChannelIE(),
 955         YoutubeUserIE(),
 956         YoutubeSearchIE(),
 957         YoutubeIE(),
 958         MetacafeIE(),
 959         DailymotionIE(),
 960         GoogleSearchIE(),
 961         PhotobucketIE(),
 962         YahooIE(),
 963         YahooSearchIE(),
 964         DepositFilesIE(),
 965         FacebookIE(),
 966         BlipTVIE(),
 967         BlipTVUserIE(),
 968         VimeoIE(),
 969         MyVideoIE(),
 970         ComedyCentralIE(),
 971         EscapistIE(),
 972         CollegeHumorIE(),
 973         XVideosIE(),
 974         SoundcloudSetIE(),
 975         SoundcloudIE(),
 976         InfoQIE(),
 977         MixcloudIE(),
 978         StanfordOpenClassroomIE(),
 979         MTVIE(),
 980         YoukuIE(),
 981         XNXXIE(),
 982         YouJizzIE(),
 983         PornotubeIE(),
 984         YouPornIE(),
 985         GooglePlusIE(),
 986         ArteTvIE(),
 987         NBAIE(),
 988         WorldStarHipHopIE(),
 989         JustinTVIE(),
 990         FunnyOrDieIE(),
 991         SteamIE(),
 992         UstreamIE(),
 993         RBMARadioIE(),
 994         EightTracksIE(),
 995         KeekIE(),
 996         TEDIE(),
 997         MySpassIE(),
 998         SpiegelIE(),
 999         LiveLeakIE(),
1000         ARDIE(),
1001         ZDFIE(),
1002         TumblrIE(),
1003         BandcampIE(),
1004         RedTubeIE(),
1005         InaIE(),
1006         HowcastIE(),
1007         VineIE(),
1008         FlickrIE(),
1009         TeamcocoIE(),
1010         XHamsterIE(),
1011         HypemIE(),
1012         Vbox7IE(),
1013         GametrailersIE(),
1014         StatigramIE(),
1015         GenericIE()
1016     ]
1017
def get_info_extractor(ie_name):
    """Look up an info extractor class by its name without the 'IE' suffix.

    The class is fetched from this module's global namespace, so an unknown
    name raises KeyError.
    """
    module_namespace = globals()
    class_name = ie_name + 'IE'
    return module_namespace[class_name]