68e21c63542c136dcd5d5b38cd2261414a052613
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19 import hashlib
20 import binascii
21 import urllib
22
23 from .utils import *
24
25
26 from .extractor.common import InfoExtractor, SearchInfoExtractor
27 from .extractor.dailymotion import DailymotionIE
28 from .extractor.metacafe import MetacafeIE
29 from .extractor.statigram import StatigramIE
30 from .extractor.photobucket import PhotobucketIE
31 from .extractor.yahoo import YahooIE
32 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeUserIE, YoutubeChannelIE
33
34
35
36
37
38
39
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Downloads the video page, parses the embedded player config JSON and
    selects the best available codec/quality combination.
    """

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _verify_video_password(self, url, video_id, webpage):
        """POST the password supplied via --password to unlock the video.

        Raises ExtractorError when no password was given or it is wrong.
        """
        password = self._downloader.params.get('password', None)
        if password is None:
            raise ExtractorError(u'This video is protected by a password, use the --password option')
        token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1)
        data = compat_urllib_parse.urlencode({'password': password,
                                              'token': token})
        # I didn't manage to use the password with https
        if url.startswith('https'):
            pass_url = url.replace('https','http')
        else:
            pass_url = url
        password_request = compat_urllib_request.Request(pass_url+'/password', data)
        password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        password_request.add_header('Cookie', 'xsrft=%s' % token)
        self._download_webpage(password_request, video_id,
                               u'Verifying the password',
                               u'Wrong password')

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        # Pro and direct-link URLs embed the same player config on vimeo.com
        if mobj.group('direct_link') or mobj.group('pro'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            # A bare "except:" here would also swallow KeyboardInterrupt and
            # SystemExit; only parsing failures (IndexError/ValueError) are
            # expected when the config markers are missing.
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')

            if re.search('If so please provide the correct password.', webpage):
                self._verify_video_password(url, video_id, webpage)
                # Retry the extraction now that the password cookie is set.
                return self._real_extract(url)
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        video_description = clean_html(video_description) if video_description else u''

        # Extract upload date (YYYYMMDD) from the dateCreated meta tag
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first entry in quality preference order
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
164
165
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live streams are recognised by an "index-<digits>.html" final path segment.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return the raw page body.

        Raises ExtractorError on network failures or malformed URLs.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            # urlopen raises ValueError for URLs with an unknown scheme
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and map requested groups to keys.

        matchTuples is a list of (group_index, key, error_message) triples;
        each matched group is stored under *key* in the returned dict, and
        the given error is raised when that group did not participate in
        the match.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve the RTMP location of a live stream.

        NOTE(review): this method computes video_url but never returns it,
        and _real_extract discards the call's result entirely — live
        streams therefore yield no downloadable entry. Confirm whether
        live support was ever wired up before relying on it.
        """
        # URL layout assumed: .../<lang>/... four segments from the end
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # The JS file carries the stream path, player SWF and RTMP base url
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the Plus7 metadata chain and return an info dict."""
        # URL layout assumed: .../<lang>/... three segments from the end
        video_lang = url.split('/')[-3]
        # Step 1: the player param points at a videoref XML url
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Step 2: pick the <video> ref matching the requested language
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: extract id/title/date and the HD stream url from the XML
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        # NOTE(review): .decode('utf-8') assumes py2 byte strings here;
        # on py3 the groups would already be str — confirm target runtime.
        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  unified_strdate(info.get('date')),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # Live streams: see the NOTE on extractLiveStream — nothing is
            # returned here, so no download actually happens.
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
285
286
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    # Matches everything; this IE must therefore be tried last.
    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn the user once per extraction that we are guessing, except in
        # test mode where the warning would only add noise.
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Force a HEAD request so we only fetch headers, not the body.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop entity headers: a HEAD request carries no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the 405 response before retrying with GET.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        # NOTE: handler order matters — the fallback and redirect handlers
        # must sit between the protocol handlers and the error processor.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        # Same URL back means there was no redirect to follow.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        # Resolve shorteners first and re-dispatch to the proper extractor.
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            # Try to find twitter cards info
            mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
        if mobj is None:
            # We look for Open Graph info:
            # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
            m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
            # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
            if m_video_type is not None:
                mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        video_title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'video title')

        # video uploader is domain name
        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
            url, u'video uploader')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
426
427
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the GData API (50 results per page) until *n* ids
        have been collected or the API reports fewer total items.
        """
        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            video_ids += [video['id'] for video in api_response['items']]

            # The API may offer fewer results than requested; shrink the
            # limit so we stop paging as soon as everything is fetched.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
469
470
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    # Presence of the "next" pagination link in the result page markup
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Scrapes the result pages (10 hits per page) and returns a playlist
        dict with at most *n* url entries.
        """
        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        for pagenum in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                e = {
                    '_type': 'url',
                    'url': mobj.group(1)
                }
                res['entries'].append(e)

            # Stop when we have gathered enough hits or there is no next page.
            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                # Never hand back more than the n results that were asked for.
                res['entries'] = res['entries'][:n]
                return res
501
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the JSON search endpoint (30 hits per page) and
        returns a playlist dict with at most *n* url entries.
        """
        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']
            if not results:
                # An empty page previously left the loop index `i` unbound
                # and crashed with NameError below; nothing more to fetch.
                break

            for (i, r) in enumerate(results):
                if (pagenum * 30) + i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                if mobj is None:
                    # Skip results whose markup we cannot parse instead of
                    # crashing on mobj.group() with AttributeError.
                    continue
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            if (pagenum * 30 + i >= n) or (m[u'last'] >= (m[u'total'] - 1)):
                break

        return res
535
536
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Return a playlist with every video of a blip.tv user."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            # Previously crashed with AttributeError on .group() when the
            # attribute was missing from the page.
            raise ExtractorError(u'Unable to extract users id')
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            # Do not shadow the `url` parameter with the per-page url.
            page_url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(page_url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                video_id = unescapeHTML(mobj.group(1))
                # Dedup on the unescaped value we actually store; checking
                # the raw match while storing the unescaped form could let
                # duplicates through.
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(video_url, 'BlipTV') for video_url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
595
596
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Extract the direct download url for a depositfiles file page."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        # NOTE(review): the .decode('utf-8') calls below assume Python 2
        # byte strings; on Python 3, str has no .decode and this would
        # raise AttributeError — confirm which runtimes this still targets.
        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
641
642
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in with --username/--password or .netrc credentials, if given.

        Login failures only emit a warning; extraction proceeds anonymously.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials available: continue without logging in.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means authentication failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video url, duration and title for a Facebook video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The flashvars JSON is wedged between these two JS fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source and fall back to SD. Use .get for both so a
        # missing key reaches the explicit error below instead of raising a
        # bare KeyError.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data.get('sd_src')
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            webpage, u'title')

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
737
738
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Handles episode pages, /play/ embed URLs (resolved via the site's
    redirect) and legacy api.swf#<id> URLs.  Metadata is fetched through
    the page's JSON skin (skin=json).
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    # Pulls the file extension off the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Rewrite legacy api.swf#<id> URLs into /play/ URLs first.
        # See https://github.com/rg3/youtube-dl/issues/857
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect to a page whose URL fragment carries the
            # real file reference; build the canonical /a/a-<id> URL from it
            # and recurse exactly once with that form.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Request the page's JSON representation, preserving any existing
        # query string.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # Spoofed UA; the same value is handed to the downloader below via
        # the 'user_agent' field so the media request matches.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                # The "JSON" URL answered with the media itself: derive the
                # title/extension from the URL and hand over the open handle.
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): .decode assumes a py2 byte string; on
                # Python 3 splitext returns str and this would raise —
                # confirm the direct-download path.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # urlh was opened in the try-block above; read the JSON body.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # Depending on the skin version the payload may be wrapped
                # in a 'Post' envelope.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # NOTE(review): '%H' (24-hour) combined with '%p' is an odd
                # strptime format but mirrors blip.tv's datestamp — confirm.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
836
837
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self, data, key):
        """RC4-decrypt *data* (a byte string) with *key* and return a str."""
        x = 0
        box = list(range(256))
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        x = 0
        y = 0
        out = ''
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        return out

    def __md5(self, s):
        """Return the hex MD5 digest of *s* as a byte string."""
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Doubly base64-encoded static part of the RC4 key.
        GK = (
          b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
          b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
          b'TnpsbA0KTVRkbU1tSTRNdz09'
        )

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Easy case: the page exposes a direct <source> element.
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            video_title = self._html_search_regex('<title>([^<]+)</title>',
                webpage, u'title')

            # Validates that the URL carries an extension (raises otherwise);
            # the returned ext is always flv regardless.
            video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')

            return [{
                'id':       video_id,
                'url':      video_url,
                'uploader': None,
                'upload_date':  None,
                'title':    video_title,
                'ext':      u'flv',
            }]

        # Hard case: fetch and RC4-decrypt the flash player's config.
        mobj = re.search('var flashvars={(.+?)}', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video')

        # Split the flashvars into the encrypted-XML endpoint (_encxml)
        # and the remaining query parameters.
        params = {}
        encxml = ''
        sec = mobj.group(1)
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if a != '_encxml':
                params[a] = b
            else:
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            xmldata_url = (
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            ) % video_id

        # get enc data
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        # RC4 key = md5(base64^-2(GK) + md5(video_id))
        sk = self.__md5(
            base64.b64decode(base64.b64decode(GK)) +
            self.__md5(
                str(video_id).encode('utf-8')
            )
        )
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        video_url = None
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        if mobj:
            video_url = compat_urllib_parse.unquote(mobj.group(1))
            if 'myvideo2flash' in video_url:
                self._downloader.report_warning(u'forcing RTMPT ...')
                video_url = video_url.replace('rtmpe://', 'rtmpt://')

        if not video_url:
            # extract non rtmp videos
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract url')
            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        else:
            video_playpath = ''
            # BUGFIX: video_filepath was referenced below without ever being
            # defined, raising NameError for every f4m video.  Extract it
            # from the decrypted config like the other fields.
            video_filepath = self._search_regex(
                'filepath=\'(.*?)\'', dec_data, u'video filepath')
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
            webpage, u'title')

        return [{
            'id':                 video_id,
            'url':                video_url,
            'tc_url':             video_url,
            'uploader':           None,
            'upload_date':        None,
            'title':              video_title,
            'ext':                u'flv',
            'play_path':          video_playpath,
            'video_file':         video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url':         video_swfobj,
        }]
986
987
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Bitrates offered by the site, highest first.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per bitrate (informational; all mp4 here).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Frame dimensions per bitrate, shown by --list-formats.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written in verbose regex syntax
        # and must be matched with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        # Human-readable table of bitrates for --list-formats.
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Resolve a show/episode/clip URL; return one info dict per part."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # ":tds"-style shortcuts map to the show's full-episodes page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty episode part means "newest episode"; the site's
            # redirect (followed below) reveals the concrete one.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # Re-parse the redirected URL to pin down the episode.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        # The media URI is carried in the flash player's movie param or in
        # a "var url = ..." assignment.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The MRSS index feed lists one <item> per part of the episode.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            # Each part has its own config XML listing the renditions.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # turls: (bitrate, rtmp URL) pairs in feed order.
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP URL into the equivalent HTTP location on the
            # mtvnmobile CDN.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
1154
1155
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract a video from an escapistmagazine.com view page.

        The page's og:video meta tag points at the flash player, whose
        config= query parameter locates a JSON(ish) playlist file; entry 1
        of that playlist is the media URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(videoId)
        webpage = self._download_webpage(url, videoId)

        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # BUGFIX: the field name passed here used to say u'player url'
        # (copy-pasted from above), so a missing title produced a
        # misleading error message.
        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'title').split(' : ')[-1]

        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, videoId,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # Entry 0 is the preroll; entry 1 is the actual video.
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': title,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
        }

        return [info]
1215
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Announce the XML manifest download step."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Fetch metadata + f4m manifest and build the segment URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        self.report_extraction(video_id)
        # Step 1: the moogaloop metadata XML describes the video and
        # points at its manifest.
        meta_url = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            meta_xml = compat_urllib_request.urlopen(meta_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(meta_xml)
        try:
            video_node = mdoc.findall('./video')[0]
            description = video_node.findall('./description')[0].text
            title = video_node.findall('./caption')[0].text
            thumbnail = video_node.findall('./thumbnail')[0].text
            manifest_url = video_node.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        # Step 2: the f4m manifest names the media node and the manifest id
        # that together form the fragment URL.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifest_xml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifest_xml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            manifest_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid manifest file')

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        video_url = url_pr.scheme + '://' + url_pr.netloc + '/z' + manifest_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        return [{
            'id': video_id,
            'uploader': None,
            'upload_date': None,
            'description': description,
            'title': title,
            'thumbnail': thumbnail,
            'url': video_url,
            'ext': 'f4f',
        }]
1277
1278
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Scrape the page for the flv URL, title and thumbnail."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The flash player receives the (percent-encoded) stream location
        # in the flv_url query parameter.
        video_url = compat_urllib_parse.unquote(
            self._search_regex(r'flv_url=(.+?)&', webpage, u'video URL'))

        video_title = self._html_search_regex(
            r'<title>(.*?)\s+-\s+XVID', webpage, u'title')

        video_thumbnail = self._search_regex(
            r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
1319
1320
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com tracks.

    The track URL is resolved through the public API to a JSON info
    record; that record's id is then used to look up the stream
    variants, of which the 128 kbit/s HTTP MP3 stream is returned.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Announce the API resolve step."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Uploader and track slug are both encoded in the URL path.
        uploader, slug_title = match.group(1), match.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = ('http://api.soundcloud.com/resolve.json?url='
                      + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28')
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
        info = json.loads(info_json)

        track_id = info['id']
        self.report_extraction(full_title)

        streams_url = ('https://api.sndcdn.com/i1/tracks/' + str(track_id)
                       + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28')
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')
        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date': unified_strdate(info['created_at']),
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
1377
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve a set URL and return one info dict per track."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title =  mobj.group(2)
        # NOTE(review): simple_title is computed but never used.
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the set URL to its JSON metadata via the public API.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            # The API signals problems (e.g. unknown set) via an 'errors'
            # list instead of an HTTP error.
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        for track in info['tracks']:
            video_id = track['id']

            # Each track exposes its stream variants under /streams.
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
1440
1441
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Decode the embedded media reference and build the RTMP URL."""
        if re.match(self._VALID_URL, url) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The media reference is base64-encoded into the page's
        # jsclassref script variable.
        id_match = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if id_match is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(
            base64.b64decode(id_match.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        video_title = self._search_regex(
            r'contentTitle = "(.*?)";', webpage, u'title')

        video_description = self._html_search_regex(
            r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # The last path component carries both the id and the extension.
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
1484
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        Falls back to the highest available bitrate when none (or an
        unavailable one) is requested.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest
            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead link; try the next candidate.
                pass

        return None

    def _print_formats(self, formats):
        """Print a 'format<TAB>bitrate<TAB>[ext]' line per available stream."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # NOTE: regex groups are already text on Python 3; the old
        # str.decode('utf-8') calls crashed there with AttributeError.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON; the HTTP response body is bytes on Python 3
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe every advertised format until one of its urls responds.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # Previously a None file_url crashed below with AttributeError;
        # fail with a proper extractor error instead.
        if file_url is None:
            raise ExtractorError(u'Unable to find a working stream URL')

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param if format_param is not None else u'NA'),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
1589
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches the site root, a CoursePage/VideoPage with ?course=... (and
    # optionally &video=...), or the bare HomePage URL.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Dispatch on URL shape: single video, course page, or site root.

        Course and root pages are expanded into 'reference' entries that
        are fed back through self.extract(), so the result is the
        concatenation of the per-video info dicts.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # The per-video XML carries the title and a videoFile name
                # relative to baseUrl.
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # Fall back to the course id when no <h1> title is present.
            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            # Every VideoPage link on the course page becomes a playlist entry.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # Recurse: each reference URL is a VideoPage handled by the
                # first branch above.
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Every CoursePage link on the home page becomes a playlist entry.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # Recurse: each reference URL is a CoursePage handled by the
                # second branch above.
                results += self.extract(entry['url'])
            return results
1685
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # Song title; may be absent on non-music clips.
        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        # Artist name from the 'mtv_an' meta tag.  This is the uploader and
        # part of the title.  The previous revision dropped this variable in
        # a refactoring, so the info dict below referenced an undefined
        # 'performer' name and always raised NameError.
        performer = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'performer')

        if song_name:
            video_title = performer + ' - ' + song_name
        else:
            video_title = performer

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        # Both values are interpolated into the mediaGen URL below; fail with
        # a proper extractor error rather than a TypeError on None.
        if mtvn_uri is None or content_id is None:
            raise ExtractorError(u'Unable to extract mediaGen parameters')

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
1746
1747
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Generate a pseudo-random session id: ms timestamp + two random ints."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Deterministically shuffle the charset using the server 'seed'.

        A linear congruential generator driven by the seed picks (and
        removes) one character of the source alphabet per step, yielding
        the permutation used to decode stream file ids.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # LCG step; the updated seed selects the next index.
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated list of indices into the shuffled charset."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Fetch the playlist JSON and build one info dict per segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the user-requested format onto Youku's stream names;
            # 'hd2' is preferred for 'best', everything else falls back to
            # flv.  'worst' maps to mp4 -- presumably the lowest-quality
            # stream here; TODO confirm.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            # one decryption key per video segment
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            # splice the segment number (as two hex digits) into the fileid
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
1840
1841
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        # The flv URL is percent-encoded inside the page source.
        raw_url = self._search_regex(self.VIDEO_URL_RE,
            webpage, u'video URL')
        video_url = compat_urllib_parse.unquote(raw_url)

        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
            webpage, u'title')

        # Thumbnail is optional; extraction continues without it.
        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }
        return [info]
1880
1881
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)
        video_extension = 'flv'

        # Step 1: fetch the post page for the metadata.
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
        self.report_extraction(video_id)

        # Upload date, normalized from YYYY-MM-DD to YYYYMMDD for filenames.
        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)
        if upload_date:
            parsed = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = parsed.strftime('%Y%m%d')

        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # First line of the post description serves as the title.
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2: simulate clicking the image box to reach the video page.
        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Collect (resolution, url) pairs for every available size.
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        candidates = re.findall(pattern, webpage)
        if not candidates:
            raise ExtractorError(u'Unable to extract video links')

        # Sorting puts the highest resolution last; keep only its URL.
        best = sorted(candidates)[-1]
        video_url = best[-1]

        # Resolve escaped \u0026-style sequences.
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
1955
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)
        webpage = self._download_webpage(url, video_id)

        # The CDN URL is derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(
            r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(
            r'<meta name="description" (?:content|value)="(.*?)" />',
            webpage, 'description', fatal=False)

        return [{
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # 'uploader_date': uploader_date,
            'description': description,
        }]
1989
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Three URL shapes: a channel (paged archive), a /b/ video id, or a
    # /c/ chapter id.
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Maximum number of items per API page request.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Fetch one API page; skip clips that lack a video_file_url."""
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # API errors come back as an object, not a list of clips.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # 'start_time' begins with YYYY-MM-DD; strip dashes.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        # The raw response length (not len(info)) drives the paging loop.
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Channel archives are paginated; loop below pages through them.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            # The chapter page embeds the id of the archive it belongs to.
            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Locate the archive matching archive_id; the for-else raises
            # when no <archive> entry matches.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            # Title/thumbnail/description come from the Kraken JSON API.
            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        # Page through the archive; a short page signals the end.
        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            if not paged or page_count != limit:
                break
            offset += limit
        return info
2122
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The URL sits in the second <source> tag of the HTML5 player.
        video_url = self._html_search_regex(
            r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Prefer the player headline; fall back to the page <title>.
        title = self._html_search_regex(
            (r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
             r'<title>(?P<title>[^<]+?)</title>'),
            webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }]
2151
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com trailer pages."""

    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so the default matcher
        # cannot be reused here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        # Mature titles show an age gate; retry with a pre-filled birth date.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'

        # The movie definitions, titles and thumbnails appear in the same
        # order on the page, so the three iterators are consumed in lockstep.
        movie_matches = re.finditer(urlRE, webpage)
        title_matches = re.finditer(namesRE, webpage)
        thumb_matches = re.finditer(thumbsRE, webpage)

        videos = []
        for movie, name, thumb in zip(movie_matches, title_matches, thumb_matches):
            clip_id = movie.group('videoID')
            clip_title = name.group('videoName')
            clip_url = movie.group('videoURL')
            clip_thumb = thumb.group('thumbnail')
            if not clip_url:
                raise ExtractorError(u'Cannot find video url for %s' % clip_id)
            videos.append({
                'id': clip_id,
                'url': clip_url,
                'ext': 'flv',
                'title': unescapeHTML(clip_title),
                'thumbnail': clip_thumb,
            })
        return [self.playlist_result(videos, gameID, game_title)]
2206
class UstreamIE(InfoExtractor):
    """Extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # The FLV location is derived directly from the numeric id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')
        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)
        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        # NOTE: returns a bare dict (not a one-element list), as the
        # original implementation did.
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
2238
class WorldStarHipHopIE(InfoExtractor):
    """Extractor for worldstarhiphop.com / worldstarcandy.com videos."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        # Infer the container from the stream URL itself.
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)
        if not thumbnail:
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_match is not None:
                video_title = candy_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
2278
class RBMARadioIE(InfoExtractor):
    """Extractor for Red Bull Music Academy Radio show pages."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        # The show metadata is a JSON object assigned to gon.show in a
        # single line of inline JavaScript.
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)
        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Append the bitrate parameter to request the 256 kbps stream.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
2312
2313
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com.

    Fetches the watch page (with an age_verified cookie), reads video
    metadata from the embedded "new Video(...)" JSON blob, and builds
    one format entry per link in the page's download list.
    """
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry whose 'format' equals req_format, else None."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The site is age-gated; the cookie skips the interstitial.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            # FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit; only JSON errors belong here.
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError as e:
            # FIX: str(e) is required -- the original concatenated the
            # exception object itself to a str, raising TypeError and
            # masking which parameter was actually missing.
            raise ExtractorError(u'Missing JSON parameter: ' + str(e))

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The 5th path component looks like "<size>_<bitrate>_<id>";
            # the format label is "<size>-<bitrate>".
            format = "-".join(path.split('/')[4].split('_')[:2])

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # FIX: the original tested the undefined name `result` here,
            # so an unavailable format raised NameError instead of the
            # intended ExtractorError.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
2418
2419
2420
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')
        # The title is part of the URL itself.
        video_title = mobj.group('title')

        webpage = self._download_webpage(url, video_id)

        # The flv URL is embedded percent-encoded in the player config.
        video_url = self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        # The upload date is optional; normalize it when present.
        upload_date = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
        }]
2455
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # The actual stream URL only appears on the separate embed page.
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        # From here on, use the numeric id taken from the embed URL.
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
2496
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes.

    Returns one entry per track in the mix, walking the play API one
    track at a time until it reports the last track.
    """
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JS object assigned to PAGE.mix.
        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # The play API requires a (client-chosen) numeric session id.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        # The API reveals only one track per request: start with "play"
        # and keep hitting "next" until at_last_track is set.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            if api_data['set']['at_last_track']:
                break
            # The next request must reference the track we just received.
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
2537
class KeekIE(InfoExtractor):
    """Extractor for keek.com short videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Media and thumbnail URLs are derived directly from the id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }]
2565
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses verbose-mode syntax, so the default matcher
        # (compiled without re.VERBOSE) cannot be used here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on URL type: a single talk or a whole playlist.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Matches one talk entry (data-id + media slug) in the playlist markup.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        # Matches the talk's title link, giving the relative talk URL.
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        # Pairs entry matches with title matches positionally; assumes both
        # appear in the same document order -- verify on markup changes.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            # Each talk is delegated back to this IE as a separate result.
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                    webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        # NOTE: `info` is rebound here; the parsed talkDetails dict above is
        # only consulted for its 'id' and its last htmlStreams entry.
        info = {
                'id': info['id'],
                'url': info['htmlStreams'][-1]['file'],
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'description': desc,
                }
        return info
2640
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de.

    Derives the video id from the URL path and resolves stream metadata
    through the site's XML endpoint.
    """
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # FIX: the fallback referenced the undefined name `ext`,
            # raising NameError whenever <format_id> was absent; the
            # file-extension variable here is called `extension`.
            format = extension
        else:
            format = format_id_el.text
        # <description> and <imagePreview> are optional elements.
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
2694
class SpiegelIE(InfoExtractor):
    """Extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        # Stream details live in a per-video XML document.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Use the last variant in the document (the original selection rule).
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
2726
class LiveLeakIE(InfoExtractor):
    """Extractor for liveleak.com videos."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # The og:title carries a "LiveLeak.com -" prefix that we strip.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()
        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)
        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        return [{
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }]
2763
class ARDIE(InfoExtractor):
    """Information extractor for the ARD / Das Erste Mediathek."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    # Page headline; the class attribute appears only on some layouts.
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    # JS call registering one stream: (media_type, quality, rtmp url, http/play url, ...).
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # Prefer the numeric documentId query parameter when present.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # No stream entries at all: presumably the FSK age-restriction
            # page was served instead -- TODO confirm the marker is stable.
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            # For RTMP the play path must be an mp4: pseudo-path.
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
2802
class ZDFIE(InfoExtractor):
    """Information extractor for the ZDF Mediathek.

    Picks a wstreaming variant from the page, resolves the intermediate
    metafile, and extracts the final mms:// or rtsp:// URL from it.
    """
    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        html = self._download_webpage(url, video_id)
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        # FIX: the original tested `streams is None`, but a list built from
        # finditer is never None, so an empty result slipped through.
        if not streams:
            raise ExtractorError(u'No media url found.')

        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
        # choose first/default media type and highest quality for now
        # FIX: stream_ must be pre-initialized; otherwise the None check
        # below raised NameError when neither loop found a match.
        stream_ = None
        for s in streams:        #find 300 - dsl1000mbit
            if s['quality'] == '300' and s['media_type'] == 'wstreaming':
                stream_ = s
                break
        # The second pass intentionally overrides the first when a
        # 'veryhigh' variant exists.
        for s in streams:        #find veryhigh - dsl2000mbit
            if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
                stream_ = s
                break
        if stream_ is None:
            raise ExtractorError(u'No stream found.')

        media_link = self._download_webpage(stream_['video_url'], video_id, 'Get stream URL')

        self.report_extraction(video_id)
        mobj = re.search(self._TITLE, html)
        if mobj is None:
            raise ExtractorError(u'Cannot extract title')
        title = unescapeHTML(mobj.group('title'))

        # The metafile points at either an mms:// or an rtsp:// URL.
        mobj = re.search(self._MMS_STREAM, media_link)
        if mobj is None:
            mobj = re.search(self._RTSP_STREAM, media_link)
            if mobj is None:
                raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
        mms_url = mobj.group('video_url')

        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
        if mobj is None:
            raise ExtractorError(u'Cannot extract extention')
        ext = mobj.group('ext')

        return [{'id': video_id,
                 'url': mms_url,
                 'title': title,
                 'ext': ext
                 }]
2860
class TumblrIE(InfoExtractor):
    """Extractor for videos posted on Tumblr blogs."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        blog = mobj.group('blog_name')

        # Always fetch the canonical post page for this blog/id pair.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is JS-escaped (\x22 quotes) in the page source.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster frame as the thumbnail.
        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]
2894
class BandcampIE(InfoExtractor):
    """Information extractor for free Bandcamp tracks (mp3-320 only)."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # Renamed from 'id': shadowing the builtin is an accident waiting to happen.
        video_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, video_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is built by Bandcamp in the script download_bundle_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': video_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist'],
                      }

        return [track_info]
2940
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = match.group('id')
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The mp4 stream is exposed directly in a <source> tag.
        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')
        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
        }]
2968         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # All the metadata (including the mp4 URL) lives in an MRSS feed.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        # Dots escaped: the original pattern used bare '.' which matches any character.
        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4\.ina\.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
        }]
2995
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Always fetch the canonical video page for this id.
        webpage = self._download_webpage('http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
        }]
3029
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Fetch the canonical (https) page for the id.
        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
3063
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)

        video_id = match.group('id')
        video_uploader_id = match.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The per-photo secret is required by both video XML endpoints below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # The stream URL is split across the APP and FULLPATH attributes.
        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = stream.group(1) + unescapeHTML(stream.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')
        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
3112
class TeamcocoIE(InfoExtractor):
    """Information extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = match.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric video id is embedded in the article markup.
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # A separate XML document carries the actual stream URLs.
        data = self._download_webpage('http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
                                      video_id, 'Downloading data webpage')
        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': video_description,
        }]
3151
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # The player config gives a server plus a file token (or a full URL).
        m_media = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if m_media is None:
            raise ExtractorError(u'Unable to extract media URL')
        if not m_media.group('server'):
            video_url = compat_urllib_parse.unquote(m_media.group('file'))
        else:
            video_url = m_media.group('server') + '/key=' + m_media.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        m_date = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if m_date:
            # Collapse YYYY-MM-DD into the YYYYMMDD form used elsewhere.
            video_upload_date = ''.join(m_date.group('upload_date_Y', 'upload_date_m', 'upload_date_d'))
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail,
        }]
3203
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = match.group(1)

        # Hypem wants ax/ts query parameters and sets a cookie we must replay later.
        complete_url = '%s?%s' % (url, compat_urllib_parse.urlencode({'ax': 1, 'ts': time.time()}))
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            # Only the first track of the embedded playlist is used.
            track = json.loads(html_tracks)[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
3253
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        # The play page redirects via a javascript location assignment.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        webpage = self._download_webpage(urlh.geturl() + new_location, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # The real media URL comes back from a POST to the magare.do endpoint.
        info_request = compat_urllib_request.Request(
            "http://vbox7.com/play/magare.do",
            compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id}))
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # Response looks like key=value&key=value; keep only the values.
        final_url, thumbnail_url = [field.split('=')[1] for field in info_response.split('&')]

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       "flv",
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
3289
class GametrailersIE(InfoExtractor):
    """Information extractor for gametrailers.com videos, reviews and full episodes."""
    _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        video_type = mobj.group('type')
        webpage = self._download_webpage(url, video_id)
        # Full episodes embed the mgid in a different attribute than clips/reviews.
        if video_type == 'full-episodes':
            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
        else:
            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
        mgid = self._search_regex(mgid_re, webpage, u'mgid')
        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})

        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
                                           video_id, u'Downloading video info')
        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
                                               video_id, u'Downloading video urls info')

        self.report_extraction(video_id)
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                      <image>.*
                        <url>(?P<thumb>.*?)</url>.*
                      </image>'''

        m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
        if m_info is None:
            raise ExtractorError(u'Unable to extract video info')
        video_title = m_info.group('title')
        video_description = m_info.group('description')
        video_thumb = m_info.group('thumb')

        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
        if not m_urls:
            # Fixed: was 'raise ExtractError(...)' — a NameError, since the
            # exception class is ExtractorError; also fixed the typo'd message.
            raise ExtractorError(u'Unable to extract video url')
        # They are sorted from worst to best quality
        video_url = m_urls[-1].group('url')

        return {'url':         video_url,
                'id':          video_id,
                'title':       video_title,
                # Videos are actually flv not mp4
                'ext':         'flv',
                'thumbnail':   video_thumb,
                'description': video_description,
                }
3340
def gen_extractors():
    """Return a list with an instance of every supported extractor.

    The order matters: the first extractor whose suitability test matches
    a URL is the one that handles it, so GenericIE must come last.
    """
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    return [klass() for klass in extractor_classes]
3410
def get_info_extractor(ie_name):
    """Return the info extractor class named ``ie_name + 'IE'``.

    Raises KeyError if no such extractor exists in this module.
    """
    return globals()['%sIE' % ie_name]