Merge pull request #1677 from rzhxeo/xtube

author Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>

Sun, 3 Nov 2013 11:28:02 +0000 (03:28 -0800)

committer Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>

Sun, 3 Nov 2013 11:28:02 +0000 (03:28 -0800)
author Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
Sun, 3 Nov 2013 11:28:02 +0000 (03:28 -0800)
committer Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
Sun, 3 Nov 2013 11:28:02 +0000 (03:28 -0800)
diff --git a/setup.py b/setup.py

index f14f9637722e03aceb0aa96a9da1c65d91ef776e..aa7cfca0862b1f4ba2cfd220fd570ca63bcfda7e 100644 (file)
--- a/setup.py
+++ b/setup.py
@@ -11,6 +11,7 @@ try:
      setuptools_available = True
  except ImportError:
      from distutils.core import setup
+    setuptools_available = False
  
  try:
      # This will create an exe that needs Microsoft Visual C++ 2008
diff --git a/test/helper.py b/test/helper.py

index 777119ea5fa6fe7b43ae5efe53e9c7685be347b8..d7bf7a82802e58f0a80d788de83146d3a9d3fadf 100644 (file)
--- a/test/helper.py
+++ b/test/helper.py
@@ -5,9 +5,11 @@ import json
  import os.path
  import re
  import types
+import sys
  
  import youtube_dl.extractor
  from youtube_dl import YoutubeDL
+from youtube_dl.utils import preferredencoding
  
  
  def global_setup():
@@ -33,6 +35,21 @@ def try_rm(filename):
              raise
  
  
+def report_warning(message):
+    '''
+    Print the message to stderr, it will be prefixed with 'WARNING:'
+    If stderr is a tty file the 'WARNING:' will be colored
+    '''
+    if sys.stderr.isatty() and os.name != 'nt':
+        _msg_header = u'\033[0;33mWARNING:\033[0m'
+    else:
+        _msg_header = u'WARNING:'
+    output = u'%s %s\n' % (_msg_header, message)
+    if 'b' in getattr(sys.stderr, 'mode', '') or sys.version_info[0] < 3:
+        output = output.encode(preferredencoding())
+    sys.stderr.write(output)
+
+
  class FakeYDL(YoutubeDL):
      def __init__(self, override=None):
          # Different instances of the downloader can't share the same dictionary
diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py

index f8cd1bdce9a64ac87ec83ab15130148f580a5291..58cf9c313607020d1493b420f8b93e18ccccd474 100644 (file)
--- a/test/test_YoutubeDL.py
+++ b/test/test_YoutubeDL.py
@@ -62,10 +62,10 @@ class TestFormatSelection(unittest.TestCase):
  
      def test_format_limit(self):
          formats = [
-            {u'format_id': u'meh'},
-            {u'format_id': u'good'},
-            {u'format_id': u'great'},
-            {u'format_id': u'excellent'},
+            {u'format_id': u'meh', u'url': u'http://example.com/meh'},
+            {u'format_id': u'good', u'url': u'http://example.com/good'},
+            {u'format_id': u'great', u'url': u'http://example.com/great'},
+            {u'format_id': u'excellent', u'url': u'http://example.com/exc'},
          ]
          info_dict = {
              u'formats': formats, u'extractor': u'test', 'id': 'testvid'}
@@ -128,6 +128,18 @@ class TestFormatSelection(unittest.TestCase):
          downloaded = ydl.downloaded_info_dicts[0]
          self.assertEqual(downloaded['format_id'], u'35')
  
+    def test_add_extra_info(self):
+        test_dict = {
+            'extractor': 'Foo',
+        }
+        extra_info = {
+            'extractor': 'Bar',
+            'playlist': 'funny videos',
+        }
+        YDL.add_extra_info(test_dict, extra_info)
+        self.assertEqual(test_dict['extractor'], 'Foo')
+        self.assertEqual(test_dict['playlist'], 'funny videos')
+
  
  if __name__ == '__main__':
      unittest.main()
diff --git a/test/test_dailymotion_subtitles.py b/test/test_dailymotion_subtitles.py

index c596415c4189624d254f72afd7b7bd7452d9fe50..ba3580ea419d513d8e92c97f813c236ce9d596e0 100644 (file)
--- a/test/test_dailymotion_subtitles.py
+++ b/test/test_dailymotion_subtitles.py
@@ -22,7 +22,7 @@ class TestDailymotionSubtitles(unittest.TestCase):
          return info_dict
      def getSubtitles(self):
          info_dict = self.getInfoDict()
-        return info_dict[0]['subtitles']
+        return info_dict['subtitles']
      def test_no_writesubtitles(self):
          subtitles = self.getSubtitles()
          self.assertEqual(subtitles, None)
diff --git a/test/test_download.py b/test/test_download.py

index b9a9be11d9686243ed2a1d5b748db4bc04712c54..73379beb1921b32e7f2b43b9865d85d71a618fc5 100644 (file)
--- a/test/test_download.py
+++ b/test/test_download.py
@@ -6,7 +6,14 @@ import sys
  import unittest
  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  
-from test.helper import get_params, get_testcases, global_setup, try_rm, md5
+from test.helper import (
+    get_params,
+    get_testcases,
+    global_setup,
+    try_rm,
+    md5,
+    report_warning
+)
  global_setup()
  
  
@@ -19,6 +26,7 @@ import youtube_dl.YoutubeDL
  from youtube_dl.utils import (
      compat_str,
      compat_urllib_error,
+    compat_HTTPError,
      DownloadError,
      ExtractorError,
      UnavailableVideoError,
@@ -60,9 +68,12 @@ def generator(test_case):
          if not ie._WORKING:
              print_skipping('IE marked as not _WORKING')
              return
-        if 'playlist' not in test_case and not test_case['file']:
-            print_skipping('No output file specified')
-            return
+        if 'playlist' not in test_case:
+            info_dict = test_case.get('info_dict', {})
+            if not test_case.get('file') and not (info_dict.get('id') and info_dict.get('ext')):
+                print_skipping('The output file cannot be know, the "file" '
+                    'key is missing or the info_dict is incomplete')
+                return
          if 'skip' in test_case:
              print_skipping(test_case['skip'])
              return
@@ -77,35 +88,47 @@ def generator(test_case):
                  finished_hook_called.add(status['filename'])
          ydl.fd.add_progress_hook(_hook)
  
+        def get_tc_filename(tc):
+            return tc.get('file') or ydl.prepare_filename(tc.get('info_dict', {}))
+
          test_cases = test_case.get('playlist', [test_case])
-        for tc in test_cases:
-            try_rm(tc['file'])
-            try_rm(tc['file'] + '.part')
-            try_rm(tc['file'] + '.info.json')
+        def try_rm_tcs_files():
+            for tc in test_cases:
+                tc_filename = get_tc_filename(tc)
+                try_rm(tc_filename)
+                try_rm(tc_filename + '.part')
+                try_rm(tc_filename + '.info.json')
+        try_rm_tcs_files()
          try:
-            for retry in range(1, RETRIES + 1):
+            try_num = 1
+            while True:
                  try:
                      ydl.download([test_case['url']])
                  except (DownloadError, ExtractorError) as err:
-                    if retry == RETRIES: raise
-
                      # Check if the exception is not a network related one
-                    if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
+                    if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError) or (err.exc_info[0] == compat_HTTPError and err.exc_info[1].code == 503):
                          raise
  
-                    print('Retrying: {0} failed tries\n\n##########\n\n'.format(retry))
+                    if try_num == RETRIES:
+                        report_warning(u'Failed due to network errors, skipping...')
+                        return
+
+                    print('Retrying: {0} failed tries\n\n##########\n\n'.format(try_num))
+
+                    try_num += 1
                  else:
                      break
  
              for tc in test_cases:
+                tc_filename = get_tc_filename(tc)
                  if not test_case.get('params', {}).get('skip_download', False):
-                    self.assertTrue(os.path.exists(tc['file']), msg='Missing file ' + tc['file'])
-                    self.assertTrue(tc['file'] in finished_hook_called)
-                self.assertTrue(os.path.exists(tc['file'] + '.info.json'))
+                    self.assertTrue(os.path.exists(tc_filename), msg='Missing file ' + tc_filename)
+                    self.assertTrue(tc_filename in finished_hook_called)
+                self.assertTrue(os.path.exists(tc_filename + '.info.json'))
                  if 'md5' in tc:
-                    md5_for_file = _file_md5(tc['file'])
+                    md5_for_file = _file_md5(tc_filename)
                      self.assertEqual(md5_for_file, tc['md5'])
-                with io.open(tc['file'] + '.info.json', encoding='utf-8') as infof:
+                with io.open(tc_filename + '.info.json', encoding='utf-8') as infof:
                      info_dict = json.load(infof)
                  for (info_field, expected) in tc.get('info_dict', {}).items():
                      if isinstance(expected, compat_str) and expected.startswith('md5:'):
@@ -125,11 +148,11 @@ def generator(test_case):
                  # Check for the presence of mandatory fields
                  for key in ('id', 'url', 'title', 'ext'):
                      self.assertTrue(key in info_dict.keys() and info_dict[key])
+                # Check for mandatory fields that are automatically set by YoutubeDL
+                for key in ['webpage_url', 'extractor', 'extractor_key']:
+                    self.assertTrue(info_dict.get(key), u'Missing field: %s' % key)
          finally:
-            for tc in test_cases:
-                try_rm(tc['file'])
-                try_rm(tc['file'] + '.part')
-                try_rm(tc['file'] + '.info.json')
+            try_rm_tcs_files()
  
      return test_template
  
diff --git a/test/test_playlists.py b/test/test_playlists.py

index d6a8d56df99609e50ea5885d2f5a3eb48b72cf37..de1e8d88edc647806e53a17574ad4415f09563cc 100644 (file)
--- a/test/test_playlists.py
+++ b/test/test_playlists.py
@@ -20,6 +20,7 @@ from youtube_dl.extractor import (
      SoundcloudUserIE,
      LivestreamIE,
      NHLVideocenterIE,
+    BambuserChannelIE,
  )
  
  
@@ -85,5 +86,13 @@ class TestPlaylists(unittest.TestCase):
          self.assertEqual(result['title'], u'Highlights')
          self.assertEqual(len(result['entries']), 12)
  
+    def test_bambuser_channel(self):
+        dl = FakeYDL()
+        ie = BambuserChannelIE(dl)
+        result = ie.extract('http://bambuser.com/channel/pixelversity')
+        self.assertIsPlaylist(result)
+        self.assertEqual(result['title'], u'pixelversity')
+        self.assertTrue(len(result['entries']) >= 66)
+
  if __name__ == '__main__':
      unittest.main()
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py

index 3132958397847a72d569f19b0b3887dfa3a540cd..86a6fd043ea0b1e0cb65764d18a28a71d19bb6da 100644 (file)
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -272,7 +272,7 @@ class YoutubeDL(object):
                  autonumber_size = 5
              autonumber_templ = u'%0' + str(autonumber_size) + u'd'
              template_dict['autonumber'] = autonumber_templ % self._num_downloads
-            if template_dict['playlist_index'] is not None:
+            if template_dict.get('playlist_index') is not None:
                  template_dict['playlist_index'] = u'%05d' % template_dict['playlist_index']
  
              sanitize = lambda k, v: sanitize_filename(
@@ -318,6 +318,12 @@ class YoutubeDL(object):
                      % info_dict)
          return None
  
+    @staticmethod
+    def add_extra_info(info_dict, extra_info):
+        '''Set the keys from extra_info in info dict if they are missing'''
+        for key, value in extra_info.items():
+            info_dict.setdefault(key, value)
+
      def extract_info(self, url, download=True, ie_key=None, extra_info={}):
          '''
          Returns a list with a dictionary for each video we find.
@@ -344,17 +350,17 @@ class YoutubeDL(object):
                      break
                  if isinstance(ie_result, list):
                      # Backwards compatibility: old IE result format
-                    for result in ie_result:
-                        result.update(extra_info)
                      ie_result = {
                          '_type': 'compat_list',
                          'entries': ie_result,
                      }
-                else:
-                    ie_result.update(extra_info)
-                if 'extractor' not in ie_result:
-                    ie_result['extractor'] = ie.IE_NAME
-                return self.process_ie_result(ie_result, download=download)
+                self.add_extra_info(ie_result,
+                    {
+                        'extractor': ie.IE_NAME,
+                        'webpage_url': url,
+                        'extractor_key': ie.ie_key(),
+                    })
+                return self.process_ie_result(ie_result, download, extra_info)
              except ExtractorError as de: # An error we somewhat expected
                  self.report_error(compat_str(de), de.format_traceback())
                  break
@@ -378,7 +384,7 @@ class YoutubeDL(object):
  
          result_type = ie_result.get('_type', 'video') # If not given we suppose it's a video, support the default old system
          if result_type == 'video':
-            ie_result.update(extra_info)
+            self.add_extra_info(ie_result, extra_info)
              return self.process_video_result(ie_result)
          elif result_type == 'url':
              # We have to add extra_info to the results because it may be
@@ -388,6 +394,7 @@ class YoutubeDL(object):
                                       ie_key=ie_result.get('ie_key'),
                                       extra_info=extra_info)
          elif result_type == 'playlist':
+            self.add_extra_info(ie_result, extra_info)
              # We process each entry in the playlist
              playlist = ie_result.get('title', None) or ie_result.get('id', None)
              self.to_screen(u'[download] Downloading playlist: %s' % playlist)
@@ -413,12 +420,10 @@ class YoutubeDL(object):
                  extra = {
                      'playlist': playlist,
                      'playlist_index': i + playliststart,
+                    'extractor': ie_result['extractor'],
+                    'webpage_url': ie_result['webpage_url'],
+                    'extractor_key': ie_result['extractor_key'],
                  }
-                if not 'extractor' in entry:
-                    # We set the extractor, if it's an url it will be set then to
-                    # the new extractor, but if it's already a video we must make
-                    # sure it's present: see issue #877
-                    entry['extractor'] = ie_result['extractor']
                  entry_result = self.process_ie_result(entry,
                                                        download=download,
                                                        extra_info=extra)
@@ -427,10 +432,15 @@ class YoutubeDL(object):
              return ie_result
          elif result_type == 'compat_list':
              def _fixup(r):
-                r.setdefault('extractor', ie_result['extractor'])
+                self.add_extra_info(r,
+                    {
+                        'extractor': ie_result['extractor'],
+                        'webpage_url': ie_result['webpage_url'],
+                        'extractor_key': ie_result['extractor_key'],
+                    })
                  return r
              ie_result['entries'] = [
-                self.process_ie_result(_fixup(r), download=download)
+                self.process_ie_result(_fixup(r), download, extra_info)
                  for r in ie_result['entries']
              ]
              return ie_result
@@ -482,7 +492,7 @@ class YoutubeDL(object):
                  format['format'] = u'{id} - {res}{note}'.format(
                      id=format['format_id'],
                      res=self.format_resolution(format),
-                    note=u' ({})'.format(format['format_note']) if format.get('format_note') is not None else '',
+                    note=u' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
                  )
              # Automatically determine file extension if missing
              if 'ext' not in format:
@@ -759,6 +769,8 @@ class YoutubeDL(object):
  
      @staticmethod
      def format_resolution(format, default='unknown'):
+        if format.get('_resolution') is not None:
+            return format['_resolution']
          if format.get('height') is not None:
              if format.get('width') is not None:
                  res = u'%sx%s' % (format['width'], format['height'])
@@ -769,19 +781,23 @@ class YoutubeDL(object):
          return res
  
      def list_formats(self, info_dict):
-        formats_s = []
-        for format in info_dict.get('formats', [info_dict]):
-            formats_s.append(u'%-15s%-7s     %-15s%s' % (
+        def line(format):
+            return (u'%-15s%-10s%-12s%s' % (
                  format['format_id'],
                  format['ext'],
-                format.get('format_note', ''),
                  self.format_resolution(format),
+                format.get('format_note', ''),
                  )
              )
-        if len(formats_s) != 1:
-            formats_s[0] += ' (worst)'
-            formats_s[-1] += ' (best)'
-        formats_s = "\n".join(formats_s)
-        self.to_screen(u'[info] Available formats for %s:\n'
-            u'format code    extension   note           resolution\n%s' % (
-                info_dict['id'], formats_s))
+
+        formats = info_dict.get('formats', [info_dict])
+        formats_s = list(map(line, formats))
+        if len(formats) > 1:
+            formats_s[0] += (' ' if formats[0].get('format_note') else '') + '(worst)'
+            formats_s[-1] += (' ' if formats[-1].get('format_note') else '') + '(best)'
+
+        header_line = line({
+            'format_id': u'format code', 'ext': u'extension',
+            '_resolution': u'resolution', 'format_note': u'note'})
+        self.to_screen(u'[info] Available formats for %s:\n%s\n%s' %
+                       (info_dict['id'], header_line, u"\n".join(formats_s)))
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py

index 7efd097e4d24a31b523570e074047b4bccb944c0..8dad38a007dfcab9767b7f790d8f1af794d44580 100644 (file)
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -9,6 +9,7 @@ from .arte import (
      ArteTVFutureIE,
  )
  from .auengine import AUEngineIE
+from .bambuser import BambuserIE, BambuserChannelIE
  from .bandcamp import BandcampIE
  from .bliptv import BlipTVIE, BlipTVUserIE
  from .bloomberg import BloombergIE
@@ -39,6 +40,7 @@ from .ehow import EHowIE
  from .eighttracks import EightTracksIE
  from .escapist import EscapistIE
  from .exfm import ExfmIE
+from .extremetube import ExtremeTubeIE
  from .facebook import FacebookIE
  from .faz import FazIE
  from .fktv import (
@@ -83,6 +85,7 @@ from .mit import TechTVMITIE, MITIE
  from .mixcloud import MixcloudIE
  from .mtv import MTVIE
  from .muzu import MuzuTVIE
+from .myspace import MySpaceIE
  from .myspass import MySpassIE
  from .myvideo import MyVideoIE
  from .naver import NaverIE
@@ -141,6 +144,7 @@ from .videofyme import VideofyMeIE
  from .videopremium import VideoPremiumIE
  from .vimeo import VimeoIE, VimeoChannelIE
  from .vine import VineIE
+from .vk import VKIE
  from .wat import WatIE
  from .websurg import WeBSurgIE
  from .weibo import WeiboIE
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py

index d39b489518f7bc699854d9ddeb7cd4fa357a6c2b..e10c74c112a0b7bbb9335e76cdff88524c4cbafb 100644 (file)
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -158,7 +158,9 @@ class ArteTVPlus7IE(InfoExtractor):
              'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
          }
  
-        formats = player_info['VSR'].values()
+        all_formats = player_info['VSR'].values()
+        # Some formats use the m3u8 protocol
+        all_formats = list(filter(lambda f: f.get('videoFormat') != 'M3U8', all_formats))
          def _match_lang(f):
              if f.get('versionCode') is None:
                  return True
@@ -170,11 +172,16 @@ class ArteTVPlus7IE(InfoExtractor):
              regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l]
              return any(re.match(r, f['versionCode']) for r in regexes)
          # Some formats may not be in the same language as the url
-        formats = filter(_match_lang, formats)
-        # Some formats use the m3u8 protocol
-        formats = filter(lambda f: f.get('videoFormat') != 'M3U8', formats)
-        # We order the formats by quality
+        formats = filter(_match_lang, all_formats)
          formats = list(formats) # in python3 filter returns an iterator
+        if not formats:
+            # Some videos are only available in the 'Originalversion'
+            # they aren't tagged as being in French or German
+            if all(f['versionCode'] == 'VO' for f in all_formats):
+                formats = all_formats
+            else:
+                raise ExtractorError(u'The formats list is empty')
+        # We order the formats by quality
          if re.match(r'[A-Z]Q', formats[0]['quality']) is not None:
              sort_key = lambda f: ['HQ', 'MQ', 'EQ', 'SQ'].index(f['quality'])
          else:
diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py

new file mode 100644 (file)

index 0000000..f3b36f4
--- /dev/null
+++ b/youtube_dl/extractor/bambuser.py
@@ -0,0 +1,80 @@
+import re
+import json
+import itertools
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_request,
+)
+
+
+class BambuserIE(InfoExtractor):
+    IE_NAME = u'bambuser'
+    _VALID_URL = r'https?://bambuser\.com/v/(?P<id>\d+)'
+    _API_KEY = '005f64509e19a868399060af746a00aa'
+
+    _TEST = {
+        u'url': u'http://bambuser.com/v/4050584',
+        u'md5': u'fba8f7693e48fd4e8641b3fd5539a641',
+        u'info_dict': {
+            u'id': u'4050584',
+            u'ext': u'flv',
+            u'title': u'Education engineering days - lightning talks',
+            u'duration': 3741,
+            u'uploader': u'pixelversity',
+            u'uploader_id': u'344706',
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        info_url = ('http://player-c.api.bambuser.com/getVideo.json?'
+            '&api_key=%s&vid=%s' % (self._API_KEY, video_id))
+        info_json = self._download_webpage(info_url, video_id)
+        info = json.loads(info_json)['result']
+
+        return {
+            'id': video_id,
+            'title': info['title'],
+            'url': info['url'],
+            'thumbnail': info.get('preview'),
+            'duration': int(info['length']),
+            'view_count': int(info['views_total']),
+            'uploader': info['username'],
+            'uploader_id': info['uid'],
+        }
+
+
+class BambuserChannelIE(InfoExtractor):
+    IE_NAME = u'bambuser:channel'
+    _VALID_URL = r'http://bambuser.com/channel/(?P<user>.*?)(?:/|#|\?|$)'
+    # The maximum number we can get with each request
+    _STEP = 50
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        user = mobj.group('user')
+        urls = []
+        last_id = ''
+        for i in itertools.count(1):
+            req_url = ('http://bambuser.com/xhr-api/index.php?username={user}'
+                '&sort=created&access_mode=0%2C1%2C2&limit={count}'
+                '&method=broadcast&format=json&vid_older_than={last}'
+                ).format(user=user, count=self._STEP, last=last_id)
+            req = compat_urllib_request.Request(req_url)
+            # Without setting this header, we wouldn't get any result
+            req.add_header('Referer', 'http://bambuser.com/channel/%s' % user)
+            info_json = self._download_webpage(req, user,
+                u'Downloading page %d' % i)
+            results = json.loads(info_json)['result']
+            if len(results) == 0:
+                break
+            last_id = results[-1]['vid']
+            urls.extend(self.url_result(v['page'], 'Bambuser') for v in results)
+
+        return {
+            '_type': 'playlist',
+            'title': user,
+            'entries': urls,
+        }
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py

index 1392f382a24c273604f0c67db7afafefbcec85b8..0d9b87a34c7adf1cebe045c70c298a394612e6db 100644 (file)
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -23,7 +23,7 @@ class BrightcoveIE(InfoExtractor):
              # From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/
              u'url': u'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001',
              u'file': u'2371591881001.mp4',
-            u'md5': u'9e80619e0a94663f0bdc849b4566af19',
+            u'md5': u'8eccab865181d29ec2958f32a6a754f5',
              u'note': u'Test Brightcove downloads and detection in GenericIE',
              u'info_dict': {
                  u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”',
@@ -122,12 +122,10 @@ class BrightcoveIE(InfoExtractor):
              best_format = renditions[-1]
              info.update({
                  'url': best_format['defaultURL'],
-                'ext': 'mp4',
              })
          elif video_info.get('FLVFullLengthURL') is not None:
              info.update({
                  'url': video_info['FLVFullLengthURL'],
-                'ext': 'flv',
              })
          else:
              raise ExtractorError(u'Unable to extract video url for %s' % info['id'])
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index ce349fe207af21690c95b86dd7a293220ac7025e..e0ccba533534709b6ce65af518ca16fadee0c4ea 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -63,7 +63,7 @@ class InfoExtractor(object):
                      * ext       Will be calculated from url if missing
                      * format    A human-readable description of the format
                                  ("mp4 container with h264/opus").
-                                Calculated from the format_id, width, height 
+                                Calculated from the format_id, width, height.
                                  and format_note fields if missing.
                      * format_id A short description of the format
                                  ("mp4_h264_opus" or "19")
@@ -71,6 +71,9 @@ class InfoExtractor(object):
                                  ("3D" or "DASH video")
                      * width     Width of the video, if known
                      * height    Height of the video, if known
+    webpage_url:    The url to the video webpage, if given to youtube-dl it
+                    should allow to get the same result again. (It will be set
+                    by YoutubeDL if it's missing)
  
      Unless mentioned otherwise, the fields should be Unicode strings.
  
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py

index 4c0488245cd60df341df74110a434ed554fa304e..355b4ed0a028540c81a486ac4f389d50cbf48987 100644 (file)
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -21,6 +21,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor):
          """Build a request with the family filter disabled"""
          request = compat_urllib_request.Request(url)
          request.add_header('Cookie', 'family_filter=off')
+        request.add_header('Cookie', 'ff=off')
          return request
  
  class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
@@ -61,6 +62,18 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
              },
              u'skip': u'VEVO is only available in some countries',
          },
+        # age-restricted video
+        {
+            u'url': u'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband',
+            u'file': u'xyh2zz.mp4',
+            u'md5': u'0d667a7b9cebecc3c89ee93099c4159d',
+            u'info_dict': {
+                u'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]',
+                u'uploader': 'HotWaves1012',
+                u'age_limit': 18,
+            }
+
+        }
      ]
  
      def _real_extract(self, url):
@@ -90,7 +103,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
          video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
                                               # Looking for official user
                                               r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
-                                            webpage, 'video uploader')
+                                            webpage, 'video uploader', fatal=False)
+        age_limit = self._rta_search(webpage)
  
          video_upload_date = None
          mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
@@ -132,15 +146,16 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
              self._list_available_subtitles(video_id)
              return
  
-        return [{
+        return {
              'id':       video_id,
              'formats': formats,
              'uploader': video_uploader,
              'upload_date':  video_upload_date,
              'title':    self._og_search_title(webpage),
              'subtitles':    video_subtitles,
-            'thumbnail': info['thumbnail_url']
-        }]
+            'thumbnail': info['thumbnail_url'],
+            'age_limit': age_limit,
+        }
  
      def _get_available_subtitles(self, video_id):
          try:
diff --git a/youtube_dl/extractor/exfm.py b/youtube_dl/extractor/exfm.py

index c7455657907f7c3a67e4544b7e25819c29042dc5..a51d79b08c656144c3f67d853fcae8fe52bc6e1f 100644 (file)
--- a/youtube_dl/extractor/exfm.py
+++ b/youtube_dl/extractor/exfm.py
@@ -21,6 +21,7 @@ class ExfmIE(InfoExtractor):
                  u'description': u'Test House \"Love Is Not Enough\" (Extended Mix) DeadJournalist Exclusive',
              },
              u'note': u'Soundcloud song',
+            u'skip': u'The site is down too often',
          },
          {
              u'url': u'http://ex.fm/song/wddt8',
@@ -30,6 +31,7 @@ class ExfmIE(InfoExtractor):
                  u'title': u'Safe and Sound',
                  u'uploader': u'Capital Cities',
              },
+            u'skip': u'The site is down too often',
          },
      ]
  
diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py

new file mode 100644 (file)

index 0000000..0f1eec4
--- /dev/null
+++ b/youtube_dl/extractor/extremetube.py
@@ -0,0 +1,50 @@
+import os
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_parse_urlparse,
+    compat_urllib_request,
+    compat_urllib_parse,
+)
+
+class ExtremeTubeIE(InfoExtractor):
+    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>extremetube\.com/video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)'
+    _TEST = {
+        u'url': u'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431',
+        u'file': u'652431.mp4',
+        u'md5': u'1fb9228f5e3332ec8c057d6ac36f33e0',
+        u'info_dict': {
+            u"title": u"Music Video 14 british euro brit european cumshots swallow",
+            u"uploader": u"unknown",
+            u"age_limit": 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('videoid')
+        url = 'http://www.' + mobj.group('url')
+
+        req = compat_urllib_request.Request(url)
+        req.add_header('Cookie', 'age_verified=1')
+        webpage = self._download_webpage(req, video_id)
+
+        video_title = self._html_search_regex(r'<h1 [^>]*?title="([^"]+)"[^>]*>\1<', webpage, u'title')
+        uploader = self._html_search_regex(r'>Posted by:(?=<)(?:\s|<[^>]*>)*(.+?)\|', webpage, u'uploader', fatal=False)
+        video_url = compat_urllib_parse.unquote(self._html_search_regex(r'video_url=(.+?)&amp;', webpage, u'video_url'))
+        path = compat_urllib_parse_urlparse( video_url ).path
+        extension = os.path.splitext( path )[1][1:]
+        format = path.split('/')[5].split('_')[:2]
+        format = "-".join( format )
+
+        return {
+            'id': video_id,
+            'title': video_title,
+            'uploader': uploader,
+            'url': video_url,
+            'ext': extension,
+            'format': format,
+            'format_id': format,
+            'age_limit': 18,
+        }
diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py

index 5e05900da211eb9d2d6e33706a71f47dcf1abff7..7869244452ac4b8dab673ce82e29b6b737aa3bc2 100644 (file)
--- a/youtube_dl/extractor/keezmovies.py
+++ b/youtube_dl/extractor/keezmovies.py
@@ -12,7 +12,7 @@ from ..aes import (
  )
  
  class KeezMoviesIE(InfoExtractor):
-    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>keezmovies\.com/video/.+?(?P<videoid>[0-9]+))'
+    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>keezmovies\.com/video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)'
      _TEST = {
          u'url': u'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711',
          u'file': u'1214711.mp4',
diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py

index d04da98c89ed582e83e8bb905b15ff04c78d3018..4531fd6ab23a958d3d4dc4e38d52e7f330d85196 100644 (file)
--- a/youtube_dl/extractor/livestream.py
+++ b/youtube_dl/extractor/livestream.py
@@ -40,13 +40,9 @@ class LivestreamIE(InfoExtractor):
  
          if video_id is None:
              # This is an event page:
-            player = get_meta_content('twitter:player', webpage)
-            if player is None:
-                raise ExtractorError('Couldn\'t extract event api url')
-            api_url = player.replace('/player', '')
-            api_url = re.sub(r'^(https?://)(new\.)', r'\1api.\2', api_url)
-            info = json.loads(self._download_webpage(api_url, event_name,
-                                                     u'Downloading event info'))
+            config_json = self._search_regex(r'window.config = ({.*?});',
+                webpage, u'window config')
+            info = json.loads(config_json)['event']
              videos = [self._extract_video_info(video_data['data'])
                  for video_data in info['feed']['data'] if video_data['type'] == u'video']
              return self.playlist_result(videos, info['id'], info['full_name'])
diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py

index 234b9e80f3bf3fac48ff77f4c8b1ac9da7de2c7e..91480ba875d5fff781ce08a47c41a3824e94e910 100644 (file)
--- a/youtube_dl/extractor/metacafe.py
+++ b/youtube_dl/extractor/metacafe.py
@@ -20,7 +20,9 @@ class MetacafeIE(InfoExtractor):
      _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
      _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
      IE_NAME = u'metacafe'
-    _TESTS = [{
+    _TESTS = [
+    # Youtube video
+    {
          u"add_ie": ["Youtube"],
          u"url":  u"http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/",
          u"file":  u"_aUehQsCQtM.mp4",
@@ -32,15 +34,42 @@ class MetacafeIE(InfoExtractor):
              u"uploader_id": u"PBS"
          }
      },
+    # Normal metacafe video
+    {
+        u'url': u'http://www.metacafe.com/watch/11121940/news_stuff_you_wont_do_with_your_playstation_4/',
+        u'md5': u'6e0bca200eaad2552e6915ed6fd4d9ad',
+        u'info_dict': {
+            u'id': u'11121940',
+            u'ext': u'mp4',
+            u'title': u'News: Stuff You Won\'t Do with Your PlayStation 4',
+            u'uploader': u'ign',
+            u'description': u'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.',
+        },
+    },
+    # AnyClip video
      {
          u"url": u"http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/",
          u"file": u"an-dVVXnuY7Jh77J.mp4",
          u"info_dict": {
              u"title": u"The Andromeda Strain (1971): Stop the Bomb Part 3",
              u"uploader": u"anyclip",
-            u"description": u"md5:38c711dd98f5bb87acf973d573442e67"
-        }
-    }]
+            u"description": u"md5:38c711dd98f5bb87acf973d573442e67",
+        },
+    },
+    # age-restricted video
+    {
+        u'url': u'http://www.metacafe.com/watch/5186653/bbc_internal_christmas_tape_79_uncensored_outtakes_etc/',
+        u'md5': u'98dde7c1a35d02178e8ab7560fe8bd09',
+        u'info_dict': {
+            u'id': u'5186653',
+            u'ext': u'mp4',
+            u'title': u'BBC INTERNAL Christmas Tape \'79 - UNCENSORED Outtakes, Etc.',
+            u'uploader': u'Dwayne Pipe',
+            u'description': u'md5:950bf4c581e2c059911fa3ffbe377e4b',
+            u'age_limit': 18,
+        },
+    },
+    ]
  
  
      def report_disclaimer(self):
@@ -62,6 +91,7 @@ class MetacafeIE(InfoExtractor):
              'submit': "Continue - I'm over 18",
              }
          request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
+        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
          try:
              self.report_age_confirmation()
              compat_urllib_request.urlopen(request).read()
@@ -83,7 +113,12 @@ class MetacafeIE(InfoExtractor):
  
          # Retrieve video webpage to extract further information
          req = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
-        req.headers['Cookie'] = 'flashVersion=0;'
+
+        # AnyClip videos require the flashversion cookie so that we get the link
+        # to the mp4 file
+        mobj_an = re.match(r'^an-(.*?)$', video_id)
+        if mobj_an:
+            req.headers['Cookie'] = 'flashVersion=0;'
          webpage = self._download_webpage(req, video_id)
  
          # Extract URL, uploader and title from webpage
@@ -125,6 +160,11 @@ class MetacafeIE(InfoExtractor):
                  r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);',
                  webpage, u'uploader nickname', fatal=False)
  
+        if re.search(r'"contentRating":"restricted"', webpage) is not None:
+            age_limit = 18
+        else:
+            age_limit = 0
+
          return {
              '_type':    'video',
              'id':       video_id,
@@ -134,4 +174,5 @@ class MetacafeIE(InfoExtractor):
              'upload_date':  None,
              'title':    video_title,
              'ext':      video_ext,
+            'age_limit': age_limit,
          }
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py

index e520e2bb491f2c55f3867ab214b2b949eca6e684..e96d3952cc79ebe8294302d9795672b7a574590e 100644 (file)
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -80,6 +80,8 @@ class MTVIE(InfoExtractor):
          video_id = self._id_from_uri(uri)
          self.report_extraction(video_id)
          mediagen_url = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))).attrib['url']
+        # Remove the templates, like &device={device}
+        mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', u'', mediagen_url)
          if 'acceptMethods' not in mediagen_url:
              mediagen_url += '&acceptMethods=fms'
          mediagen_page = self._download_webpage(mediagen_url, video_id,
diff --git a/youtube_dl/extractor/myspace.py b/youtube_dl/extractor/myspace.py

new file mode 100644 (file)

index 0000000..050f54a
--- /dev/null
+++ b/youtube_dl/extractor/myspace.py
@@ -0,0 +1,48 @@
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_str,
+)
+
+
+class MySpaceIE(InfoExtractor):
+    _VALID_URL = r'https?://myspace\.com/([^/]+)/video/[^/]+/(?P<id>\d+)'
+
+    _TEST = {
+        u'url': u'https://myspace.com/coldplay/video/viva-la-vida/100008689',
+        u'info_dict': {
+            u'id': u'100008689',
+            u'ext': u'flv',
+            u'title': u'Viva La Vida',
+            u'description': u'The official Viva La Vida video, directed by Hype Williams',
+            u'uploader': u'Coldplay',
+            u'uploader_id': u'coldplay',
+        },
+        u'params': {
+            # rtmp download
+            u'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url, video_id)
+        context = json.loads(self._search_regex(r'context = ({.*?});', webpage,
+            u'context'))
+        video = context['video']
+        rtmp_url, play_path = video['streamUrl'].split(';', 1)
+
+        return {
+            'id': compat_str(video['mediaId']),
+            'title': video['title'],
+            'url': rtmp_url,
+            'play_path': play_path,
+            'ext': 'flv',
+            'description': video['description'],
+            'thumbnail': video['imageUrl'],
+            'uploader': video['artistName'],
+            'uploader_id': video['artistUsername'],
+        }
diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py

index 1c1cc418d29a8897e2a2825492ed7becab75af6b..3f6020f74ec9eeefbddafc184d3f48cf5e436adb 100644 (file)
--- a/youtube_dl/extractor/vevo.py
+++ b/youtube_dl/extractor/vevo.py
@@ -5,7 +5,7 @@ import datetime
  
  from .common import InfoExtractor
  from ..utils import (
-    determine_ext,
+    compat_HTTPError,
      ExtractorError,
  )
  
@@ -16,26 +16,22 @@ class VevoIE(InfoExtractor):
      (currently used by MTVIE)
      """
      _VALID_URL = r'((http://www.vevo.com/watch/.*?/.*?/)|(vevo:))(?P<id>.*?)(\?|$)'
-    _TEST = {
+    _TESTS = [{
          u'url': u'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280',
          u'file': u'GB1101300280.mp4',
+        u"md5": u"06bea460acb744eab74a9d7dcb4bfd61",
          u'info_dict': {
              u"upload_date": u"20130624",
              u"uploader": u"Hurts",
              u"title": u"Somebody to Die For",
-            u'duration': 230,
+            u"duration": 230,
+            u"width": 1920,
+            u"height": 1080,
          }
-    }
+    }]
+    _SMIL_BASE_URL = 'http://smil.lvl3.vevo.com/'
  
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-
-        json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id
-        info_json = self._download_webpage(json_url, video_id, u'Downloading json info')
-
-        self.report_extraction(video_id)
-        video_info = json.loads(info_json)['video']
+    def _formats_from_json(self, video_info):
          last_version = {'version': -1}
          for version in video_info['videoVersions']:
              # These are the HTTP downloads, other types are for different manifests
@@ -50,17 +46,74 @@ class VevoIE(InfoExtractor):
          # Already sorted from worst to best quality
          for rend in renditions.findall('rendition'):
              attr = rend.attrib
-            f_url = attr['url']
+            format_note = '%(videoCodec)s@%(videoBitrate)4sk, %(audioCodec)s@%(audioBitrate)3sk' % attr
              formats.append({
-                'url': f_url,
-                'ext': determine_ext(f_url),
+                'url': attr['url'],
+                'format_id': attr['name'],
+                'format_note': format_note,
                  'height': int(attr['frameheight']),
                  'width': int(attr['frameWidth']),
              })
+        return formats
+
+    def _formats_from_smil(self, smil_xml):
+        formats = []
+        smil_doc = xml.etree.ElementTree.fromstring(smil_xml.encode('utf-8'))
+        els = smil_doc.findall('.//{http://www.w3.org/2001/SMIL20/Language}video')
+        for el in els:
+            src = el.attrib['src']
+            m = re.match(r'''(?xi)
+                (?P<ext>[a-z0-9]+):
+                (?P<path>
+                    [/a-z0-9]+     # The directory and main part of the URL
+                    _(?P<cbr>[0-9]+)k
+                    _(?P<width>[0-9]+)x(?P<height>[0-9]+)
+                    _(?P<vcodec>[a-z0-9]+)
+                    _(?P<vbr>[0-9]+)
+                    _(?P<acodec>[a-z0-9]+)
+                    _(?P<abr>[0-9]+)
+                    \.[a-z0-9]+  # File extension
+                )''', src)
+            if not m:
+                continue
  
-        date_epoch = int(self._search_regex(
-            r'/Date\((\d+)\)/', video_info['launchDate'], u'launch date'))/1000
-        upload_date = datetime.datetime.fromtimestamp(date_epoch)
+            format_url = self._SMIL_BASE_URL + m.group('path')
+            format_note = ('%(vcodec)s@%(vbr)4sk, %(acodec)s@%(abr)3sk' %
+                           m.groupdict())
+            formats.append({
+                'url': format_url,
+                'format_id': u'SMIL_' + m.group('cbr'),
+                'format_note': format_note,
+                'ext': m.group('ext'),
+                'width': int(m.group('width')),
+                'height': int(m.group('height')),
+            })
+        return formats
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id
+        info_json = self._download_webpage(json_url, video_id, u'Downloading json info')
+        video_info = json.loads(info_json)['video']
+
+        formats = self._formats_from_json(video_info)
+        try:
+            smil_url = '%s/Video/V2/VFILE/%s/%sr.smil' % (
+                self._SMIL_BASE_URL, video_id, video_id.lower())
+            smil_xml = self._download_webpage(smil_url, video_id,
+                                              u'Downloading SMIL info')
+            formats.extend(self._formats_from_smil(smil_xml))
+        except ExtractorError as ee:
+            if not isinstance(ee.cause, compat_HTTPError):
+                raise
+            self._downloader.report_warning(
+                u'Cannot download SMIL information, falling back to JSON ..')
+
+        timestamp_ms = int(self._search_regex(
+            r'/Date\((\d+)\)/', video_info['launchDate'], u'launch date'))
+        upload_date = datetime.datetime.fromtimestamp(timestamp_ms // 1000)
          info = {
              'id': video_id,
              'title': video_info['title'],
@@ -71,7 +124,4 @@ class VevoIE(InfoExtractor):
              'duration': video_info['duration'],
          }
  
-        # TODO: Remove when #980 has been merged
-        info.update(formats[-1])
-
          return info
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py

index b4dbcd2eebec27b19e9c91ff7ee81d20208d62fc..62273fd33814901ca3b3c799e0589e6c14985fc3 100644 (file)
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -20,14 +20,14 @@ class VimeoIE(InfoExtractor):
      """Information extractor for vimeo.com."""
  
      # _VALID_URL matches Vimeo URLs
-    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)/?(?:[?].*)?(?:#.*)?$'
+    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|(?P<player>player))\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)/?(?:[?].*)?(?:#.*)?$'
      _NETRC_MACHINE = 'vimeo'
      IE_NAME = u'vimeo'
      _TESTS = [
          {
              u'url': u'http://vimeo.com/56015672#at=0',
              u'file': u'56015672.mp4',
-            u'md5': u'ae7a1d8b183758a0506b0622f37dfa14',
+            u'md5': u'8879b6cc097e987f02484baf890129e5',
              u'info_dict': {
                  u"upload_date": u"20121220", 
                  u"description": u"This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", 
@@ -128,11 +128,9 @@ class VimeoIE(InfoExtractor):
              raise ExtractorError(u'Invalid URL: %s' % url)
  
          video_id = mobj.group('id')
-        if not mobj.group('proto'):
-            url = 'https://' + url
-        elif mobj.group('pro'):
+        if mobj.group('pro') or mobj.group('player'):
              url = 'http://player.vimeo.com/video/' + video_id
-        elif mobj.group('direct_link'):
+        else:
              url = 'https://vimeo.com/' + video_id
  
          # Retrieve video webpage to extract further information
@@ -234,7 +232,7 @@ class VimeoIE(InfoExtractor):
          if len(formats) == 0:
              raise ExtractorError(u'No known codec found')
  
-        return [{
+        return {
              'id':       video_id,
              'uploader': video_uploader,
              'uploader_id': video_uploader_id,
@@ -243,7 +241,8 @@ class VimeoIE(InfoExtractor):
              'thumbnail':    video_thumbnail,
              'description':  video_description,
              'formats': formats,
-        }]
+            'webpage_url': url,
+        }
  
  
  class VimeoChannelIE(InfoExtractor):
diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py

new file mode 100644 (file)

index 0000000..90d8a6d
--- /dev/null
+++ b/youtube_dl/extractor/vk.py
@@ -0,0 +1,45 @@
+# encoding: utf-8
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_str,
+    unescapeHTML,
+)
+
+
+class VKIE(InfoExtractor):
+    IE_NAME = u'vk.com'
+    _VALID_URL = r'https?://vk\.com/(?:videos.*?\?.*?z=)?video(?P<id>.*?)(?:\?|%2F|$)'
+
+    _TEST = {
+        u'url': u'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521',
+        u'md5': u'0deae91935c54e00003c2a00646315f0',
+        u'info_dict': {
+            u'id': u'162222515',
+            u'ext': u'flv',
+            u'title': u'ProtivoGunz - Хуёвая песня',
+            u'uploader': u'Noize MC',
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        info_url = 'http://vk.com/al_video.php?act=show&al=1&video=%s' % video_id
+        info_page = self._download_webpage(info_url, video_id)
+        m_yt = re.search(r'src="(http://www.youtube.com/.*?)"', info_page)
+        if m_yt is not None:
+            self.to_screen(u'Youtube video detected')
+            return self.url_result(m_yt.group(1), 'Youtube')
+        vars_json = self._search_regex(r'var vars = ({.*?});', info_page, u'vars')
+        vars = json.loads(vars_json)
+
+        return {
+            'id': compat_str(vars['vid']),
+            'url': vars['url240'],
+            'title': unescapeHTML(vars['md_title']),
+            'thumbnail': vars['jpg'],
+            'uploader': vars['md_author'],
+        }
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py

index d05d0a8c13cc65034723036cef07e5ac6ca899f3..6ddd6ef06791d52de1245dc1d1744f32be4d96f5 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -74,14 +74,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
              self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
              return False
  
-        galx = None
-        dsh = None
-        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
-        if match:
-          galx = match.group(1)
-        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
-        if match:
-          dsh = match.group(1)
+        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
+                                  login_page, u'Login GALX parameter')
  
          # Log in
          login_form_strs = {
@@ -95,7 +89,6 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
                  u'checkConnection': u'',
                  u'checkedDomains': u'youtube',
                  u'dnConn': u'',
-                u'dsh': dsh,
                  u'pstMsg': u'0',
                  u'rmShown': u'1',
                  u'secTok': u'',
@@ -346,18 +339,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                  u"description": u"test chars:  \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
              }
          },
-        {
-            u"url":  u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
-            u"file":  u"1ltcDfZMA3U.mp4",
-            u"note": u"Test VEVO video (#897)",
-            u"info_dict": {
-                u"upload_date": u"20070518",
-                u"title": u"Maps - It Will Find You",
-                u"description": u"Music video by Maps performing It Will Find You.",
-                u"uploader": u"MuteUSA",
-                u"uploader_id": u"MuteUSA"
-            }
-        },
          {
              u"url":  u"http://www.youtube.com/watch?v=UxxajLWwzqY",
              u"file":  u"UxxajLWwzqY.mp4",
@@ -1118,7 +1099,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                  'lang': lang,
                  'v': video_id,
                  'fmt': self._downloader.params.get('subtitlesformat'),
-                'name': l[0],
+                'name': l[0].encode('utf-8'),
              })
              url = u'http://www.youtube.com/api/timedtext?' + params
              sub_lang_list[lang] = url
@@ -1504,7 +1485,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                  'subtitles':    video_subtitles,
                  'duration':     video_duration,
                  'age_limit':    18 if age_gate else 0,
-                'annotations':  video_annotations
+                'annotations':  video_annotations,
+                'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
              })
          return results
  
diff --git a/youtube_dl/version.py b/youtube_dl/version.py

index 048afc8e7e513f9463747b7fe8f39a94461b4323..75a46a2d55d41171fc6664046ab3631d9e7ae4d3 100644 (file)
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,2 +1,2 @@
  
-__version__ = '2013.10.28'
+__version__ = '2013.11.02'
author	Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
	Sun, 3 Nov 2013 11:28:02 +0000 (03:28 -0800)
committer	Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
	Sun, 3 Nov 2013 11:28:02 +0000 (03:28 -0800)
setup.py		patch \| blob \| history
test/helper.py		patch \| blob \| history
test/test_YoutubeDL.py		patch \| blob \| history
test/test_dailymotion_subtitles.py		patch \| blob \| history
test/test_download.py		patch \| blob \| history
test/test_playlists.py		patch \| blob \| history
youtube_dl/YoutubeDL.py		patch \| blob \| history
youtube_dl/extractor/__init__.py		patch \| blob \| history
youtube_dl/extractor/arte.py		patch \| blob \| history
youtube_dl/extractor/bambuser.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/brightcove.py		patch \| blob \| history
youtube_dl/extractor/common.py		patch \| blob \| history
youtube_dl/extractor/dailymotion.py		patch \| blob \| history
youtube_dl/extractor/exfm.py		patch \| blob \| history
youtube_dl/extractor/extremetube.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/keezmovies.py		patch \| blob \| history
youtube_dl/extractor/livestream.py		patch \| blob \| history
youtube_dl/extractor/metacafe.py		patch \| blob \| history
youtube_dl/extractor/mtv.py		patch \| blob \| history
youtube_dl/extractor/myspace.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/vevo.py		patch \| blob \| history
youtube_dl/extractor/vimeo.py		patch \| blob \| history
youtube_dl/extractor/vk.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/youtube.py		patch \| blob \| history
youtube_dl/version.py		patch \| blob \| history