[criterion] Rely on _match_id, improve regex and add thumbnail to test

[youtube-dl] / youtube_dl / extractor / vk.py
diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py

index bcb7df83d1c64308d0a78e58e4837dbfef7cd0a5..f26e0732c2b0693456acec3e9fb2390b36016d97 100644 (file)
--- a/youtube_dl/extractor/vk.py
+++ b/youtube_dl/extractor/vk.py
@@ -1,6 +1,7 @@
  # encoding: utf-8
  from __future__ import unicode_literals
  
+import collections
  import re
  import json
  import sys
@@ -16,15 +17,15 @@ from ..utils import (
      get_element_by_class,
      int_or_none,
      orderedSet,
-    parse_duration,
      remove_start,
      str_to_int,
      unescapeHTML,
      unified_strdate,
      urlencode_postdata,
  )
-from .vimeo import VimeoIE
+from .dailymotion import DailymotionIE
  from .pladform import PladformIE
+from .vimeo import VimeoIE
  
  
  class VKBaseIE(InfoExtractor):
@@ -52,14 +53,17 @@ class VKBaseIE(InfoExtractor):
          # what actually happens.
          # We will work around this VK issue by resetting the remixlhk cookie to
          # the first one manually.
-        cookies = url_handle.headers.get('Set-Cookie')
-        if sys.version_info[0] >= 3:
-            cookies = cookies.encode('iso-8859-1')
-        cookies = cookies.decode('utf-8')
-        remixlhk = re.search(r'remixlhk=(.+?);.*?\bdomain=(.+?)(?:[,;]|$)', cookies)
-        if remixlhk:
-            value, domain = remixlhk.groups()
-            self._set_cookie(domain, 'remixlhk', value)
+        for header, cookies in url_handle.headers.items():
+            if header.lower() != 'set-cookie':
+                continue
+            if sys.version_info[0] >= 3:
+                cookies = cookies.encode('iso-8859-1')
+            cookies = cookies.decode('utf-8')
+            remixlhk = re.search(r'remixlhk=(.+?);.*?\bdomain=(.+?)(?:[,;]|$)', cookies)
+            if remixlhk:
+                value, domain = remixlhk.groups()
+                self._set_cookie(domain, 'remixlhk', value)
+                break
  
          login_page = self._download_webpage(
              'https://login.vk.com/?act=login', None,
@@ -207,6 +211,23 @@ class VKIE(VKBaseIE):
                  'view_count': int,
              },
          },
+        {
+            # dailymotion embed
+            'url': 'https://vk.com/video-37468416_456239855',
+            'info_dict': {
+                'id': 'k3lz2cmXyRuJQSjGHUv',
+                'ext': 'mp4',
+                'title': 'md5:d52606645c20b0ddbb21655adaa4f56f',
+                'description': 'md5:c651358f03c56f1150b555c26d90a0fd',
+                'uploader': 'AniLibria.Tv',
+                'upload_date': '20160914',
+                'uploader_id': 'x1p5vl5',
+                'timestamp': 1473877246,
+            },
+            'params': {
+                'skip_download': True,
+            }
+        },
          {
              # video key is extra_data not url\d+
              'url': 'http://vk.com/video-110305615_171782105',
@@ -312,6 +333,10 @@ class VKIE(VKBaseIE):
                  m_rutube.group(1).replace('\\', ''))
              return self.url_result(rutube_url)
  
+        dailymotion_urls = DailymotionIE._extract_urls(info_page)
+        if dailymotion_urls:
+            return self.url_result(dailymotion_urls[0], DailymotionIE.ie_key())
+
          m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.+?});', info_page)
          if m_opts:
              m_opts_url = re.search(r"url\s*:\s*'((?!/\b)[^']+)", m_opts.group(1))
@@ -444,6 +469,9 @@ class VKWallPostIE(VKBaseIE):
                  'skip_download': True,
              },
          }],
+        'params': {
+            'usenetrc': True,
+        },
          'skip': 'Requires vk account credentials',
      }, {
          # single YouTube embed, no leading -
@@ -453,6 +481,9 @@ class VKWallPostIE(VKBaseIE):
              'title': 'Sergey Gorbunov - Wall post 85155021_6319',
          },
          'playlist_count': 1,
+        'params': {
+            'usenetrc': True,
+        },
          'skip': 'Requires vk account credentials',
      }, {
          # wall page URL
@@ -480,37 +511,41 @@ class VKWallPostIE(VKBaseIE):
              raise ExtractorError('VK said: %s' % error, expected=True)
  
          description = clean_html(get_element_by_class('wall_post_text', webpage))
-        uploader = clean_html(get_element_by_class(
-            'fw_post_author', webpage)) or self._og_search_description(webpage)
+        uploader = clean_html(get_element_by_class('author', webpage))
          thumbnail = self._og_search_thumbnail(webpage)
  
          entries = []
  
-        for audio in re.finditer(r'''(?sx)
-                            <input[^>]+
-                                id=(?P<q1>["\'])audio_info(?P<id>\d+_\d+).*?(?P=q1)[^>]+
-                                value=(?P<q2>["\'])(?P<url>http.+?)(?P=q2)
-                                .+?
-                            </table>''', webpage):
-            audio_html = audio.group(0)
-            audio_id = audio.group('id')
-            duration = parse_duration(get_element_by_class('duration', audio_html))
-            track = self._html_search_regex(
-                r'<span[^>]+id=["\']title%s[^>]*>([^<]+)' % audio_id,
-                audio_html, 'title', default=None)
-            artist = self._html_search_regex(
-                r'>([^<]+)</a></b>\s*&ndash', audio_html,
-                'artist', default=None)
-            entries.append({
-                'id': audio_id,
-                'url': audio.group('url'),
-                'title': '%s - %s' % (artist, track) if artist and track else audio_id,
-                'thumbnail': thumbnail,
-                'duration': duration,
-                'uploader': uploader,
-                'artist': artist,
-                'track': track,
-            })
+        audio_ids = re.findall(r'data-full-id=["\'](\d+_\d+)', webpage)
+        if audio_ids:
+            al_audio = self._download_webpage(
+                'https://vk.com/al_audio.php', post_id,
+                note='Downloading audio info', fatal=False,
+                data=urlencode_postdata({
+                    'act': 'reload_audio',
+                    'al': '1',
+                    'ids': ','.join(audio_ids)
+                }))
+            if al_audio:
+                Audio = collections.namedtuple(
+                    'Audio', ['id', 'user_id', 'url', 'track', 'artist', 'duration'])
+                audios = self._parse_json(
+                    self._search_regex(
+                        r'<!json>(.+?)<!>', al_audio, 'audios', default='[]'),
+                    post_id, fatal=False, transform_source=unescapeHTML)
+                if isinstance(audios, list):
+                    for audio in audios:
+                        a = Audio._make(audio[:6])
+                        entries.append({
+                            'id': '%s_%s' % (a.user_id, a.id),
+                            'url': a.url,
+                            'title': '%s - %s' % (a.artist, a.track) if a.artist and a.track else a.id,
+                            'thumbnail': thumbnail,
+                            'duration': a.duration,
+                            'uploader': uploader,
+                            'artist': a.artist,
+                            'track': a.track,
+                        })
  
          for video in re.finditer(
                  r'<a[^>]+href=(["\'])(?P<url>/video(?:-?[\d_]+).*?)\1', webpage):