[extractor/generic] Improve kaltura embeds support (Closes #6137)

[youtube-dl] / youtube_dl / extractor / vk.py
diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py

index 6aeba109dc515392f8fcf0c680594c48c39dff80..f2ae109f99ddc2df40d43b6a25f311b22431f8af 100644 (file)
--- a/youtube_dl/extractor/vk.py
+++ b/youtube_dl/extractor/vk.py
@@ -13,6 +13,7 @@ from ..compat import (
  from ..utils import (
      ExtractorError,
      orderedSet,
+    str_to_int,
      unescapeHTML,
      unified_strdate,
  )
@@ -34,6 +35,7 @@ class VKIE(InfoExtractor):
                  'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*',
                  'duration': 195,
                  'upload_date': '20120212',
+                'view_count': int,
              },
          },
          {
@@ -45,7 +47,8 @@ class VKIE(InfoExtractor):
                  'uploader': 'Tom Cruise',
                  'title': 'No name',
                  'duration': 9,
-                'upload_date': '20130721'
+                'upload_date': '20130721',
+                'view_count': int,
              }
          },
          {
@@ -59,6 +62,7 @@ class VKIE(InfoExtractor):
                  'title': 'Lin Dan',
                  'duration': 101,
                  'upload_date': '20120730',
+                'view_count': int,
              }
          },
          {
@@ -73,7 +77,8 @@ class VKIE(InfoExtractor):
                  'uploader': 'Триллеры',
                  'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]',
                  'duration': 8352,
-                'upload_date': '20121218'
+                'upload_date': '20121218',
+                'view_count': int,
              },
              'skip': 'Requires vk account credentials',
          },
@@ -100,6 +105,7 @@ class VKIE(InfoExtractor):
                  'title': 'Книга Илая',
                  'duration': 6771,
                  'upload_date': '20140626',
+                'view_count': int,
              },
              'skip': 'Only works from Russia',
          },
@@ -115,20 +121,27 @@ class VKIE(InfoExtractor):
          if username is None:
              return
  
-        login_form = {
-            'act': 'login',
-            'role': 'al_frame',
-            'expire': '1',
+        login_page = self._download_webpage(
+            'https://vk.com', None, 'Downloading login page')
+
+        login_form = dict(re.findall(
+            r'<input\s+type="hidden"\s+name="([^"]+)"\s+(?:id="[^"]+"\s+)?value="([^"]*)"',
+            login_page))
+
+        login_form.update({
              'email': username.encode('cp1251'),
              'pass': password.encode('cp1251'),
-        }
+        })
  
-        request = compat_urllib_request.Request('https://login.vk.com/?act=login',
-                                                compat_urllib_parse.urlencode(login_form).encode('utf-8'))
-        login_page = self._download_webpage(request, None, note='Logging in as %s' % username)
+        request = compat_urllib_request.Request(
+            'https://login.vk.com/?act=login',
+            compat_urllib_parse.urlencode(login_form).encode('utf-8'))
+        login_page = self._download_webpage(
+            request, None, note='Logging in as %s' % username)
  
          if re.search(r'onLoginFailed', login_page):
-            raise ExtractorError('Unable to login, incorrect username and/or password', expected=True)
+            raise ExtractorError(
+                'Unable to login, incorrect username and/or password', expected=True)
  
      def _real_initialize(self):
          self._login()
@@ -189,11 +202,15 @@ class VKIE(InfoExtractor):
  
          # Extract upload date
          upload_date = None
-        mobj = re.search(r'id="mv_date_wrap".*?Added ([a-zA-Z]+ [0-9]+), ([0-9]+) at', info_page)
+        mobj = re.search(r'id="mv_date(?:_views)?_wrap"[^>]*>([a-zA-Z]+ [0-9]+), ([0-9]+) at', info_page)
          if mobj is not None:
              mobj.group(1) + ' ' + mobj.group(2)
              upload_date = unified_strdate(mobj.group(1) + ' ' + mobj.group(2))
  
+        view_count = str_to_int(self._search_regex(
+            r'"mv_views_count_number"[^>]*>([\d,.]+) views<',
+            info_page, 'view count', fatal=False))
+
          formats = [{
              'format_id': k,
              'url': v,
@@ -210,6 +227,7 @@ class VKIE(InfoExtractor):
              'uploader': data.get('md_author'),
              'duration': data.get('duration'),
              'upload_date': upload_date,
+            'view_count': view_count,
          }