[minhateca] Add extractor (Fixes #4094)

author Philipp Hagemeister <phihag@phihag.de>

Thu, 4 Dec 2014 16:02:05 +0000 (17:02 +0100)

committer Philipp Hagemeister <phihag@phihag.de>

Thu, 4 Dec 2014 16:02:05 +0000 (17:02 +0100)
author Philipp Hagemeister <phihag@phihag.de>
Thu, 4 Dec 2014 16:02:05 +0000 (17:02 +0100)
committer Philipp Hagemeister <phihag@phihag.de>
Thu, 4 Dec 2014 16:02:05 +0000 (17:02 +0100)
diff --git a/test/test_utils.py b/test/test_utils.py

index baa3a215657026245bf93960c53374bc3abdd61b..04f1bf283b32abe2421fb782f2765c3c95811c3c 100644 (file)
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -376,6 +376,7 @@ class TestUtil(unittest.TestCase):
          self.assertEqual(parse_filesize('2 MiB'), 2097152)
          self.assertEqual(parse_filesize('5 GB'), 5000000000)
          self.assertEqual(parse_filesize('1.2Tb'), 1200000000000)
          self.assertEqual(parse_filesize('2 MiB'), 2097152)
          self.assertEqual(parse_filesize('5 GB'), 5000000000)
          self.assertEqual(parse_filesize('1.2Tb'), 1200000000000)
+        self.assertEqual(parse_filesize('1,24 KB'), 1240)
  
  if __name__ == '__main__':
      unittest.main()
  
  if __name__ == '__main__':
      unittest.main()
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py

index 647352b590fc73fac0638579302e3a54bb3a8743..a56ec4fb5b5c877cdfce9cf0029edfd254b0eba8 100644 (file)
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -217,6 +217,7 @@ from .mdr import MDRIE
  from .metacafe import MetacafeIE
  from .metacritic import MetacriticIE
  from .mgoon import MgoonIE
  from .metacafe import MetacafeIE
  from .metacritic import MetacriticIE
  from .mgoon import MgoonIE
+from .minhateca import MinhatecaIE
  from .ministrygrid import MinistryGridIE
  from .mit import TechTVMITIE, MITIE, OCWMITIE
  from .mitele import MiTeleIE
  from .ministrygrid import MinistryGridIE
  from .mit import TechTVMITIE, MITIE, OCWMITIE
  from .mitele import MiTeleIE
diff --git a/youtube_dl/extractor/minhateca.py b/youtube_dl/extractor/minhateca.py

new file mode 100644 (file)

index 0000000..077c9b1
--- /dev/null
+++ b/youtube_dl/extractor/minhateca.py
@@ -0,0 +1,71 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_parse,
+    compat_urllib_request,
+)
+from ..utils import (
+    int_or_none,
+    parse_filesize,
+)
+
+
+class MinhatecaIE(InfoExtractor):
+    _VALID_URL = r'https?://minhateca\.com\.br/[^?#]+,(?P<id>[0-9]+)\.'
+    _TEST = {
+        'url': 'http://minhateca.com.br/pereba/misc/youtube-dl+test+video,125848331.mp4(video)',
+        'info_dict': {
+            'id': '125848331',
+            'ext': 'mp4',
+            'title': 'youtube-dl test video',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'filesize_approx': 1530000,
+            'duration': 9,
+            'view_count': int,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        token = self._html_search_regex(
+            r'<input name="__RequestVerificationToken".*?value="([^"]+)"',
+            webpage, 'request token')
+        token_data = [
+            ('fileId', video_id),
+            ('__RequestVerificationToken', token),
+        ]
+        req = compat_urllib_request.Request(
+            'http://minhateca.com.br/action/License/Download',
+            data=compat_urllib_parse.urlencode(token_data))
+        req.add_header('Content-Type', 'application/x-www-form-urlencoded')
+        data = self._download_json(
+            req, video_id, note='Downloading metadata')
+
+        video_url = data['redirectUrl']
+        title_str = self._html_search_regex(
+            r'<h1.*?>(.*?)</h1>', webpage, 'title')
+        title, _, ext = title_str.rpartition('.')
+        filesize_approx = parse_filesize(self._html_search_regex(
+            r'<p class="fileSize">(.*?)</p>',
+            webpage, 'file size approximation', fatal=False))
+        duration = int_or_none(self._html_search_regex(
+            r'(?s)<p class="fileLeng[ht][th]">.*?([0-9]+)\s*s',
+            webpage, 'duration', fatal=False))
+        view_count = int_or_none(self._html_search_regex(
+            r'<p class="downloadsCounter">([0-9]+)</p>',
+            webpage, 'view count', fatal=False))
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'ext': ext,
+            'filesize_approx': filesize_approx,
+            'duration': duration,
+            'view_count': view_count,
+            'thumbnail': self._og_search_thumbnail(webpage),
+        }
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py

index 4d3cbac74aaebdbe0b314690b7dea07e2e2371e2..5e9ae7a426277ef29aeb0cb5901eea30f85ed794 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1090,11 +1090,14 @@ def parse_filesize(s):
      }
  
      units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
      }
  
      units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
-    m = re.match(r'(?P<num>[0-9]+(?:\.[0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
+    m = re.match(
+        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
      if not m:
          return None
  
      if not m:
          return None
  
-    return int(float(m.group('num')) * _UNIT_TABLE[m.group('unit')])
+    num_str = m.group('num').replace(',', '.')
+    mult = _UNIT_TABLE[m.group('unit')]
+    return int(float(num_str) * mult)
  
  
  def get_term_width():
  
  
  def get_term_width():
author	Philipp Hagemeister <phihag@phihag.de>
	Thu, 4 Dec 2014 16:02:05 +0000 (17:02 +0100)
committer	Philipp Hagemeister <phihag@phihag.de>
	Thu, 4 Dec 2014 16:02:05 +0000 (17:02 +0100)
test/test_utils.py		patch | blob | history
youtube_dl/extractor/__init__.py		patch | blob | history
youtube_dl/extractor/minhateca.py	[new file with mode: 0644]	patch | blob
youtube_dl/utils.py		patch | blob | history