Merge pull request #7792 from jindaxia/fix_sohu_403forbidden
author    Sergey M <dstftw@gmail.com>
Tue, 8 Dec 2015 16:54:14 +0000 (22:54 +0600)
committer Sergey M <dstftw@gmail.com>
Tue, 8 Dec 2015 16:54:14 +0000 (22:54 +0600)
[sohu] Fix 403 forbidden

README.md
youtube_dl/extractor/cliphunter.py
youtube_dl/extractor/nbc.py
youtube_dl/extractor/theplatform.py

index 0b224f0be3d9bf8a56ef620d9eb96ab7a53ec817..e39f71281ecfa818cc8e4bb60b98cd7c6d18a1ff 100644 (file)
--- a/README.md
+++ b/README.md
@@ -800,7 +800,21 @@ with youtube_dl.YoutubeDL(ydl_opts) as ydl:
 
 Bugs and suggestions should be reported at: <https://github.com/rg3/youtube-dl/issues> . Unless you were prompted to do so or there is another pertinent reason (e.g. GitHub fails to accept the bug report), please do not send bug reports via personal email. For discussions, join us in the irc channel #youtube-dl on freenode.
 
-**Please include the full output of youtube-dl when run with `-v`**.
+**Please include the full output of youtube-dl when run with `-v`**, i.e. add the `-v` flag to your command line, copy the **whole** output and post it in the issue body wrapped in \`\`\` for better formatting. It should look similar to this:
+```
+$ youtube-dl -v http://www.youtube.com/watch?v=BaW_jenozKcj
+[debug] System config: []
+[debug] User config: []
+[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
+[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
+[debug] youtube-dl version 2015.12.06
+[debug] Git HEAD: 135392e
+[debug] Python version 2.6.6 - Windows-2003Server-5.2.3790-SP2
+[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
+[debug] Proxy map: {}
+...
+```
+**Do not post screenshots of the verbose log; only plain text is acceptable.**
 
 The output (including the first lines) contains important debugging information. Issues without the full output are often not reproducible and therefore do not get solved in short order, if ever.
 
index d46592cc5c8c71d30fda96c1b25c6f4a9c55ad75..2996b6b09e81fcd0e04038d1744f2fdf3d54e694 100644 (file)
--- a/youtube_dl/extractor/cliphunter.py
+++ b/youtube_dl/extractor/cliphunter.py
@@ -1,7 +1,7 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..utils import determine_ext
+from ..utils import int_or_none
 
 
 _translation_table = {
@@ -42,31 +42,26 @@ class CliphunterIE(InfoExtractor):
         video_title = self._search_regex(
             r'mediaTitle = "([^"]+)"', webpage, 'title')
 
-        fmts = {}
-        for fmt in ('mp4', 'flv'):
-            fmt_list = self._parse_json(self._search_regex(
-                r'var %sjson\s*=\s*(\[.*?\]);' % fmt, webpage, '%s formats' % fmt), video_id)
-            for f in fmt_list:
-                fmts[f['fname']] = _decode(f['sUrl'])
-
-        qualities = self._parse_json(self._search_regex(
-            r'var player_btns\s*=\s*(.*?);\n', webpage, 'quality info'), video_id)
+        gexo_files = self._parse_json(
+            self._search_regex(
+                r'var\s+gexoFiles\s*=\s*({.+?});', webpage, 'gexo files'),
+            video_id)
 
         formats = []
-        for fname, url in fmts.items():
-            f = {
-                'url': url,
-            }
-            if fname in qualities:
-                qual = qualities[fname]
-                f.update({
-                    'format_id': '%s_%sp' % (determine_ext(url), qual['h']),
-                    'width': qual['w'],
-                    'height': qual['h'],
-                    'tbr': qual['br'],
-                })
-            formats.append(f)
-
+        for format_id, f in gexo_files.items():
+            video_url = f.get('url')
+            if not video_url:
+                continue
+            fmt = f.get('fmt')
+            height = f.get('h')
+            format_id = '%s_%sp' % (fmt, height) if fmt and height else format_id
+            formats.append({
+                'url': _decode(video_url),
+                'format_id': format_id,
+                'width': int_or_none(f.get('w')),
+                'height': int_or_none(height),
+                'tbr': int_or_none(f.get('br')),
+            })
         self._sort_formats(formats)
 
         thumbnail = self._search_regex(
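
A minimal, standalone sketch of what the new cliphunter extraction above does: it parses a single `gexoFiles` JSON object instead of the old `mp4json`/`flvjson` plus `player_btns` variables. The sample page fragment and the simplified `int_or_none` stand-in are hypothetical; the real extractor also decodes URLs through its `_translation_table` cipher (`_decode`), which is stubbed out here.
```python
import json
import re


def int_or_none(v):
    # Simplified stand-in for youtube_dl.utils.int_or_none
    try:
        return int(v)
    except (TypeError, ValueError):
        return None


def extract_formats(webpage, decode=lambda u: u):
    # The patch looks for "var gexoFiles = {...};" embedded in the page.
    gexo_files = json.loads(re.search(
        r'var\s+gexoFiles\s*=\s*({.+?});', webpage).group(1))

    formats = []
    for format_id, f in gexo_files.items():
        video_url = f.get('url')
        if not video_url:
            continue
        fmt, height = f.get('fmt'), f.get('h')
        if fmt and height:
            format_id = '%s_%sp' % (fmt, height)
        formats.append({
            'url': decode(video_url),  # the real code runs _decode() here
            'format_id': format_id,
            'width': int_or_none(f.get('w')),
            'height': int_or_none(height),
            'tbr': int_or_none(f.get('br')),
        })
    return formats


# Hypothetical page fragment, only exercising the keys the diff reads:
sample = ('var gexoFiles = {"360p": {"url": "aHR0cDovL2V4YW1wbGU=", '
          '"fmt": "mp4", "h": 360, "w": 640, "br": 800}};')
print(extract_formats(sample))
```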
index e683d24c45f1d706f728eb90a8a26da880dbafa3..4c1eca96f2b789521f998b05f1d5dcae750cb678 100644 (file)
--- a/youtube_dl/extractor/nbc.py
+++ b/youtube_dl/extractor/nbc.py
@@ -11,6 +11,7 @@ from ..utils import (
     ExtractorError,
     find_xpath_attr,
     lowercase_escape,
+    smuggle_url,
     unescapeHTML,
 )
 
@@ -62,12 +63,13 @@ class NBCIE(InfoExtractor):
         theplatform_url = unescapeHTML(lowercase_escape(self._html_search_regex(
             [
                 r'(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"',
+                r'<iframe[^>]+src="((?:https?:)?//player\.theplatform\.com/[^"]+)"',
                 r'"embedURL"\s*:\s*"([^"]+)"'
             ],
             webpage, 'theplatform url').replace('_no_endcard', '').replace('\\/', '/')))
         if theplatform_url.startswith('//'):
             theplatform_url = 'http:' + theplatform_url
-        return self.url_result(theplatform_url)
+        return self.url_result(smuggle_url(theplatform_url, {'source_url': url}))
 
 
 class NBCSportsVPlayerIE(InfoExtractor):
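
The `smuggle_url` call above is how the NBC extractor hands its own page URL to the theplatform extractor: the helper packs extra data into the URL fragment, and `unsmuggle_url` unpacks it on the other side. A small usage sketch with hypothetical URLs:
```python
from youtube_dl.utils import smuggle_url, unsmuggle_url

# Hypothetical values standing in for the NBC page and its theplatform embed.
page_url = 'http://www.nbc.com/some-show/video/episode/12345'
theplatform_url = 'http://player.theplatform.com/p/NnzsPC/widget/select/media/xyz'

smuggled = smuggle_url(theplatform_url, {'source_url': page_url})
# The data rides along in a #__youtubedl_smuggle=... fragment, so the URL is
# still a normal theplatform URL as far as _VALID_URL matching is concerned.

clean_url, data = unsmuggle_url(smuggled, {})
assert clean_url == theplatform_url
assert data['source_url'] == page_url
```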
index 1555aa77cac30c18de3f0c2db9e13ea00cc569f6..0bf6726b53641734fd0fcafb73a76d8c3621b302 100644 (file)
--- a/youtube_dl/extractor/theplatform.py
--- b/youtube_dl/extractor/theplatform.py
@@ -16,11 +16,12 @@ from ..compat import (
 from ..utils import (
     determine_ext,
     ExtractorError,
-    xpath_with_ns,
-    unsmuggle_url,
+    float_or_none,
     int_or_none,
+    sanitized_Request,
+    unsmuggle_url,
     url_basename,
-    float_or_none,
+    xpath_with_ns,
 )
 
 default_ns = 'http://www.w3.org/2005/SMIL21/Language'
@@ -204,7 +205,12 @@ class ThePlatformIE(ThePlatformBaseIE):
             smil_url = url
         # Explicitly specified SMIL (see https://github.com/rg3/youtube-dl/issues/7385)
         elif '/guid/' in url:
-            webpage = self._download_webpage(url, video_id)
+            headers = {}
+            source_url = smuggled_data.get('source_url')
+            if source_url:
+                headers['Referer'] = source_url
+            request = sanitized_Request(url, headers=headers)
+            webpage = self._download_webpage(request, video_id)
             smil_url = self._search_regex(
                 r'<link[^>]+href=(["\'])(?P<url>.+?)\1[^>]+type=["\']application/smil\+xml',
                 webpage, 'smil url', group='url')
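
Putting the two halves together: on the theplatform side, the smuggled NBC page URL is recovered and sent as a `Referer` when a `/guid/` page has to be fetched, which is what the `sanitized_Request` lines above do. A rough end-to-end sketch with hypothetical URLs (`sanitized_Request` is youtube-dl's thin wrapper around `urllib`'s `Request` that cleans up the URL first):
```python
from youtube_dl.utils import sanitized_Request, smuggle_url, unsmuggle_url

# Hypothetical URLs; in practice the NBC extractor builds the smuggled URL.
guid_url = 'http://link.theplatform.com/s/NnzsPC/media/guid/2410887629/some-guid'
page_url = 'http://www.nbc.com/some-show/video/episode/12345'

url, smuggled_data = unsmuggle_url(
    smuggle_url(guid_url, {'source_url': page_url}), {})

headers = {}
source_url = smuggled_data.get('source_url')
if source_url:
    # Pass the embedding page along as Referer, mirroring what the hunk above
    # adds before downloading the /guid/ page.
    headers['Referer'] = source_url

request = sanitized_Request(url, headers=headers)
# The extractor then calls self._download_webpage(request, video_id) and goes
# on to pull the SMIL URL out of the returned page.
```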