Загрузить файлы в «SYSADOWNLOAD»

2026-03-29 19:16:14 +00:00
parent 1bdc513572
commit 107494f46a
3 changed files with 248 additions and 0 deletions
--- a/SYSADOWNLOAD/main.py
+++ b/SYSADOWNLOAD/main.py
@@ -0,0 +1,172 @@
+import base64
+import os
+from typing import List
+from urllib.parse import urljoin, urlparse
+
+import requests
+from bs4 import BeautifulSoup
+
+def embed_images_in_html(session: requests.Session, html_content: str, base_url: str) -> str:
+    """
+    Находит все изображения в HTML, скачивает их и встраивает как base64.
+    
+    Args:
+        session: Сессия requests с куками
+        html_content: Исходный HTML
+        base_url: Базовый URL страницы для разрешения относительных путей
+    
+    Returns:
+        HTML с встроенными изображениями
+    """
+    soup = BeautifulSoup(html_content, 'html.parser')
+    img_tags = soup.find_all('img')
+    
+    print(f"  Найдено изображений: {len(img_tags)}")
+    
+    embedded_count = 0
+    failed_count = 0
+    
+    for idx, img_tag in enumerate(img_tags, 1):
+        img_src = img_tag.get('src')
+        if not img_src:
+            failed_count += 1
+            continue
+        
+        # Формируем полный URL
+        img_url = urljoin(base_url, img_src)
+        
+        # Пропускаем внешние изображения (не с нашего домена) и пустые/якорные ссылки
+        if img_src.startswith('data:') or img_src.startswith('#') or img_src.startswith('javascript:'):
+            continue
+        
+        print(f"    [{idx}/{len(img_tags)}] Встраиваю: {img_url[:70]}...")
+        
+        try:
+            # Скачиваем изображение
+            response = session.get(img_url, timeout=15)
+            response.raise_for_status()
+            
+            # Определяем MIME-тип
+            content_type = response.headers.get('content-type', '')
+            if not content_type:
+                # Определяем по расширению
+                if img_url.lower().endswith('.png'):
+                    content_type = 'image/png'
+                elif img_url.lower().endswith('.jpg') or img_url.lower().endswith('.jpeg'):
+                    content_type = 'image/jpeg'
+                elif img_url.lower().endswith('.gif'):
+                    content_type = 'image/gif'
+                elif img_url.lower().endswith('.webp'):
+                    content_type = 'image/webp'
+                elif img_url.lower().endswith('.svg'):
+                    content_type = 'image/svg+xml'
+                else:
+                    content_type = 'image/png'
+            
+            # Кодируем в base64
+            img_data = base64.b64encode(response.content).decode('utf-8')
+            
+            # Формируем data URI
+            data_uri = f"data:{content_type};base64,{img_data}"
+            
+            # Заменяем src на data URI
+            img_tag['src'] = data_uri
+            embedded_count += 1
+            print(f"      ✓ Встроено ({len(response.content) // 1024} КБ)")
+            
+        except Exception as e:
+            failed_count += 1
+            print(f"      ✗ Ошибка: {type(e).__name__}")
+            # Оставляем оригинальную ссылку (или удаляем тег)
+            # img_tag.decompose()  # раскомментировать чтобы удалить битые изображения
+    
+    print(f"  Итого: {embedded_count} встроено, {failed_count} ошибок")
+    
+    return str(soup)
+
+def download_web_pages(page_ids: List[str], session_cookie_value: str) -> None:
+    """
+    Скачивает страницы и встраивает все изображения в HTML как base64.
+    
+    Args:
+        page_ids: Список ID страниц
+        session_cookie_value: Значение куки MoodleSession
+    """
+    session = requests.Session()
+    
+    # Устанавливаем куку
+    session.cookies.set(
+        "MoodleSession",
+        session_cookie_value,
+        domain="sysahelper.ru",
+        path="/"
+    )
+    
+    # Заголовки для имитации браузера
+    session.headers.update({
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+        'Referer': 'https://sysahelper.ru/',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
+        'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
+        'Connection': 'keep-alive'
+    })
+    
+    base_url_template = "https://sysahelper.ru/mod/page/view.php?id="
+    output_folder = "downloaded_pages"
+    os.makedirs(output_folder, exist_ok=True)
+    
+    print(f"Начинаю скачивание {len(page_ids)} страниц с встраиванием изображений...\n")
+    
+    for idx, page_id in enumerate(page_ids, 1):
+        page_url = f"{base_url_template}{page_id}"
+        print(f"[{idx}/{len(page_ids)}] Скачиваю: {page_url}")
+        
+        try:
+            # Скачиваем страницу
+            response = session.get(page_url, timeout=15)
+            print(f"  Статус: {response.status_code}")
+            
+            if response.status_code != 200:
+                print(f"  ✗ Ошибка HTTP {response.status_code}\n")
+                continue
+            
+            # Проверка авторизации
+            if 'logout' in response.text.lower() or '/login/logout.php' in response.text:
+                print(f"  ✓ Авторизован")
+            else:
+                print(f"  ⚠ Возможно, нет доступа к материалам")
+            
+            # Встраиваем изображения
+            modified_html = embed_images_in_html(session, response.text, page_url)
+            
+            # Сохраняем в один файл
+            filename = os.path.join(output_folder, f"page_{page_id}_embedded.html")
+            with open(filename, 'w', encoding='utf-8') as f:
+                f.write(modified_html)
+            
+            # Показываем размер файла
+            file_size = os.path.getsize(filename) / 1024
+            print(f"  ✓ Сохранено: {filename} ({file_size:.1f} КБ)\n")
+            
+        except Exception as e:
+            print(f"  ✗ Ошибка: {type(e).__name__}: {e}\n")
+    
+    print("="*60)
+    print("Готово! Все страницы сохранены с встроенными изображениями.")
+    print(f"Откройте файлы из папки '{output_folder}' в браузере.")
+    print("="*60)
+
+if __name__ == "__main__":
+    # Ваши идентификаторы страниц
+    page_ids = [
+        "719",
+        "705",
+        "720",
+        "724",
+        "722"
+    ]
+    
+    # Ваша сессионная кука Moodle
+    SESSION_COOKIE_VALUE = "tfknk5e4lkucb9rj1ec577qh3q"
+    
+    # Скачиваем страницы с встроенными изображениями
+    download_web_pages(page_ids, SESSION_COOKIE_VALUE)