<?php
// Send a wildcard CORS header so that pages served from any origin may read this endpoint's response.
header("Access-Control-Allow-Origin: *");
/**
 * Reads a page and returns the decoded `content` attribute of one <meta> tag.
 *
 * The tag is located with a regular expression of the form
 * `<meta {tag} content="...">`, so only double-quoted values whose `content`
 * attribute directly follows the identifying attribute are found.
 *
 * @param string $url Location handed to file_get_contents().
 * @param string $tag Literal attribute text identifying the tag,
 *                    e.g. 'property="og:description"' or 'name="description"'.
 * @return string The attribute value with HTML entities decoded, or '' when the
 *                page cannot be read or no matching tag exists.
 */
function get_meta_tags_content($url, $tag) {
    // Best-effort read: "@" silences the warning PHP raises when the read fails.
    $content = @file_get_contents($url);
    if ($content === false) return '';

    // preg_quote() keeps characters such as "." or ":" in $tag literal.
    // The "i" and "s" flags plus \s+ tolerate upper-case tags, extra spacing and line breaks.
    $pattern = '/<meta\s+' . preg_quote($tag, '/') . '\s+content="(.*?)"/is';
    if (preg_match($pattern, $content, $matches) !== 1) return '';

    // Replace invalid byte sequences in the small matched value only; this avoids the
    // 'HTML-ENTITIES' pseudo-encoding, which is deprecated as of PHP 8.2.
    $value = mb_convert_encoding($matches[1], 'UTF-8', 'UTF-8');

    return html_entity_decode($value, ENT_QUOTES | ENT_HTML5, 'UTF-8');
}

/**
 * Reads a page and returns the image URL stored in one <meta> tag.
 *
 * The tag is located with a regular expression of the form
 * `<meta {tag} content="...">`, so only double-quoted values whose `content`
 * attribute directly follows the identifying attribute are found.
 *
 * @param string $url Location handed to file_get_contents().
 * @param string $tag Literal attribute text identifying the tag,
 *                    e.g. 'property="og:image"'.
 * @return string The URL with HTML entities decoded (so "&amp;" in a query
 *                string becomes "&"), or '' when the page cannot be read or no
 *                matching tag exists.
 */
function get_image_url($url, $tag) {
    // Best-effort read: "@" silences the warning PHP raises when the read fails.
    $content = @file_get_contents($url);
    if ($content === false) return '';

    // preg_quote() keeps characters such as ":" in $tag literal; the trailing quote
    // inside $tag prevents 'og:image' from matching 'og:image:url'.
    $pattern = '/<meta\s+' . preg_quote($tag, '/') . '\s+content="(.*?)"/is';
    if (preg_match($pattern, $content, $matches) !== 1) return '';

    // Replace invalid byte sequences in the matched value only; this avoids the
    // 'HTML-ENTITIES' pseudo-encoding, which is deprecated as of PHP 8.2.
    $value = mb_convert_encoding($matches[1], 'UTF-8', 'UTF-8');

    // Attribute values are entity-encoded in HTML; the usable URL is the decoded form.
    return html_entity_decode($value, ENT_QUOTES | ENT_HTML5, 'UTF-8');
}

/**
 * Downloads a page and extracts its main article body as an HTML string.
 *
 * Extraction is attempted in this order:
 *   1. find_main_content() + extract_content() on the parsed DOM,
 *   2. fallback_extraction() when no content node is found,
 *   3. fallback_regex_extraction() when the result so far is blank.
 * The result is finally normalised by clean_content().
 *
 * @param string $url Address of the page to process.
 * @return string|false Cleaned article content ('' when the page has no body);
 *                      when the download fails, the output of json_encode() for
 *                      ['error' => message] is returned instead.
 */
function get_article_content($url) {
    $html = fetch_url_content($url);
    if (is_string($html) && strpos($html, 'Lỗi:') === 0) {
        return json_encode(['error' => $html]); // Return the error message as JSON
    }

    // DOMDocument::loadHTML() throws a ValueError on an empty string in PHP 8,
    // which "@" cannot suppress, so a blank response is answered with empty content.
    if (!is_string($html) || trim($html) === '') {
        return '';
    }

    // Non-ASCII characters are turned into numeric entities so the parser does not
    // misread UTF-8 bytes. mb_encode_numericentity() replaces the 'HTML-ENTITIES'
    // conversion that is deprecated as of PHP 8.2.
    $encoded_html = mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');

    // Collect parser complaints about real-world markup internally instead of emitting warnings.
    $previous_libxml_setting = libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $doc->loadHTML($encoded_html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous_libxml_setting);

    remove_unwanted_elements($doc);

    $xpath = new DOMXPath($doc);
    $best_content = find_main_content($xpath);

    if (!$best_content) {
        $article_content = fallback_extraction($html);
    } else {
        $article_content = extract_content($xpath, $best_content);
    }

    // Last resort when the previous step produced nothing but whitespace.
    if (trim((string) $article_content) === '') {
        $article_content = fallback_regex_extraction($html);
    }

    return clean_content($article_content);
}

/**
 * Downloads a URL with cURL and returns the response body.
 *
 * Up to 5 redirects are followed and the whole transfer is limited to 30 seconds.
 * Only http and https are permitted, both for the initial request and for
 * redirects, so a caller-supplied URL cannot reach file:// or other schemes.
 *
 * @param string $url Address to request.
 * @return string The response body, or a message starting with "Lỗi: " when the
 *                transfer fails.
 */
function fetch_url_content($url) {
    $ch = curl_init();
    if ($ch === false) {
        return "Lỗi: " . 'unable to initialise cURL';
    }

    curl_setopt_array($ch, [
        // Set the protocol restrictions first so they are in place whatever happens with later options.
        CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
        CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
        CURLOPT_URL => $url,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_MAXREDIRS => 5,
        CURLOPT_TIMEOUT => 30,
        CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        // NOTE(review): certificate verification is switched off here, which leaves HTTPS
        // downloads open to interception. Kept as-is to avoid breaking hosts with bad
        // certificates - consider enabling it.
        CURLOPT_SSL_VERIFYPEER => false,
    ]);

    $html = curl_exec($ch);
    // Read the error text before the handle is released.
    $error = $html === false ? curl_error($ch) : '';

    // Before PHP 8.0 the handle is a resource that must be closed on every path;
    // from 8.0 on it is an object that is freed automatically.
    if (PHP_VERSION_ID < 80000) {
        curl_close($ch);
    }

    if ($html === false) {
        return "Lỗi: " . $error;
    }

    return $html;
}

/**
 * Deletes script, style, navigation and similar non-article elements from a document.
 *
 * @param DOMDocument $doc Parsed document; it is modified in place.
 * @return void
 */
function remove_unwanted_elements($doc) {
    foreach (['script', 'style', 'nav', 'footer', 'header', 'aside', 'noscript', 'form'] as $tag_name) {
        // The node list shrinks as nodes are detached, so work from a copy of it.
        $snapshot = iterator_to_array($doc->getElementsByTagName($tag_name), false);

        // Detach from the last match back to the first.
        foreach (array_reverse($snapshot) as $node) {
            $node->parentNode->removeChild($node);
        }
    }
}

/**
 * Picks the DOM node most likely to hold the article body.
 *
 * The XPath expressions below are tried in order. The first expression that
 * yields at least one node with a positive score_element() result decides the
 * outcome: the highest-scoring node of that expression is returned and later
 * expressions are not evaluated.
 *
 * @param DOMXPath $xpath XPath evaluator bound to the parsed document.
 * @return DOMNode|null The selected node, or null when no expression produces a
 *                      node with a score above zero.
 */
function find_main_content($xpath) {
    $content_queries = [
        "//main", "//article", "//div[contains(@class, 'content')]", "//div[contains(@class, 'post')]",
        "//div[contains(@class, 'entry')]", "//div[@id='content']", "//div[@id='main']",
        "//section[contains(@class, 'content')]", "//div[contains(@class, 'article')]",
        "//div[contains(@class, 'blog-post')]", "//div[@role='main']", "//div[contains(@class, 'main-content')]",
        "//div[contains(@class, 'page-content')]", "//div[contains(@class, 'single-post')]",
        "//div[contains(@class, 'post-content')]", "//div[contains(@class, 'entry-content')]",
        "//div[@itemprop='articleBody']", "//div[@itemtype='http://schema.org/Article']",
        "//div[@itemtype='http://schema.org/BlogPosting']", "//div[contains(@class, 'story-body')]",
        "//div[contains(@class, 'story-content')]"
    ];

    foreach ($content_queries as $expression) {
        $winner = null;
        $top_score = 0;

        foreach ($xpath->query($expression) as $candidate) {
            $candidate_score = score_element($candidate);
            // Strictly greater: on a tie the earlier node is kept, and a zero score never wins.
            if ($candidate_score > $top_score) {
                $top_score = $candidate_score;
                $winner = $candidate;
            }
        }

        if ($winner) {
            return $winner;
        }
    }

    return null;
}

/**
 * Rates how article-like a node is; higher means more likely to be body text.
 *
 * The score is the node's word count scaled by the share of text that is not
 * inside links, plus 20 points for every <p> descendant.
 *
 * @param DOMElement $element Candidate node.
 * @return int|float The computed score.
 */
function score_element($element) {
    $words = str_word_count($element->textContent);
    $paragraphs = $element->getElementsByTagName('p')->length;
    $non_link_share = 1 - calculate_link_density($element);

    return $words * $non_link_share + ($paragraphs * 20);
}

/**
 * Computes the fraction of a node's text that sits inside <a> descendants.
 *
 * Lengths are measured in bytes with strlen().
 *
 * @param DOMElement $element Node to measure.
 * @return int|float Link text length divided by total text length, or 0 when the
 *                   node contains no text.
 */
function calculate_link_density($element) {
    $total_bytes = strlen($element->textContent);
    if ($total_bytes <= 0) {
        return 0;
    }

    $anchor_bytes = 0;
    $anchors = iterator_to_array($element->getElementsByTagName('a'), false);
    foreach ($anchors as $anchor) {
        $anchor_bytes += strlen($anchor->textContent);
    }

    return $anchor_bytes / $total_bytes;
}

function extract_content($xpath, $best_content) {
$elements = $xpath->query(".//p|.//img|.//h1|.//h2|.//h3|.//h4|.//h5|.//h6|.//ul|.//ol|.//blockquote|.//table|.//iframe", $best_content);
$article_content = "";
foreach ($elements as $element) {
    if ($element->nodeName === 'img') {
        // Danh sách các thuộc tính có thể chứa URL hình ảnh
        $srcAttributes = ['src', 'data-src', 'data-lazy-src', 'data-original', 'data-srcset'];
        $src = '';

        // Tìm URL hình ảnh từ các thuộc tính
        foreach ($srcAttributes as $attr) {
            $src = $element->getAttribute($attr);
            if ($src && !preg_match('/^data:image\/.*base64/i', $src)) {
                break;
            }
        }

        // Nếu src là base64 hoặc placeholder, tìm link ảnh thực từ các thuộc tính khác
        if (!$src || preg_match('/^data:image\/.*base64/i', $src)) {
            foreach ($srcAttributes as $attr) {
                $potentialSrc = $element->getAttribute($attr);
                if ($potentialSrc && !preg_match('/^data:image\/.*base64/i', $potentialSrc)) {
                    $src = $potentialSrc;
                    break;
                }
            }
        }

        // Nếu vẫn không tìm thấy src, kiểm tra srcset
        if (!$src && $element->hasAttribute('srcset')) {
            $srcset = $element->getAttribute('srcset');
            $srcsetItems = explode(',', $srcset);
            if (!empty($srcsetItems)) {
                $firstItem = trim($srcsetItems[0]);
                $src = explode(' ', $firstItem)[0];
            }
        }

        // Nếu tìm thấy src, thêm thẻ img vào nội dung
        if ($src) {
            // Chuyển đổi URL tương đối thành tuyệt đối nếu cần
            if (!preg_match("~^(?:f|ht)tps?://~i", $src) && !preg_match('/^data:image/i', $src)) {
                $src = rtrim($base_url, '/') . '/' . ltrim($src, '/');
            }

            $alt = $element->getAttribute('alt') ?: '';
            $article_content .= "<img src=\"{$src}\" alt=\"{$alt}\">\n";
        }
    } else {
        $article_content .= $element->ownerDocument->saveHTML($element) . "\n";
    }
}
    
    return $article_content;
}

/**
 * Plain-text extraction used when no content node can be located in the DOM.
 *
 * Script and style blocks are dropped, all remaining tags are replaced by
 * spaces and the text is split into sentences at ".", "!" or "?". The result
 * runs from the first sentence with more than 15 words to the last such
 * sentence; when no sentence is that long, the whole text is returned.
 *
 * @param string $html Raw page markup.
 * @return string The selected sentences joined by single spaces.
 */
function fallback_extraction($html) {
    // preg_replace() yields null on a PCRE failure; the casts keep the pipeline on strings.
    $body = (string) preg_replace('/<(script|style).*?<\/\\1>/is', '', (string) $html);
    $body = (string) preg_replace('/<.*?>/s', ' ', $body);
    $body = (string) preg_replace('/\s+/', ' ', $body);

    $sentences = preg_split('/(?<=[.!?])\s+/', $body, -1, PREG_SPLIT_NO_EMPTY);
    if ($sentences === false) {
        $sentences = [];
    }

    // Find the first and the last sentence long enough to count as main content.
    $first_long = null;
    $last_long = null;
    foreach ($sentences as $index => $sentence) {
        if (fallback_extraction_count_words($sentence) > 15) {
            if ($first_long === null) {
                $first_long = $index;
            }
            $last_long = $index;
        }
    }

    if ($first_long === null) {
        return implode(' ', $sentences);
    }

    return implode(' ', array_slice($sentences, $first_long, $last_long - $first_long + 1));
}

/**
 * Counts words in UTF-8 text.
 *
 * str_word_count() works on single bytes, so an accented letter splits a word
 * in two and inflates the count; matching Unicode letters avoids that. A word
 * is a letter followed by letters, combining marks, apostrophes or hyphens.
 *
 * @param string $text Text to measure.
 * @return int Number of words found.
 */
function fallback_extraction_count_words($text) {
    $count = preg_match_all('/\p{L}[\p{L}\p{M}\'-]*/u', $text);

    // The "u" flag makes the match fail on invalid UTF-8; fall back to the byte-based count.
    return $count === false ? str_word_count($text) : $count;
}

/**
 * Regex-only extraction used when the other strategies return nothing.
 *
 * Takes the contents of <body>, removes script, style, nav, header and footer
 * blocks, cuts the rest apart at p/div/heading/article/section tags and keeps
 * the fragments whose tag-free text is longer than 100 bytes and does not start
 * with "copyright" or "all rights reserved".
 *
 * @param string $html Raw page markup.
 * @return string Kept fragments (inner markup untouched) joined by blank lines,
 *                or '' when the page has no usable <body>.
 */
function fallback_regex_extraction($html) {
    $found = [];
    preg_match('/<body.*?>(.*?)<\/body>/is', $html, $found);
    if (empty($found[1])) {
        return '';
    }

    $stripped = preg_replace('/<(script|style|nav|header|footer).*?<\/\\1>/is', '', $found[1]);

    $kept = [];
    foreach (preg_split('/<\/?(p|div|h[1-6]|article|section)[^>]*>/i', $stripped) as $fragment) {
        // Length and boilerplate checks look at the visible text only.
        $plain = trim(strip_tags($fragment));
        if (strlen($plain) <= 100) {
            continue;
        }
        if (preg_match('/^(copyright|all rights reserved)/i', $plain)) {
            continue;
        }
        $kept[] = $fragment;
    }

    return implode("\n\n", $kept);
}

/**
 * Normalises extracted article markup.
 *
 * In order: drops empty <p></p> pairs, collapses every run of whitespace to a
 * single space, puts a blank line after each closing paragraph or heading tag,
 * removes HTML comments, and trims the ends.
 *
 * @param string $content Extracted markup or text.
 * @return string The tidied content.
 */
function clean_content($content) {
    // preg_replace() applies array patterns one after another, each on the previous result.
    $patterns = [
        '/<p>\s*<\/p>/',
        '/\s+/',
        '/<\/(p|h[1-6])>/',
        '/<!--.*?-->/s',
    ];
    $replacements = [
        '',
        ' ',
        '</$1>' . "\n\n",
        '',
    ];

    $tidied = preg_replace($patterns, $replacements, $content);

    return trim($tidied);
}

// Request handling: read ?url=..., scrape that page and answer with JSON.

// Every response of this endpoint, including error responses, is JSON.
header('Content-Type: application/json');

// Get the URL from the request
$url = $_GET['url'] ?? '';

// A repeated parameter such as ?url[]=x arrives as an array and is treated as missing.
if (!is_string($url) || $url === '') {
    http_response_code(400); // Bad Request
    echo json_encode(['error' => 'URL parameter is missing']);
    exit;
}

// Accept absolute http(s) addresses only. Without this check the value would be passed
// unchanged to file_get_contents(), which also opens local paths and other stream wrappers.
// NOTE(review): hosts on internal networks are still reachable - add an allow/deny list
// if this endpoint is publicly exposed.
$scheme = strtolower((string) parse_url($url, PHP_URL_SCHEME));
$host = (string) parse_url($url, PHP_URL_HOST);
if (!in_array($scheme, ['http', 'https'], true) || $host === '') {
    http_response_code(400); // Bad Request
    echo json_encode(['error' => 'URL must be an absolute http(s) address']);
    exit;
}

// Get meta description (Open Graph first, then the standard description tag).
// Each helper call downloads the page again.
$description = get_meta_tags_content($url, 'property="og:description"');
if (!$description) {
    $description = get_meta_tags_content($url, 'name="description"');
}

// Get title. The pattern tolerates attributes on <title>, upper-case tags and line breaks.
$title = '';
$page = @file_get_contents($url);
if ($page !== false && preg_match('/<title\b[^>]*>(.*?)<\/title>/is', $page, $title_matches) === 1) {
    // Replace invalid byte sequences so json_encode() below does not fail on the title.
    $raw_title = mb_convert_encoding($title_matches[1], 'UTF-8', 'UTF-8');
    $title = trim(html_entity_decode($raw_title, ENT_QUOTES | ENT_HTML5, 'UTF-8'));
}

// Get image URL (prioritize og:image)
$image = get_image_url($url, 'property="og:image"');
if (!$image) {
    $image = get_image_url($url, 'property="og:image:url"');
}

// Extract the article body.
$article = get_article_content($url);

// get_article_content() reports a failed download as a JSON object with an "error" key.
// Decode and check for that key rather than guessing from a leading "{".
$failure = is_string($article) ? json_decode($article, true) : null;

if (is_array($failure) && isset($failure['error'])) {
    echo $article; // Pass the error JSON through unchanged
} else {
    // Return the normal result
    echo json_encode([
        'title' => $title,
        'description' => $description,
        'image'       => $image,
        'content'   => $article
    ]);
}