获取纯域名和扩展名

时间:2014-12-19 17:33:04

标签: php string url dns extract

我在google和stackoverflow中搜索了很多次,我发现了很多问题和答案,但我发现所有错误都存在。

所以我试着编写自己的算法,我不确定。看起来很有效。 我使用了一个infinte循环。这是一个问题吗?如果是,如何避免这样的循环。

如果我的代码中存在任何错误,我需要你的帮助。

注意:英语不是我的母语。

最诚挚的问候。

/**
     * @description get pure domain or domain.extension
     * @param $p_domain
     * @param $ext :: if 1 => with extension
     * @return string|false
     */
    public function get_pure_domain($p_domain, $ext=1)
    {

        //remove protocols
        $clean = array("http://", "https://", "ftp://", "@", "ftps://", ":");
        $domain = str_replace($clean, "", $p_domain);

        //remove slashes before dot
        $dot_len=strpos($domain,'.');
        if ( strpos( substr($domain,0,$dot_len) ,'/')) return false;

        if (!$dot_len)
        return false;


        // if slashes exist remove after
        if (strpos($domain, "/"))
            $domain=substr($domain, 0, strpos($domain, "/") );


        $clean_domain=$domain;

        // explode with dot
        $items = explode(".", $domain);

        //select probable domain without extension
        $sub_domain_count=count($items)-2;
        $pure_domain=$items[$sub_domain_count];


        //if selected domain matched an extension, select provious (go back -1 in array)

        $d_count=0;
        while(1==1)
        {
            $d_count++;

            if ( $this->validate_ext($pure_domain) )
                $pure_domain=$items[--$sub_domain_count];
            else
                break;

            //break infinite loop after 50
            if ($d_count==50) break;

        }

        // if requested, return with extension
        if ($ext==1)
        {
            $extension_start=strpos($clean_domain,$pure_domain)+strlen($pure_domain);
            $pure_domain=$pure_domain.substr($clean_domain,$extension_start);
        }


        //total parts cannot be greater than 3
        $items = explode(".", $pure_domain);
        if(count($items)>2)
        {
            $pure_domain = $items[count($items) - 3] . '.' . $items[count($items) - 2] . '.' . $items[count($items) - 1];

            //search for last two parts in extensions if not exist return these two parts as domain
            if ( !$this->validate_ext($items[count($items) - 2] . '.' . $items[count($items) - 1]) )
            {
                if ($ext==1)
                    $pure_domain=$items[count($items) - 2] . '.' . $items[count($items) - 1];
                else
                    $pure_domain=$items[count($items) - 2];
            }

        }

        //if extensions and domain are same algorithm may take subdomain as domain
        $items = explode(".", $pure_domain);
        if(count($items)>1)
        {
            if ($items[count($items) - 1] == $items[count($items) - 2])
            {
                if ($ext==1)
                    $pure_domain = $items[count($items) - 2] . '.' . $items[count($items) - 1];
                else
                    $pure_domain=$items[count($items) - 2];
            }
        }


        //if domain numeric may be an ip return false
        if (is_numeric($pure_domain))
            return false;

        //return result
        return $pure_domain;

    }

public function validate_ext($ext)
    {
        if (strpos($ext,".")) str_replace('.','',$ext);

        if (in_array($ext, Data::$extensions))
            return true;
        else
            return false;

    }

class Data
{
    //18-12-2014 all domains extension
    public static $extensions = array("abogado", "ac", "academy", "accountants", "active", "actor", "ad", "adult", "ae", "aero", "af", "ag", "agency", "ai",
        "airforce", "al", "allfinanz", "alsace", "am", "an", "android", "ao", "aq", "aquarelle", "ar", "archi", "army", "arpa", "as", "asia", "associates",
        "at", "attorney", "au", "auction", "audio", "autos", "aw", "ax", "axa", "az", "ba", "band", "bar", "bargains", "bayern", "bb", "bd", "be", "beer", "berlin",
        "best", "bf", "bg", "bh", "bi", "bid", "bike", "bio", "biz", "bj", "black", "blackfriday", "bloomberg", "blue", "bm", "bmw", "bn", "bnpparibas", "bo", "boo",
        "boutique", "br", "brussels", "bs", "bt", "budapest", "build", "builders", "business", "buzz", "bv", "bw", "by", "bz", "bzh", "ca", "cab", "cal", "camera",
        "camp", "cancerresearch", "capetown", "capital", "caravan", "cards", "care", "career", "careers", "cartier", "casa", "cash", "cat", "catering", "cc",
        "cd", "center", "ceo", "cern", "cf", "cg", "ch", "channel", "cheap", "christmas", "chrome", "church", "ci", "citic", "city", "ck", "cl", "claims", "cleaning",
        "click", "clinic", "clothing", "club", "cm", "cn", "co", "coach", "codes", "coffee", "college", "cologne", "com", "community", "company", "computer", "condos",
        "construction", "consulting", "contractors", "cooking", "cool", "coop", "country", "cr", "credit", "creditcard", "cricket", "crs", "cruises", "cu",
        "cuisinella", "cv", "cw", "cx", "cy", "cymru", "cz", "dad", "dance", "dating", "day", "de", "deals", "degree", "delivery", "democrat", "dental", "dentist",
        "desi", "diamonds", "diet", "digital", "direct", "directory", "discount", "dj", "dk", "dm", "dnp", "do", "domains", "doosan", "durban", "dvag", "dz", "eat",
        "ec", "edu", "education", "ee", "eg", "email", "emerck", "energy", "engineer", "engineering", "enterprises", "equipment", "er", "es", "esq", "estate", "et",
        "eu", "eurovision", "eus", "events", "everbank", "exchange", "expert", "exposed", "fail", "farm", "fashion", "feedback", "fi", "finance", "financial",
        "firmdale", "fish", "fishing", "fitness", "fj", "fk", "flights", "florist", "flsmidth", "fly", "fm", "fo", "foo", "forsale", "foundation", "fr", "frl",
        "frogans", "fund", "furniture", "futbol", "ga", "gal", "gallery", "garden", "gb", "gbiz", "gd", "ge", "gent", "gf", "gg", "gh", "gi", "gift", "gifts", "gives",
        "gl", "glass", "gle", "global", "globo", "gm", "gmail", "gmo", "gmx", "gn", "google", "gop", "gov", "gp", "gq", "gr", "graphics", "gratis", "green", "gripe",
        "gs", "gt", "gu", "guide", "guitars", "guru", "gw", "gy", "hamburg", "haus", "healthcare", "help", "here", "hiphop", "hiv", "hk", "hm", "hn", "holdings",
        "holiday", "homes", "horse", "host", "hosting", "house", "how", "hr", "ht", "hu", "ibm", "id", "ie", "il", "im", "immo", "immobilien", "in", "industries",
        "info", "ing", "ink", "institute", "insure", "int", "international", "investments", "io", "iq", "ir", "irish", "is", "it", "iwc", "je", "jetzt", "jm", "jo",
        "jobs", "joburg", "jp", "juegos", "kaufen", "ke", "kg", "kh", "ki", "kim", "kitchen", "kiwi", "km", "kn", "koeln", "kp", "kr", "krd", "kred", "kw", "ky", "kz",
        "la", "lacaixa", "land", "latrobe", "lawyer", "lb", "lc", "lds", "lease", "legal", "lgbt", "li", "lidl", "life", "lighting", "limited", "limo", "link", "lk",
        "loans", "london", "lotto", "lr", "ls", "lt", "ltda", "lu", "luxe", "luxury", "lv", "ly", "ma", "madrid", "maison", "management", "mango", "market", "marketing",
        "mc", "md", "me", "media", "meet", "melbourne", "meme", "memorial", "menu", "mg", "mh", "miami", "mil", "mini", "mk", "ml", "mm", "mn", "mo", "mobi", "moda",
        "moe", "monash", "money", "mormon", "mortgage", "moscow", "motorcycles", "mov", "mp", "mq", "mr", "ms", "mt", "mu", "museum", "mv", "mw", "mx", "my", "mz", "na",
        "nagoya", "name", "navy", "nc", "ne", "net", "network", "neustar", "new", "nexus", "nf", "ng", "ngo", "nhk", "ni", "ninja", "nl", "no", "np", "nr", "nra", "nrw",
        "nu", "nyc", "nz", "okinawa", "om", "ong", "onl", "ooo", "org", "organic", "osaka", "otsuka", "ovh", "pa", "paris", "partners", "parts", "party", "pe", "pf", "pg",
        "ph", "pharmacy", "photo", "photography", "photos", "physio", "pics", "pictures", "pink", "pizza", "pk", "pl", "place", "plumbing", "pm", "pn", "pohl", "poker",
        "porn", "post", "pr", "praxi", "press", "pro", "prod", "productions", "prof", "properties", "property", "ps", "pt", "pub", "pw", "py", "qa", "qpon", "quebec",
        "re", "realtor", "recipes", "red", "rehab", "reise", "reisen", "reit", "ren", "rentals", "repair", "report", "republican", "rest", "restaurant", "reviews",
        "rich", "rio", "rip", "ro", "rocks", "rodeo", "rs", "rsvp", "ru", "ruhr", "rw", "ryukyu", "sa", "saarland", "samsung", "sarl", "sb", "sc", "sca", "scb", "schmidt",
        "schule", "schwarz", "science", "scot", "sd", "se", "services", "sew", "sexy", "sg", "sh", "shiksha", "shoes", "si", "singles", "sj", "sk", "sky", "sl", "sm", "sn",
        "so", "social", "software", "sohu", "solar", "solutions", "soy", "space", "spiegel", "sr", "st", "su", "supplies", "supply", "support", "surf", "surgery",
        "suzuki", "sv", "sx", "sy", "sydney", "systems", "sz", "taipei", "tatar", "tattoo", "tax", "tc", "td", "technology", "tel", "tf", "tg", "th", "tienda", "tips",
        "tirol", "tj", "tk", "tl", "tm", "tn", "to", "today", "tokyo", "tools", "top", "town", "toys", "tp", "tr", "trade", "training", "travel", "trust", "tt", "tui",
        "tv", "tw", "tz", "ua", "ug", "uk", "university", "uno", "uol", "us", "uy", "uz", "va", "vacations", "vc", "ve", "vegas", "ventures", "versicherung", "vet", "vg",
        "vi", "viajes", "villas", "vision", "vlaanderen", "vn", "vodka", "vote", "voting", "voto", "voyage", "vu", "wales", "wang", "watch", "webcam", "website",
        "wed", "wedding", "wf", "whoswho", "wien", "wiki", "williamhill", "wme", "work", "works", "world", "ws", "wtc", "wtf", "xn--1qqw23a", "xn--3bst00m",
        "xn--3ds443g", "xn--3e0b707e", "xn--45brj9c", "xn--45q11c", "xn--4gbrim", "xn--55qw42g", "xn--55qx5d", "xn--6frz82g", "xn--6qq986b3xl", "xn--80adxhks",
        "xn--80ao21a", "xn--80asehdb", "xn--80aswg", "xn--90a3ac", "xn--c1avg", "xn--cg4bki", "xn--clchc0ea0b2g2a9gcd", "xn--czr694b", "xn--czrs0t",
        "xn--czru2d", "xn--d1acj3b", "xn--d1alf", "xn--fiq228c5hs", "xn--fiq64b", "xn--fiqs8s", "xn--fiqz9s", "xn--flw351e", "xn--fpcrj9c3d", "xn--fzc2c9e2c",
        "xn--gecrj9c", "xn--h2brj9c", "xn--hxt814e", "xn--i1b6b1a6a2e", "xn--io0a7i", "xn--j1amh", "xn--j6w193g", "xn--kprw13d", "xn--kpry57d", "xn--kput3i",
        "xn--l1acc", "xn--lgbbat1ad8j", "xn--mgb9awbf", "xn--mgba3a4f16a", "xn--mgbaam7a8h", "xn--mgbab2bd", "xn--mgbayh7gpa", "xn--mgbbh1a71e",
        "xn--mgbc0a9azcg", "xn--mgberp4a5d4ar", "xn--mgbx4cd0ab", "xn--ngbc5azd", "xn--node", "xn--nqv7f", "xn--nqv7fs00ema", "xn--o3cw4h", "xn--ogbpf8fl",
        "xn--p1acf", "xn--p1ai", "xn--pgbs0dh", "xn--q9jyb4c", "xn--qcka1pmc", "xn--rhqv96g", "xn--s9brj9c", "xn--ses554g", "xn--unup4y",
        "xn--vermgensberater-ctb", "xn--vermgensberatung-pwb", "xn--vhquv", "xn--wgbh1c", "xn--wgbl6a", "xn--xhq521b", "xn--xkc2al3hye2a",
        "xn--xkc2dl3a5ee0h", "xn--yfro4i67o", "xn--ygbi2ammx", "xn--zfr164b", "xxx", "xyz", "yachts", "yandex", "ye", "yoga", "yokohama", "youtube", "yt", "za",
        "zip", "zm", "zone", "zw");

}

1 个答案:

答案 0 :(得分:1)

没有人回答我的问题所以我改进了我的方法,我会像这样使用:

public function get_pure_domain($p_domain, $ext=1)
    {

        //remove protocols
        $clean = array("http://", "https://", "ftp://", "@", "ftps://", ":");
        $domain = str_replace($clean, "", $p_domain);

        //remove slashes before dot
        $dot_len=strpos($domain,'.');
        if ( strpos( substr($domain,0,$dot_len) ,'/')) return false;

        if (!$dot_len)
        return false;


        // if slashes exist remove after
        if (strpos($domain, "/"))
            $domain=substr($domain, 0, strpos($domain, "/") );


        $clean_domain=$domain;

        // explode with dot
        $items = explode(".", $domain);

        //select probable domain without extension
        $sub_domain_count=count($items)-2;
        $pure_domain=$items[$sub_domain_count];


        //if selected domain matched an extension, select provious (go back -1 in array)

        $d_count=0;
        while(1==1)
        {
            $d_count++;

            if ( $this->validate_ext($pure_domain) )
                $pure_domain=$items[--$sub_domain_count];
            else
                break;

            //break infinite loop after 50
            if ($d_count==50) break;

        }

        // if requested, return with extension
        if ($ext==1)
        {
            $extension_start=strpos($clean_domain,$pure_domain)+strlen($pure_domain);
            $pure_domain=$pure_domain.substr($clean_domain,$extension_start);
        }


        //total parts cannot be greater than 3
        $items = explode(".", $pure_domain);
        if(count($items)>2)
        {
            $pure_domain = $items[count($items) - 3] . '.' . $items[count($items) - 2] . '.' . $items[count($items) - 1];

            //search for last two parts in extensions if not exist return these two parts as domain
            if ( !$this->validate_ext($items[count($items) - 2] . '.' . $items[count($items) - 1]) )
            {
                if ($ext==1)
                    $pure_domain=$items[count($items) - 2] . '.' . $items[count($items) - 1];
                else
                    $pure_domain=$items[count($items) - 2];
            }

        }

        //if extensions and domain are same algorithm may take subdomain as domain
        $items = explode(".", $pure_domain);
        if(count($items)>1)
        {
            if ($items[count($items) - 1] == $items[count($items) - 2])
            {
                if ($ext==1)
                    $pure_domain = $items[count($items) - 2] . '.' . $items[count($items) - 1];
                else
                    $pure_domain=$items[count($items) - 2];
            }
        }


        //if domain numeric may be an ip return false
        if (is_numeric($pure_domain))
            return false;

        //return result
        return $pure_domain;

    }