Return Base Domain from URL (String Stuff)

Example:

Dim strURL as string
strURL = "http://www.experts-exchange.com/Programming/Programming_Languages/Visual_Basic/askQuestion.jsp"

Msgbox ReturnBaseDomain(strURL)

How do I make the function ReturnBaseDomain to return only the  www.experts-exchange.com part of the url.



HepenAsked:
Who is Participating?
I wear a lot of hats...

"The solutions and answers provided on Experts Exchange have been extremely helpful to me over the last few years. I wear a lot of hats - Developer, Database Administrator, Help Desk, etc., so I know a lot of things but not a lot about one thing. Experts Exchange gives me answers from people who do know a lot about one thing, in a easy to use platform." -Todd S.

 
Patrick MatthewsCommented:
Hi Hepen,

Add this function to your project:

Function RegExpFind(LookIn As String, PatternStr As String, Optional Pos, _
    Optional MatchCase As Boolean = True)

    ' This function uses Regular Expressions to parse a string (LookIn), and return matches to a
    ' pattern (PatternStr).  Use Pos to indicate which match you want:
    ' Pos omitted               : function returns a zero-based array of all matches
    ' Pos = 0                   : the last match
    ' Pos = 1                   : the first match
    ' Pos = 2                   : the second match
    ' Pos = <positive integer>  : the Nth match
    ' If Pos is greater than the number of matches, is negative, or is non-numeric, the function
    ' returns an empty string.  If no match is found, the function returns an empty string
   
    ' If MatchCase is omitted or True (default for RegExp) then the Pattern must match case (and
    ' thus you may have to use [a-zA-Z] instead of just [a-z] or [A-Z]).
   
    ' If you use this function in Excel, you can use range references for any of the arguments.
    ' If you use this in Excel and return the full array, make sure to set up the formula as an
    ' array formula.  If you need the array formula to go down a column, use TRANSPOSE()
   
    Dim RegX As Object
    Dim TheMatches As Object
    Dim Answer() As String
    Dim Counter As Long
   
    ' Evaluate Pos.  If it is there, it must be numeric and converted to Long
    If Not IsMissing(Pos) Then
        If Not IsNumeric(Pos) Then
            RegExpFind = ""
            Exit Function
        Else
            Pos = CLng(Pos)
        End If
    End If
   
    ' Create instance of RegExp object
    Set RegX = CreateObject("VBScript.RegExp")
    With RegX
        .Pattern = PatternStr
        .Global = True
        .IgnoreCase = Not MatchCase
    End With
       
    ' Test to see if there are any matches
    If RegX.test(LookIn) Then
       
        ' Run RegExp to get the matches, which are returned as a zero-based collection
        Set TheMatches = RegX.Execute(LookIn)
       
        ' If Pos is missing, user wants array of all matches.  Build it and assign it as the
        ' function's return value
        If IsMissing(Pos) Then
            ReDim Answer(0 To TheMatches.Count - 1) As String
            For Counter = 0 To UBound(Answer)
                Answer(Counter) = TheMatches(Counter)
            Next
            RegExpFind = Answer
       
        ' User wanted the Nth match (or last match, if Pos = 0).  Get the Nth value, if possible
        Else
            Select Case Pos
                Case 0                          ' Last match
                    RegExpFind = TheMatches(TheMatches.Count - 1)
                Case 1 To TheMatches.Count      ' Nth match
                    RegExpFind = TheMatches(Pos - 1)
                Case Else                       ' Invalid item number
                    RegExpFind = ""
            End Select
        End If
   
    ' If there are no matches, return empty string
    Else
        RegExpFind = ""
    End If
   
    ' Release object variables
    Set RegX = Nothing
    Set TheMatches = Nothing
   
End Function



Then use an expression like this:

SomeVariable = Replace(RegExpFind(strURL, "//[^/]*/", 1), "/", "")

Regards,

Patrick
0
 
nffvrxqgrcfqvvcCommented:
Option Explicit

Public Function CrackUrl(URL As String) As String
    CrackUrl = Left$(URL, InStr(8, URL, "/"))
End Function

Private Sub Form_Load()
    MsgBox CrackUrl("http://www.experts-exchange.com/Programming/Programming_Languages/Visual_Basic/askQuestion.jsp")
End Sub
0
 
HepenAuthor Commented:
Is there a way to make it so it doesn't return http:// or /

just www.experts-exchange.com
0
Ultimate Tool Kit for Technology Solution Provider

Broken down into practical pointers and step-by-step instructions, the IT Service Excellence Tool Kit delivers expert advice for technology solution providers. Get your free copy now.

 
Patrick MatthewsCommented:
Hepen,
> Is there a way to make it so it doesn't return http:// or /

My post does that...

Regards,

Patrick
0
 
HepenAuthor Commented:
Only problem is i have to use regex, i was hoping for a string manipulated way. Although I will use your code for another project for my server application.
0
 
nffvrxqgrcfqvvcCommented:
If you want to use API than there is another way
0
 
HepenAuthor Commented:
what is the api way
0
 
[ fanpages ]IT Services ConsultantCommented:
How about this?...

Option Explicit
Public Function strGet_Domain_NameHTMLCode(ByVal strURL As String) As String

  Dim objInternetExplorer_Application                   As Object
  Dim strReturn                                         As String
 
  On Error GoTo Err_strGet_Domain_Name
 
  strReturn = ""
 
  Set objInternetExplorer_Application = CreateObject("InternetExplorer.Application")
 
  Call objInternetExplorer_Application.Navigate(strURL)
 
  While (objInternetExplorer_Application.Busy)
      DoEvents
  Wend
 
  strReturn = objInternetExplorer_Application.Document.domain
 
Exit_strGet_Domain_Name:

  On Error Resume Next
 
  If Not (objInternetExplorer_Application Is Nothing) Then
     objInternetExplorer_Application.Quit
     Set objInternetExplorer_Application = Nothing
  End If
 
  strGet_Domain_NameHTMLCode = strReturn
 
  Exit Function
 
Err_strGet_Domain_Name:

  On Error Resume Next
 
  strReturn = ""
 
  Resume Exit_strGet_Domain_Name
 
End Function
Public Sub Test()

  MsgBox strGet_Domain_NameHTMLCode("http://www.experts-exchange.com/Programming/Programming_Languages/Visual_Basic/askQuestion.jsp"), _
         vbInformation Or vbOKOnly
         
End Sub



BFN,

fp.
0
 
[ fanpages ]IT Services ConsultantCommented:
Did you mean the "UrlGetPartA"/"UrlGetPart" API?

[ http://vbnet.mvps.org/index.html?code/internet/urlgetpart.htm ]

BFN,

fp.
0
 
nffvrxqgrcfqvvcCommented:
Well my first thought was InternetCrackUrl API
0
 
Patrick MatthewsCommented:
Hepen,
> Only problem is i have to use regex, i was hoping for a string manipulated way.

What's wrong with using RegEx?  BTW, I found a pattern string that does away with the Replace:

SomeVariable = RegExpFind(strURL, "[a-z0-9-]{2,}\.[a-z0-9-]{2,}(\.[a-z0-9-]{2,}){0,}", 1, False)

That will work regardless of whether there are any forward slashes in the URL.

Regards,

Patrick
0
 
[ fanpages ]IT Services ConsultantCommented:
For example...

Option Explicit

Private Declare Function UrlGetPart _
                     Lib "shlwapi.dll" _
                   Alias "UrlGetPartA" _
                  (ByVal pszIn As String, _
                   ByVal pszOut As String, _
                   ByRef pcchOut As Long, _
                   ByVal dwPart As Long, _
                   ByVal dwFlags As Long) As Long
Private Function strGet_URL_Part(ByVal strURL As String, _
                                 ByVal lngPart As Long) As String

  Dim lngSize                                           As Long
  Dim strPart                                           As String
  Dim strReturn                                         As String
 
  On Error GoTo Err_strGet_URL_Part
 
  Const lngMAX_PATH                                     As Long = 260&
  Const lngURL_PARTFLAG_KEEPSCHEME                      As Long = &H1

  strReturn = ""
 
  If Len(strURL) > 0 Then
     strPart = Space$(lngMAX_PATH)
     lngSize = Len(strPart)
     
     If UrlGetPart(strURL, _
                   strPart, _
                   lngSize, _
                   lngPart, _
                   lngURL_PARTFLAG_KEEPSCHEME) = 0& Then
        strReturn = Left$(strPart, lngSize)
     End If
  End If

Exit_strGet_URL_Part:

  On Error Resume Next
 
  strGet_URL_Part = strReturn
 
Err_strGet_URL_Part:

  On Error Resume Next
 
  strReturn = ""
 
  Resume Exit_strGet_URL_Part

End Function
Public Sub Test_GetURLPart()

  Dim strURL                                            As String
 
  On Error Resume Next
 
' Const lngURL_PART_SCHEME                              As Long = 1&
  Const lngURL_PART_HOSTNAME                            As Long = 2&
' Const lngURL_PART_USERNAME                            As Long = 3&
' Const lngURL_PART_PASSWORD                            As Long = 4&
' Const lngURL_PART_PORT                                As Long = 5&
' Const lngURL_PART_QUERY                               As Long = 6&
 
  strURL = "http://www.experts-exchange.com/Programming/Programming_Languages/Visual_Basic/askQuestion.jsp"
 
  MsgBox Replace(strGet_URL_Part(strURL, lngURL_PART_HOSTNAME), Left$(strURL & "/", InStr(strURL, "/") - 1), ""), _
         vbInformation Or vbOKOnly
 
End Sub



BFN,

fp.
0
 
[ fanpages ]IT Services ConsultantCommented:
PS. egl1044's API suggestion:

[ http://msdn.microsoft.com/library/en-us/wininet/wininet/internetcrackurl.asp ]

And sample class file showing code for usage of...
InternetCanonicalizeUrl
InternetCrackUrl
InternetCreateUrl

[ http://vb.mvps.org/samples/project.asp?id=NetUrl ]

' *************************************************************************
'  Copyright ©1998-2000 Karl E. Peterson
'  All Rights Reserved, http://www.mvps.org/vb
' *************************************************************************
'  You are free to use this code within your own applications, but you
'  are expressly forbidden from selling or otherwise distributing this
'  source code, non-compiled, without prior written consent.
' *************************************************************************
Option Explicit

'=====================================================
'=========== INTERNet URL DECLARATIONS ===============
'=====================================================
Private Declare Function InternetCrackUrl Lib "wininet.dll" Alias "InternetCrackUrlA" (ByVal lpszUrl As String, ByVal dwUrlLength As Long, ByVal dwFlags As Long, lpUrlComponents As URL_COMPONENTS) As Long
Private Declare Function InternetCanonicalizeUrl Lib "wininet.dll" Alias "InternetCanonicalizeUrlA" (ByVal lpszUrl As String, ByVal lpszBuffer As String, lpdwBufferLength As Long, ByVal dwFlags As Long) As Long
Private Declare Function InternetCreateUrl Lib "wininet.dll" Alias "InternetCreateUrlA" (lpUrlComponents As URL_COMPONENTS, ByVal dwFlags As Long, ByVal lpszUrl As String, lpdwUrlLength As Long) As Long

Private Type URL_COMPONENTS       'typedef struct {
   StructSize As Long            '    DWORD dwStructSize;
   Scheme As String              '    LPSTR lpszScheme;
   SchemeLength As Long          '    DWORD dwSchemeLength;
   nScheme As Long               '    INTERNET_SCHEME nScheme;
   HostName As String            '    LPSTR lpszHostName;
   HostNameLength As Long        '    DWORD dwHostNameLength;
   nPort As Long                 '    INTERNET_PORT nPort;
   UserName As String            '    LPSTR lpszUserName;
   UserNameLength As Long        '    DWORD dwUserNameLength;
   Password As String            '    LPSTR lpszPassword;
   PasswordLength As Long        '    DWORD dwPasswordLength;
   URLPath As String             '    LPSTR lpszUrlPath;
   UrlPathLength As Long         '    DWORD dwUrlPathLength;
   ExtraInfo As String           '    LPSTR lpszExtraInfo;
   ExtraInfoLength As Long       '    DWORD dwExtraInfoLength;
End Type                         '} URL_COMPONENTS;
'
' Enumerated internet schemes
'
Public Enum InetSchemes
   InternetSchemePartial = -2
   InternetSchemeUnknown = -1
   InternetSchemeDefault = 0
   InternetSchemeFtp
   InternetSchemeGopher
   InternetSchemeHttp
   InternetSchemeHttps
   InternetSchemeFile
   InternetSchemeNews
   InternetSchemeMailto
   InternetSchemeSocks
   InternetSchemeFirst = InternetSchemeFtp
   InternetSchemeLast = InternetSchemeSocks
End Enum

' Flags for InternetCrackUrl() and InternetCreateUrl()
Private Const ICU_ESCAPE = &H80000000      '// (un)escape URL characters
Private Const ICU_USERNAME = &H40000000    '// use internal username &amp; password

' Flags for InternetCanonicalizeUrl() and InternetCombineUrl()
Private Const ICU_NO_ENCODE = &H20000000   '// Don't convert unsafe characters to escape sequence
Private Const ICU_DECODE = &H10000000      '// Convert %XX escape sequences to characters
Private Const ICU_NO_META = &H8000000      '// Don't convert .. etc. meta path sequences
Private Const ICU_ENCODE_SPACES_ONLY = &H4000000  '// Encode spaces only
Private Const ICU_BROWSER_MODE = &H2000000 '// Special encode/decode rules for browser

' Possible error codes
Private Const ERROR_INSUFFICIENT_BUFFER As Long = 122

' Set aside storage for private member variables.
Private m_Url As String
Private m_UrlCmp As URL_COMPONENTS
Private m_UrlCmpMT As URL_COMPONENTS
Private m_LastDllError As Long

' ********************************************
'  Initialize
' ********************************************
Private Sub Class_Initialize()
   ' set size into "empty" structure
   m_UrlCmpMT.StructSize = Len(m_UrlCmpMT)
End Sub

' ********************************************
'  Public Properties
' ********************************************
Public Property Let ExtraInfo(ByVal NewVal As String)
   m_UrlCmp.ExtraInfo = NewVal
   m_UrlCmp.ExtraInfoLength = Len(NewVal)
End Property

Public Property Get ExtraInfo() As String
   ExtraInfo = m_UrlCmp.ExtraInfo
End Property

Public Property Let HostName(ByVal NewVal As String)
   m_UrlCmp.HostName = NewVal
   m_UrlCmp.HostNameLength = Len(NewVal)
End Property

Public Property Get HostName() As String
   HostName = m_UrlCmp.HostName
End Property

Public Property Let Password(ByVal NewVal As String)
   m_UrlCmp.Password = NewVal
   m_UrlCmp.PasswordLength = Len(NewVal)
End Property

Public Property Get Password() As String
   Password = m_UrlCmp.Password
End Property

Public Property Let Port(ByVal NewVal As Long)
   m_UrlCmp.nPort = NewVal
End Property

Public Property Get Port() As Long
   Port = m_UrlCmp.nPort
End Property

Public Property Let Scheme(ByVal NewVal As InetSchemes)
   m_UrlCmp.nScheme = NewVal
End Property

Public Property Get Scheme() As InetSchemes
   Scheme = m_UrlCmp.nScheme
End Property

Public Property Let URLPath(ByVal NewVal As String)
   m_UrlCmp.URLPath = NewVal
   m_UrlCmp.UrlPathLength = Len(NewVal)
End Property

Public Property Get URLPath() As String
   URLPath = m_UrlCmp.URLPath
End Property

Public Property Let UserName(ByVal NewVal As String)
   m_UrlCmp.UserName = NewVal
   m_UrlCmp.UserNameLength = Len(NewVal)
End Property

Public Property Get UserName() As String
   UserName = m_UrlCmp.UserName
End Property

Public Property Let Value(ByVal NewVal As String)
   m_Url = NewVal
   m_UrlCmp = m_UrlCmpMT
   m_LastDllError = 0
   Call CrackUrl(m_Url, m_UrlCmp)
End Property

Public Property Get Value() As String
Attribute Value.VB_UserMemId = 0
   m_LastDllError = 0
   Value = CreateUrl(m_UrlCmp)
End Property

' ********************************************
'  Public Properties // Read-Only
' ********************************************
Public Property Get LastDllError() As Long
   LastDllError = m_LastDllError
End Property
   
Public Property Get SchemeName() As String
   SchemeName = m_UrlCmp.Scheme
End Property

' ********************************************
'  Private Methods
' ********************************************
Private Function CreateUrl(cmp As URL_COMPONENTS) As String
   Dim Buffer As String
   Dim BufLen As Long
   
   ' prepare buffers
   Buffer = String$(1024, 0)
   BufLen = Len(Buffer)
   cmp.StructSize = Len(cmp)
   
   ' try to put it all together
   If InternetCreateUrl(cmp, ICU_ESCAPE, Buffer, BufLen) Then
      CreateUrl = TrimNull(Buffer)
   Else
      If Err.LastDllError = ERROR_INSUFFICIENT_BUFFER Then
         ' bump buffer and try again
         Buffer = String$(BufLen, 0)
         If InternetCreateUrl(cmp, ICU_ESCAPE, Buffer, BufLen) Then
            CreateUrl = TrimNull(Buffer)
         End If
      Else
         m_LastDllError = Err.LastDllError
         Debug.Print "InternetCreateUrl error: " & m_LastDllError
      End If
   End If
End Function

Private Function CrackUrl(ByVal URL As String, cmp As URL_COMPONENTS) As Boolean
   Dim Buffer As String
   Dim BufLen As Long
   Const BufSize = 1024
   '
   ' Try to insure a valid URL to begin with
   '
   Buffer = Space$(BufSize)
   BufLen = Len(Buffer)
   If InternetCanonicalizeUrl(URL, Buffer, BufLen, ICU_BROWSER_MODE) Then
      URL = Left(Buffer, BufLen)
      '
      ' Reset structure and supply buffers
      '
      With cmp
         .StructSize = Len(cmp)
         .Scheme = Space$(BufSize)
         .SchemeLength = BufSize
         .nScheme = InternetSchemeUnknown
         .HostName = Space$(BufSize)
         .HostNameLength = BufSize
         .nPort = 0
         .UserName = Space$(BufSize)
         .UserNameLength = BufSize
         .Password = Space$(BufSize)
         .PasswordLength = BufSize
         .URLPath = Space$(BufSize)
         .UrlPathLength = BufSize
         .ExtraInfo = Space$(BufSize)
         .ExtraInfoLength = BufSize
      End With
      '
      ' Crack URL apart and get what we can from it.
      ' Note: API requires Len(URL) param to include trailing null!
      '
      CrackUrl = CBool(InternetCrackUrl(URL, Len(URL) + 1, ICU_ESCAPE, cmp))
      '
      ' Clean up structure to get rid of crapola
      '
      With cmp
         .Scheme = Left$(.Scheme, .SchemeLength)
         .HostName = Left$(.HostName, .HostNameLength)
         .UserName = Left$(.UserName, .UserNameLength)
         .Password = Left$(.Password, .PasswordLength)
         .URLPath = Left$(.URLPath, .UrlPathLength)
         .ExtraInfo = Left$(.ExtraInfo, .ExtraInfoLength)
      End With
   Else
      CrackUrl = False
   End If
End Function

Private Function TrimNull(ByVal StrIn As String) As String
   Dim nul As Long
   '
   ' Truncate input string at first null.
   ' If no nulls, perform ordinary Trim.
   '
   nul = InStr(StrIn, vbNullChar)
   Select Case nul
      Case Is > 1
         TrimNull = Left(StrIn, nul - 1)
      Case 1
         TrimNull = ""
      Case 0
         TrimNull = Trim(StrIn)
   End Select
End Function


BFN,

fp.
0
 
Patrick MatthewsCommented:
Nigel,

Would you be so kind as to see if I am missing anything in my RegExp pattern string?  It is trying to match:

[two or more characters being any number, letter, or hyphen] + [period] + [two or more characters being any number, letter, or hyphen] +
[a period followed by two or more characters being any number, letter, or hyphen, this entire block repeated zero or more times]

That sounds like a valid answer to me, without getting *really* complicated (I know, for example, that this does not fit the strictest rules
for the top-level domains...).

Regards,

Patrick
0
 
nffvrxqgrcfqvvcCommented:
Heres the example I came up with


Option Explicit

    Private Type URL_COMPONENTS
       StructSize           As Long
       Scheme               As String
       SchemeLength         As Long
       nScheme              As Long
       HostName             As String
       HostNameLength       As Long
       nPort                As Long
       UserName             As String
       UserNameLength       As Long
       Password             As String
       PasswordLength       As Long
       URLPath              As String
       UrlPathLength        As Long
       ExtraInfo            As String
       ExtraInfoLength      As Long
    End Type
   
    Private Declare Function InternetCrackUrlA Lib "wininet.dll" ( _
        ByVal lpszUrl As String, _
        ByVal dwUrlLength As Long, _
        ByVal dwFlags As Long, _
        lpUrlComponents As URL_COMPONENTS) As Long
   
    Dim uCmp As URL_COMPONENTS
   
Public Function CrackUrl(ByVal url As String) As String
    With uCmp
        .StructSize = Len(uCmp)
        .HostName = Space$(1024)
        .HostNameLength = 1024
        InternetCrackUrlA url, Len(url), &H80000000, uCmp
        CrackUrl = Left$(.HostName, InStr(1, .HostName, Chr$(0)) - 1)
    End With
End Function

Private Sub Command1_Click()
    MsgBox CrackUrl("http://www.google.com/Programming/")
End Sub
0
 
nffvrxqgrcfqvvcCommented:
Who's Nigel
0
 
Patrick MatthewsCommented:
Hepen,

Will your text *always* start with "http://"?  My RegExp-enabled function will cope with that eventuality just fine,
but I am not sure the API-based methods will...

(Meaning no disrespect at all toward Nigel or egl1044, of course!)

Regards,

Patrick
0
 
Patrick MatthewsCommented:
egl1044,
> Who's Nigel

fanpages = Nigel

:)

Regards,

Patrick
0
 
nffvrxqgrcfqvvcCommented:
haha ok, well author has 3 ways to do this now so I think he got his answere.
0
 
Patrick MatthewsCommented:
egl1044,
> haha ok, well author has 3 ways to do this now so I think he got his answere.

LOL!

Regards,

Patrick
0
 
justchat_1Commented:
as far as no "http://" in the string:
string=replace(string, "http://","")
0
 
Mark_FreeSoftwareCommented:
why always go for the api or the difficult way?


Private Function TrimURL(ByVal str As String) As String
Dim n As Long
   n = InStr(str, "//")
   If n > 0 Then str = Right$(str, Len(str) - n - 1)
   n = InStr(str, "/")
   If n > 0 Then str = Left$(str, n - 1)
End Function

0

Experts Exchange Solution brought to you by ConnectWise

Your issues matter to us.

Facing a tech roadblock? Get the help and guidance you need from experienced professionals who care. Ask your question anytime, anywhere, with no hassle.

Start your 7-day free trial
 
Patrick MatthewsCommented:
Mark,

> why always go for the api or the difficult way?

Your method will work just fine, but *only* if the string we want is *always* bounded by "//" on the left and "/" on
the right.  The Asker gave us one example, not a rule that it would always follow that example.

Regards,

Patrick
0
 
zzzzzoocCommented:
Why not just Split()? I don't see any obvious faults with that concept, unless user&pass/port are tagged to the syntax.

Option Explicit
Private Sub Form_Load()
    Call MsgBox(ReturnBaseDomain("http://www.experts-exchange.com/Programming/askQuestion.jsp"))
    Call MsgBox(ReturnBaseDomain("ftp://www.experts-exchange.com/Programming/askQuestion.jsp"))
    Call MsgBox(ReturnBaseDomain("www.experts-exchange.com/askQuestion.jsp"))
    Call MsgBox(ReturnBaseDomain("www.experts-exchange.com"))
    Call MsgBox(ReturnBaseDomain("www.experts-exchange.com/"))
End Sub
Private Function ReturnBaseDomain(ByRef url As String) As String
    Dim s() As String
    s = Split(url, "/")
    If (Right$(s(0), 1) = ":") Then
        ReturnBaseDomain = s(2)
    Else
        ReturnBaseDomain = s(0)
    End If
End Function
0
 
[ fanpages ]IT Services ConsultantCommented:
The "Split" approach should work just fine as long as you could detect for the presence of "/" or "\" (depending on how the URL was presented/specified in the originating environment; e.g. DOS/Windows or UNIX based).

PS. With regards stripping the optional prefix of "Scheme" ("http://" "ftp://" "https://" "file://" "telnet://" etc)

Note the use of the Replace(...) function in my code demonstrating the strGet_URL_Part(...) method:

MsgBox Replace(strGet_URL_Part(strURL, lngURL_PART_HOSTNAME), Left$(strURL & "/", InStr(strURL, "/") - 1), ""), _
         vbInformation Or vbOKOnly

PPS. Patrick...

"Would you be so kind as to see if I am missing anything in my RegExp pattern string?  It is trying to match:

[two or more characters being any number, letter, or hyphen] + [period] + [two or more characters being any number, letter, or hyphen] +
[a period followed by two or more characters being any number, letter, or hyphen, this entire block repeated zero or more times]

That sounds like a valid answer to me, without getting *really* complicated (I know, for example, that this does not fit the strictest rules for the top-level domains...)."


I have seen a huge 'regexp' string for e-mail validation in my surfin' travels, & I'm sure there was a similar one for URL parsing... I'll see if I can find it again.

BFN,

fp.
0
 
[ fanpages ]IT Services ConsultantCommented:
PPPS. [!] Mark_FreeSoftware:

Private Function TrimURL(ByVal str As String) As String
Dim n As Long
   n = InStr(str, "//")
   If n > 0 Then str = Right$(str, Len(str) - n - 1)
   n = InStr(str, "/")
   If n > 0 Then str = Left$(str, n - 1)

  TrimURL = str ' *** ADDED :)

End Function


BFN,

fp.
0
 
[ fanpages ]IT Services ConsultantCommented:
Hi again Patrick,

Here's a few 'regexp' examples for URL parsing:

[ http://textsnippets.com/posts/show/523 ]
"^((http|https):\/)?\/?([^:\/\s]+)((\/\w+)*\/)([\w\-\.]+\.[^#?\s]+)(.*?)(#[\w\-]+)?$"

(Obviously need to add more "protocol scheme" prefixes to that, e.g. "mailto", "ftp", "ftps", "file", "telnet", "ldap", "news", "gopher", "data", "finger", "local", "local-cgi", "about", and any more I've missed!)

(More searching...)
Comprehensive list here:
[ http://www.iana.org/assignments/uri-schemes.html ]

Further reading here:
[ http://esw.w3.org/topic/UriSchemes ]
[ http://www.faqs.org/rfcs/rfc1738.html ]


PS. Anybody good with converting Python scripts to VB(A)?
[ http://docs.python.org/lib/module-urlparse.html ]
and
[ http://mail.python.org/pipermail/doc-sig/2001-March/001456.html ]
r'\b((?:http|ftp|https|mailto)://[\w@&#-_.!~*();]+\b/?)'


BFN,

fp.
0
 
[ fanpages ]IT Services ConsultantCommented:
Thanks for asking & subsequently closing the question, Hepen.

Hopefully all the options should prove to be a valuable resource for future question askers/Experts alike :)

BFN,

fp.
0
 
Mark_FreeSoftwareCommented:

lol fanpages, that's like a quadropost ;)

matthewspatrick,

> Your method will work just fine, but *only* if the string we want is *always* bounded by "//" on the left and "/" on
> the right.  The Asker gave us one example, not a rule that it would always follow that example.


no, it works fine without
try it !

(i did, with these urls:
http://www.test.com
http://www.test.com/test
www.test.com
www.test.com/test)

and they all worked
0
 
[ fanpages ]IT Services ConsultantCommented:
Hi Mark,

Thanks... erm, I think.  "quadropost"?  A phrase I'm not familiar with :(

Is that German or another great divide of the language used by both UK & US cousins?

BFN,

fp.
0
 
Mark_FreeSoftwareCommented:

hehe it's not german, it's....
err i don't know


quadro is italian i think it means 4


and no i'm not an italian ;)
0
 
[ fanpages ]IT Services ConsultantCommented:
PS. The e-mail validation "regexp" I was referring to:

In a word, OUCH!

[ http://regular-expressions.com/email.html ]

(?:(?:\r\n)?[ \t])*(?:(?:(?:[^()<>@,;:\\".\[\] \x00-\x1F]+(?:(?:
(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.
|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t]
)*(?:[^()<>@,;:\\".\[\] \x00-\x1F]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=
[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(
?:(?:\r\n)?[ \t])*))*@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \
x00-\x1F]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\
[([^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])
*(?:[^()<>@,;:\\".\[\] \x00-\x1F]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[
\["()<>@,;:\\".\[\]]))|\[([^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*
))*|(?:[^()<>@,;:\\".\[\] \x00-\x1F]+(?:(?:(?:\r\n)?[ \t])+|\Z|(
?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*
"(?:(?:\r\n)?[ \t])*)*\<(?:(?:\r\n)?[ \t])*(?:@(?:[^()<>@,;:\\".
\[\] \x00-\x1F]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\
]]))|\[([^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?
[ \t])*(?:[^()<>@,;:\\".\[\] \x00-\x1F]+(?:(?:(?:\r\n)?[ \t])+|\
Z|(?=[\["()<>@,;:\\".\[\]]))|\[([^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[
 \t])*))*(?:,@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \x00-\x1F
]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[([^\[\]
\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()
<>@,;:\\".\[\] \x00-\x1F]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@
,;:\\".\[\]]))|\[([^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*)*:(?
:(?:\r\n)?[ \t])*)?(?:[^()<>@,;:\\".\[\] \x00-\x1F]+(?:(?:(?:\r\
n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?
:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[
^()<>@,;:\\".\[\] \x00-\x1F]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()
<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\
r\n)?[ \t])*))*@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \x00-\x
1F]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[([^\[
\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^
()<>@,;:\\".\[\] \x00-\x1F]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<
>@,;:\\".\[\]]))|\[([^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*\>(
?:(?:\r\n)?[ \t])*)|(?:[^()<>@,;:\\".\[\] \x00-\x1F]+(?:(?:(?:\r
\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(
?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)*:(?:(?:\r\n)?[ \t])*(?:(?:
(?:[^()<>@,;:\\".\[\] \x00-\x1F]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\
["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:
(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \
x00-\x1F]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"
(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*))*@(?:
(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \x00-\x1F]+(?:(?:(?:\r\n)?
[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[([^\[\]\r\\]|\\.)*\](?:(
?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \x
00-\x1F]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[
([^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*|(?:[^()<>@,;:\\".\[\]
 \x00-\x1F]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))
|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)*\<(
?:(?:\r\n)?[ \t])*(?:@(?:[^()<>@,;:\\".\[\] \x00-\x1F]+(?:(?:(?:
\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[([^\[\]\r\\]|\\.)*
\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\
[\] \x00-\x1F]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]
]))|\[([^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*(?:,@(?:(?:\r\n)
?[ \t])*(?:[^()<>@,;:\\".\[\] \x00-\x1F]+(?:(?:(?:\r\n)?[ \t])+|
\Z|(?=[\["()<>@,;:\\".\[\]]))|\[([^\[\]\r\\]|\\.)*\](?:(?:\r\n)?
[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \x00-\x1F]
+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[([^\[\]\
r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*)*:(?:(?:\r\n)?[ \t])*)?(?:[^(
)<>@,;:\\".\[\] \x00-\x1F]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>
@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\
n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \x00-\x
1F]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?:[^\
"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*))*@(?:(?:\r\
n)?[ \t])*(?:[^()<>@,;:\\".\[\] \x00-\x1F]+(?:(?:(?:\r\n)?[ \t])
+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[([^\[\]\r\\]|\\.)*\](?:(?:\r\n
)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \x00-\x1
F]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[([^\[\
]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*\>(?:(?:\r\n)?[ \t])*)(?:,\s
*(?:(?:[^()<>@,;:\\".\[\] \x00-\x1F]+(?:(?:(?:\r\n)?[ \t])+|\Z|(
?=[\["()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*
"(?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[
\] \x00-\x1F]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]
))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*))*
@(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \x00-\x1F]+(?:(?:(?:\r
\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[([^\[\]\r\\]|\\.)*\]
(?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\
] \x00-\x1F]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]])
)|\[([^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*|(?:[^()<>@,;:\\".
\[\] \x00-\x1F]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\
]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*)
*\<(?:(?:\r\n)?[ \t])*(?:@(?:[^()<>@,;:\\".\[\] \x00-\x1F]+(?:(?
:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[([^\[\]\r\\]|\
\.)*\](?:(?:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\
\".\[\] \x00-\x1F]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".
\[\]]))|\[([^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*(?:,@(?:(?:\
r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \x00-\x1F]+(?:(?:(?:\r\n)?[ \t
])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[([^\[\]\r\\]|\\.)*\](?:(?:\r
\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \x00-\
x1F]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[([^\
[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*)*:(?:(?:\r\n)?[ \t])*)?(?
:[^()<>@,;:\\".\[\] \x00-\x1F]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["
()<>@,;:\\".\[\]]))|"(?:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?
:\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \x0
0-\x1F]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|"(?
:[^\"\r\\]|\\.|(?:(?:\r\n)?[ \t]))*"(?:(?:\r\n)?[ \t])*))*@(?:(?
:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \x00-\x1F]+(?:(?:(?:\r\n)?[
\t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[([^\[\]\r\\]|\\.)*\](?:(?:
\r\n)?[ \t])*)(?:\.(?:(?:\r\n)?[ \t])*(?:[^()<>@,;:\\".\[\] \x00
-\x1F]+(?:(?:(?:\r\n)?[ \t])+|\Z|(?=[\["()<>@,;:\\".\[\]]))|\[([
^\[\]\r\\]|\\.)*\](?:(?:\r\n)?[ \t])*))*\>(?:(?:\r\n)?[ \t])*))*
)?;\s*)

See also:
[ http://sleeksoft.co.uk/public/techblog/articles/20050121_3.html ]
(A blog/site owned by a previous colleague of mine)

BFN,

fp.
0
 
[ fanpages ]IT Services ConsultantCommented:
Well, yes, "quad" is Latin for 'four'.

I just didn't understand what you meant by "quadropost".
0
 
Mark_FreeSoftwareCommented:

whoever wrote that,

he has too much free time :P
0
 
[ fanpages ]IT Services ConsultantCommented:
[And whomever just read it...? ;)]
0
 
[ fanpages ]IT Services ConsultantCommented:
More to digest, Patrick:

[ http://www.cambiaresearch.com/cambia3/snippets/csharp/regex/uri_regex.aspx ]

regexPattern = @"^(?<s1>(?<s0>[^:/\?#]+):)?(?<a1>"
      + @"//(?<a0>[^/\?#]*))?(?<p0>[^\?#]*)"
      + @"(?<q1>\?(?<q0>[^#]*))?"
      + @"(?<f1>#(?<f0>.*))?";
0
 
[ fanpages ]IT Services ConsultantCommented:
And this may have some relevance - note the comment about URLs without domains; those that are relative!

[ http://www.cflib.org/udf.cfm?ID=494 ]

===
<cfscript>
/**
 * Returns the host from a specified URL.
 * RE fix for MX, thanks to Tom Lane
 *
 * @param this_url        URL to parse. (Required)
 * @return Returns a string.
 * @author Shawn Seley (shawnse@aol.com)
 * @version 2, August 23, 2002
 */
function GetHostFromURL(this_url) {
      var first_char       = "";
      var re_found_struct  = "";
      var num_expressions  = 0;
      var num_dots         = 0;
      var this_host        = "";
      
      this_url = trim(this_url);
      
      first_char = Left(this_url, 1);
      if (Find(first_char, "./")) {
            return "";   // relative URL = no host   (ex: "../dir1/filename.html" or "/dir1/filename.html")
      } else if(Find("://", this_url)){
            // absolute URL    (ex: "pass@ftp.host.com">ftp://user:pass@ftp.host.com")
            re_found_struct = REFind("[^@]*@([^/:\?##]+)|([^/:\?##]+)", this_url, Find("://", this_url)+3, "True");
      } else {
            // abbreviated URL (ex: "user:pass@ftp.host.com")
            re_found_struct = REFind("[^@]*@([^/:\?##]+)|([^/:\?##]+)", this_url, 1, "True");
      }
      
      if (re_found_struct.pos[1] GT 0) {
            num_expressions = ArrayLen(re_found_struct.pos);
                if(re_found_struct.pos[num_expressions] is 0) num_expressions = num_expressions - 1;
            this_host = Mid(this_url, re_found_struct.pos[num_expressions], re_found_struct.len[num_expressions]);
            num_dots = (Len(this_host) - Len(Replace(this_host, ".", "", "ALL")));;
            if ((not FindOneOf("/@:", this_url)) and (num_dots LT 2)){
                  // since this URL doesn't contain any "/" or "@" or ":" characters and since the "host" has fewer than two dots (".")
                  // then it is probably actually a file name
                  return "";
            }
            return this_host;
      } else {
            return "";
      }
}
</cfscript>
===


Also note the originating page where I found a link to the above code:

[ http://www.houseoffusion.com/cf_lists/message.cfm/forumid:4/messageid:126884 ]

"I found the GetHostFromURL() function on CFLib. The function seems to
work pretty well, it even strips port numbers etc. The problem with that
function is that it will strip anything before the first "." if there is
anything. So, http://www.something.com is returned as something.com .
However I need the hostname as well. I would modify the existing
function, however I am not too keen on RegEx atm.

...chris.alvarado
[ application developer ]
4 Guys Interactive, Inc.
..."
0
 
Patrick MatthewsCommented:
Nigel,

re: excruciatingly long URL and email address parsing pattern strings

I guess I just re-learned the meaning of "be careful of what you ask for".

Double :)

Patrick
0
 
[ fanpages ]IT Services ConsultantCommented:
I never leave any turn unstoned.
0
Question has a verified solution.

Are you are experiencing a similar issue? Get a personalized answer when you ask a related question.

Have a better answer? Share it in a comment.

All Courses

From novice to tech pro — start learning today.