============== RESTART: /Users/rjw/Documents/Python/getLinks.py ==============
<_sre.SRE_Match object at 0x1021a0f30>
# http://www.theherbsplacenews.com/
import urllib2
import re

# Connect to the URL and read the raw HTML of the page.
website = urllib2.urlopen('http://www.theherbsplacenews.com/')
html = website.read()

# Use a raw string (r'...') for the pattern so the backslashes reach the
# regex engine unmodified; '\.' then matches a literal dot.
# links = re.findall('"(http://www.theherbsplace.com/.*?)"', html)
prog = re.compile(r'"http://www\.theherbsplace\.com/(.*?)?"')

# BUG FIX: the original probe string contained '\1', which in a plain
# (non-raw) string literal is the single control character chr(1), NOT a
# backreference -- backreferences only have meaning inside a regex
# replacement string.  Probe with a literal path instead.
result = prog.match('"http://www.theherbsplace.com/page.html" rel="nofollow"')
print(result)
#!python2
# http://www.theherbsplacenews.com/
import urllib2
import re

# Fetch the page and keep its full HTML as one string.
website = urllib2.urlopen('http://www.theherbsplacenews.com/')
html = website.read()

# Pre-compiled pattern: captures the path portion of every quoted
# theherbsplace.com link (raw string keeps the '\.' escapes intact).
rex = re.compile(r'"http://www\.theherbsplace\.com/(.*?)?"')

# Grab every captured path and dump them to disk, one per line.
links = rex.findall(html)
with open('links.txt', 'w') as f:
    f.writelines(link + '\n' for link in links)
As Terry wrote, you should (probably always) use the r'raw string literals' for regular expression patterns. Otherwise, you would have to double every backslash that should not be interpreted as the start of a string escape sequence. Regular expressions also use the backslash for special purposes, and the two usages together make it confusing. This is the reason why languages with syntactically built-in regular expressions use a form of writing the patterns that differs from normal string literals (say, enclosed in slashes).
#!python2
import re

# Demo: the pattern captures whatever sits between the site prefix and the
# closing quote.  Raw string (r'...') keeps '\.' intact for the regex engine.
rex = re.compile(r'"http://www\.theherbsplace\.com/(.*?)?"')
m = rex.match('"http://www.theherbsplace.com/1" rel="nofollow"')
if m:
    # group(0) is the entire quoted URL that matched;
    # group(1) is just the captured path ('1' here).
    # print(...) with a single argument behaves identically in
    # Python 2 and Python 3, unlike the py2-only 'print x' statement.
    print(m.group(0))
    print(m.group(1))
I am not sure what you mean by \1 in your example, but you probably know now what you want to do ;) Otherwise, feel free to ask.
#!python2
# http://www.theherbsplacenews.com/
import urllib2
import re

# Connect to the URL and read the page's HTML.
website = urllib2.urlopen('http://www.theherbsplacenews.com/')
html = website.read()

# Raw string so the regex engine sees the backslashes; captures the path
# after the site prefix, up to the closing quote.
rex = re.compile(r'"http://www\.theherbsplace\.com/(.*?)?"')

# Save every captured link path, one per line.
links = rex.findall(html)
with open('links.txt', 'w') as f:
    for link in links:
        f.write(link + '\n')

# BUG FIX: the probe string used to contain '\1', which in a plain string
# literal is chr(1) (an unprintable control character), not a regex
# backreference.  Probe with a literal path so the demo prints something
# readable.
m = rex.match('"http://www.theherbsplace.com///page.html" rel="nofollow"')
if m:
    print(m.group(0))
    print(m.group(1))
# http://www.theherbsplacenews.com/
import urllib2
import re

# Connect to the URL and read the page's HTML.
website = urllib2.urlopen('http://www.theherbsplacenews.com/')
html = website.read()

# Rewrite every quoted theherbsplace.com link so it carries rel="nofollow".
# Both arguments are raw strings: in the pattern, '\.' is a literal dot;
# in the replacement, '\1' is a genuine backreference to the captured path.
result = re.sub(r'"http://www\.theherbsplace\.com/(.*?)?"',
                r'"http://www.theherbsplace.com/\1" rel="nofollow"',
                html)
print(result)

# Keep a copy on disk for inspection / diffing against the original page.
with open('new_document.html', 'w') as f:
    f.write(result)
and then look inside the generated file. Similarly, you can save the website content to the original_document.html, and then you can use the tool of your choice for comparing (diff) the two files (see http://alternativeto.net/software/winmerge/?platform=mac).
# http://www.theherbsplacenews.com/
import urllib2
import re

# Connect to the URL and read the page's HTML.
website = urllib2.urlopen('http://www.theherbsplacenews.com/')
html = website.read()

# Raw string so '\.' survives as a literal-dot escape for the regex engine.
pattern = re.compile(r'"http://www\.theherbsplace\.com/(.*?)?"')

# BUG FIX: the original call was a SyntaxError -- the replacement string was
# split in two by an unquoted rel="nofollow", and the call was missing both
# the html argument and the closing parenthesis.  The whole replacement must
# be ONE string; '\1' inside it is the backreference to the captured path.
result = pattern.sub(r'"http://www.theherbsplace.com/\1" rel="nofollow"', html)

# Save the rewritten document for inspection.
with open('new_document.html', 'w') as f:
    f.write(result)
============== RESTART: /Users/rjw/Documents/Python/getLinks.py ==============
>>>
Open in new window
with the code:Open in new window