muraliram
asked on
Search Engine Implementation
How I can implement a serch engine in Java. What is the basis of a search ?
ASKER CERTIFIED SOLUTION
membership
This solution is only available to members.
To access this solution, you must be a member of Experts Exchange.
WOW. It's also a biggest mystery hunting mefrom long time
Are u going to implement search engine
or search option for your site .
Search engine is capable of searching content from more than one webserver.
where as search option only search the
content in your server only or archives
u specifies.
In any case u need to understand the
concept before implementing in any of the language.
1)Formatting the html file:
First you have to collect all the keyword in the html file and put it in the meta tag of that html page
2)Uploading to the database:
Read meta tag in each html file and
transfer to the database accoring to the
category it comes.
3)Providing Interface:
Porivide a user interface so that user
can enter the keyword,author name or title . After the user press submit , retreive the data from the database and
display in the format that most search
engine usefully follows.
If u need any more clarification, specify me the exact problem.
or search option for your site .
Search engine is capable of searching content from more than one webserver.
where as search option only search the
content in your server only or archives
u specifies.
In any case u need to understand the
concept before implementing in any of the language.
1)Formatting the html file:
First you have to collect all the keyword in the html file and put it in the meta tag of that html page
2)Uploading to the database:
Read meta tag in each html file and
transfer to the database accoring to the
category it comes.
3)Providing Interface:
Porivide a user interface so that user
can enter the keyword,author name or title . After the user press submit , retreive the data from the database and
display in the format that most search
engine usefully follows.
If u need any more clarification, specify me the exact problem.
ASKER
Thanks for your answer mbormann's answer.
Thanks bava anand for clarifying my idea on the implementation of a search engine.
Thanks bava anand for clarifying my idea on the implementation of a search engine.
A Java Applet Search Engine
by Tim Kientzle
Listing One
public class DBBTree {
protected RandomAccessFile file;
protected File fileName;
private int pageSize; // page size
private boolean msbFirst; // True = MSB byte order, False = LSB byte order
/** The constructor takes a File name or File object. */
public DBBTree(String name) throws IOException {
this(new File(name));
}
public DBBTree(java.io.File name) throws IOException {
fileName = name;
try {
file = new RandomAccessFile(name,"r")
} catch (SecurityException e) {
throw new IOException("Wrong permissions for "+name+": "+e);
} catch (IOException e) {
throw new IOException("file open failed: " + e);
}
/* Read the metadata. I only read the first three values. */
pageSize = 256; // Big enough to get all of the metadata
byte [] metaData = readRawPage(0); // Page 0 holds metadata
// Read magic number and determine endianness of DB file
msbFirst = true; // Try MSB format first...
int magic = bytesToInt(metaData,0);
if(magic != 0x053162) { // Failed? Try LSB...
msbFirst = false;
magic = bytesToInt(metaData,0);
System.out.println("Openin
} else
System.out.println("Openin
// Read version number and abort if not a DB file.
int version = bytesToInt(metaData,4);
if((magic != 0x053162) || (version != 3)) // BTREE Magic number & version
throw new IOException("Not a DB file (magic: 0x"
+ Integer.toHexString(magic)
+ Integer.toString(version) + ")");
// Set actual page size
pageSize = bytesToInt(metaData,8);
setCacheSize(8); // Set the cache to 8 pages as default
primeCache(); // Pre-read first 8 pages into cache
}
/** Search for a key. A 'key' here is a byte array. Returns the data
* associated with the key or null if the key wasn't found. */
public byte [] search(byte [] key) throws IOException {
return readPage(1).search(key); // Search always begins on page 1
}
/** Read a raw page, return an array of bytes containing that page. */
protected byte [] readRawPage(int pageno) throws IOException {
try{
byte data[] = new byte[pageSize];
file.seek(pageno*pageSize)
file.read(data);
return data;
} catch (IOException e) {
throw new IOException("readRawPage("
}
}
/** The cache is kept in LRU (least-recently-used) order; that is, whenever
* a page is accessed, it gets moved to the front of the list, and pages
* are lost when they fall off the end of the list. Happily, this code is
* simpler than the corresponding C version, thanks to the GC. */
int cachePageNumbers[];
byte cachePages[][];
/** Set the cache size. By default, it's set to 8 pages. */
public void setCacheSize(int size) {
int[] newCachePageNumbers = new int[size];
if(cachePageNumbers != null)
for(int i=0;i<size && i<cachePageNumbers.length;
newCachePageNumbers[i] = cachePageNumbers[i];
else
for(int i=0;i<size;i++)
newCachePageNumbers[i] = 0;
cachePageNumbers = newCachePageNumbers;
byte[][] newCachePages = new byte[size] [];
if(cachePages != null)
for(int i=0;i<size && i<cachePages.length;i++)
newCachePages[i] = cachePages[i];
cachePages = newCachePages;
}
/** Fill the cache with the first pages in the database. This can improve
* perceived performance noticably. In particular, this pulls in page 1,
* which is the top page of the B-Tree, and is searched on every lookup. */
public void primeCache() throws IOException {
int i = 1;
while(cachePageNumbers[cac
readPage(i);
i++;
}
}
/* Because I'm only caching a few pages, it's perfectly reasonable to use a
* simple array and just move elements down in the array to maintain LRU
* order. For a large cache, something more sophisticated would be
* appropriate. */
DBBTreePage readPage(int pageno) throws IOException {
// Try to find page in cache, return it if found
for(int i=0; i< cachePageNumbers.length; i++) {
if (cachePageNumbers[i] == pageno) {
byte page[] = cachePages[i];
if(i>0) {
// Move this page to front of cache
System.arraycopy(cachePage
System.arraycopy(cachePage
cachePageNumbers[0] = pageno;
cachePages[0] = page;
}
// Return found page
return new DBBTreePage(this,page);
}
}
// Page wasn't in cache, read it from disk
byte page[] = readRawPage(pageno);
// Insert new page at front of cache.
System.arraycopy(cachePage
cachePageNumbers,1,cachePa
System.arraycopy(cachePage
cachePageNumbers[0] = pageno;
cachePages[0] = page;
return new DBBTreePage(this,page);
}
/** Convert four bytes from a byte array into an integer, using LSB or MSB
* data order, as appropriate.
* @param barray Array of bytes
* @param offset Offset in barray to read integer
* @return integer formed from indicated bytes */
int bytesToInt(byte barray[],int offset) {
if(msbFirst)
return 16777216 * (((int)barray[offset ])&255) +
65536 * (((int)barray[offset+1])&2
256 * (((int)barray[offset+2])&2
(((int)barray[offset+3])&2
else
return (((int)barray[offset ])&255) +
256 * (((int)barray[offset+1])&2
65536 * (((int)barray[offset+2])&2
16777216 * (((int)barray[offset+3])&2
}
/** Convert two bytes from a byte array into a short.
* @param barray Array of bytes
* @param offset Offset in barray to read short
* @return short formed from indicated bytes */
short bytesToShort(byte barray[],int offset) {
if(msbFirst)
return (short)(256 * (((short)barray[offset ])&255) +
(((short)barray[offset+1])
else
return (short)( (((short)barray[offset ])&255) +
256 * (((short)barray[offset+1])
}
}
Listing Two
// There doesn't seem to be any one way to open a file that
// works on all platforms, so this function tries several...
DBBTree openDatabase(String filename) {
StringBuffer work;
DBBTree db;
// First, try seeing if there's such a thing as a 'current directory'
try {
db = new DBBTree(new File(filename));
System.out.println(" Opened " + filename);
return db;
} catch (IOException e) {
System.out.println(" Couldn't open " + filename);
System.out.println(" IOException: " + e);
}
// Manipulate the document base URL to build a more complete pathname.
try {
filename = new java.net.URL(getDocumentBa
} catch (java.net.MalformedURLExce
// If you can't build a more complete pathname, you're sunk
return null;
}
// Try filename part of file: URL
try {
db = new DBBTree(filename);
System.out.println(" Opened " + filename);
return db;
} catch (IOException e) {
System.out.println(" Couldn't open " + filename);
System.out.println(" IOException: " + e);
}
// Try changing the / to the native separator
// Note: some platforms lie about the native separator... <sigh>
work = new StringBuffer(filename);
try {
char separator = File.separatorChar;
System.out.println(" file.separator = " + separator);
for(int i=0;i<work.length();i++) // Change / to native separator
if(work.charAt(i) == '/') work.setCharAt(i,separator
db = new DBBTree(work.toString());
System.out.println(" Opened " + work);
return db;
} catch (IOException e) {
System.out.println(" Couldn't open " + work);
System.out.println(" IOException: " + e);
}
// Strip first character ...
filename = filename.substring(1);
try {
db = new DBBTree(filename);
System.out.println(" Opened " + filename);
return db;
} catch (IOException e) {
System.out.println(" Couldn't open " + filename);
System.out.println(" IOException: " + e);
}
// Everything failed... <sigh>
return null;
}