zhshqzyc
asked on
find expected substring among strings
Hi I want to find elements similarity among strings. The similarlity is defined at
here
The link will give you the complete story. I post there still no result.
Thanks for nice help.
here
The link will give you the complete story. I post there still no result.
Thanks for nice help.
Won't indexOf work for you.
ASKER
Did you read the link? No contains something.
How about this? There's a better explanation on the MS Forums site where you originally posted a question, this just adds a "ContainsSimilar" extension method which will return true if string 1 contains string 2, where at least n percent of the characters match.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace ConsoleApplication1
{
class Program
{
static void Main(string[] args)
{
string str1 = @"TTTATGTCTATAATCCTTACCAAAAGTTACCTT"; //GGAATAAGAAGAAGTCAGTAAAAAGAAGGCTGTTGTTCCGTGAAATACTGTCTTTATGCCTCAGATTTGGAGTGCTCAGAGCCTCTGCAGCAAAGATTTGGCATGTGTCCTAGGCCTGCTCAGAGCAGCAAATCCCACCCTCTTGGAGAATGAGACTCATAGAGGGACAGCTCCCTCCTCAGAGGCTTCTCTAATGGGACTCCAAAGAGCAAACACTCAGCCCCATGAGGACTGGCCAGGCCAAGTGGTGTGTGGGAACAGGGAGCAGCGGTTTCCAAGAGGATACAGTAGGTGTCTCTATCTCTTCCTATAGTGCAGGCTCCAAAGCTTATCAGTGCTCCATGGCCTGCATTGAGCAAGGTCCTGGAATAGGGTTCAGGGCAGTGATAAATTCCAATGCTAATGTGGGATGAACCCCTGGGGGCTTCACCTCCTTCTTTTAGGATGTCTGTGGAAATTGAGAAAGACCCTCTCTCCATGGTATTGAGCCTACACATGGTTGTGAAACAAGAATTGCTTTGACTGAATCTCACTTTTTTCTAGGTGGACTCACAGAGTCCCTTACATTCATTATTAGTTCATTCGTTCATCCACTCATTATCTTACTCTGTATCAATCCCAGGGCTGAGAACTTACTTTCCCCATTGTCCTCATTTATCTAACAAGCCATTCTCCTATTGACCTCATTTTCTTAGTCTGATGAACACTGGCTGCTAGAGAAAATCTTACCAATAAGCAGATTGACTTCAGTGAATATTCATAGTCTGCAAGTTCAACTGGCTCTTTGAATGCCTGGTCTTTCCTTTGACTACCTTAAACCTTCTCTACTCTCTTCAAACCACTTTTCTTTCACTCTCAGCATATATTTTCACTCTCCTATTTCCAAATAAAATAGAATTCATCAGACCGTTTCCCCTGTTTTTTCCCTCCAGCCCATCTAAAATTTACCTGCAACTTAAAAGTGATC";
string str2 = @"TGTCTATCCAAAAGTTACCTTGAATCCTTAGAA"; //TAAGAAGAAGTCATTTAGTAAAAAGAAGGCTGTTGTTCCGTGAAATACTGTCTTTATGCCTCAGATTTGGAGTGCTCAGAGCCTCTGCAGCAAAGATTTGGCATGTGTCCTAGGCCTGCTCAGAGCAGCAAATCCCACCCTCTTGGAGAATGAGACTCATAGAGGGACAGCTCCCTCCTCAGAGGCTTCTCTAATGGGACTCCAAAGAGCAAACACTCAGCCCCATGAGGACTGGCCAGGCCAAGTGGTGTGTGGGAACAGGGAGCAGCGGTTTCCAAGAGGGTGTCTCTATCTCTTCCTATAGTGCAAAGAATTGCAATAGGGTTCAGGGCAGTGATAAATTCCAATGCTAATGTGGGATGAACCCCTGGGGGCTTCACCTCCTTCTTTTAGGATGTCTGTGGAAATTGAGAAAGACCCTCTCTCCAATACAGTAGTGGTATTGAGCCTACACATGGTTGTGAAACTTTGACTGAATCTCACTTTTTTCTAGGTGGACTCACAGAGTCCCTTACATTCATTATTAGTTCATTCGTTCATCCACTCATTATCTTACTCTGTATCAATCCCAGGGCTGAGAACTTACTTTCCCCATTGTCCTCATTTATCTAACAAGCCATTCTCCTATTGACCTCATTTTCTTAGTCTGATGAACACTGGCTGCTAGAGAAAATCTTACCAATAAGCAGATTGACTTCAGTGAATATTCATAGTCTGCAAGTTCAACTGGCTCTTTGAATGCCTGGTCTTTCCTTTGACTACCTTAAACCTTCTCTACTCTCTTCAAACCACACTCTCAGCATATATTTTCACTCTCCTATTTCCAAATAAAATAGAATTCATCAGACCGTTTCCCCTGTTTTTTCCCTCCAGCCCATCTAACTGCAACTTAAAAGTGATCCTTTTCTTTGGCTCCAAAGCTTATCAGTGCTCCATGGCCTGCATTGAGCAAGGTCCTGGAATTTAC";
string test = @"AGTAAAAAGAAGGCTGTTGT-------------"; //-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------";
string longestCommonSubstring = FindLongestCommonSubstring(test, str1, str2);
if (longestCommonSubstring != null)
Console.WriteLine("Found: {0}", longestCommonSubstring);
else
Console.WriteLine("Did not find ANY common substrings (should never happen)");
Console.ReadKey();
}
static string FindLongestCommonSubstring(params string[] stringsToCheck)
{
string longestPattern = String.Empty; ;
// Since we're looking for the longest possible common substring,
// start with the entire string as a pattern, then decrement the
// pattern by one until we find a match in all the strings
for (int patternLen = stringsToCheck[0].Length; patternLen > 0; patternLen--)
{
for (int patternStartPos = 0; patternStartPos + patternLen <= stringsToCheck[0].Length; patternStartPos++)
{
string pattern = stringsToCheck[0].Substring(patternStartPos, patternLen);
// If this pattern occurs in all the strings, it is
// the longest common substring/sub-pattern
if (stringsToCheck.All(x => x.ContainsSimilar(pattern, .8))) // If all the strings contain this pattern where at least 80% of the characters match
return pattern;
}
}
return null;
}
}
public static class Extensions
{
public static bool ContainsSimilar(this string source, string pattern, double similarity)
{
for (int matchStartPos = 0; matchStartPos <= source.Length - pattern.Length; matchStartPos++)
{
// try comparing the pattern to the source with both their starts lined up initially
// each iteration of the loop effectively "moves" the pattern one character further
// down source for another comparison
int matchingChars = 0;
int charsRequiredForMatch = (int)Math.Ceiling((double)pattern.Length * similarity);
for (int i = 0; i < pattern.Length; i++)
{
if (source[matchStartPos + i] == pattern[i])
{
matchingChars++;
if ((double)matchingChars / (double)pattern.Length >= similarity)
return true;
}
int charsRemaining = source.Length - (i + 1);
if (charsRemaining + matchingChars <= charsRequiredForMatch) // Give up if it's not possible to reach the required number of characters for a match
return false;
}
}
return false;
}
}
}
Do the patterns that match 80% or better need to occur in the same position in each string, or anywhere?
e.g.
Given:
or does it need to be
e.g.
Given:
ABCDEFWWW
WWWGHIJKL
is WWW a match?or does it need to be
ABCDEFWWW
GHIJKLWWW
ASKER
The second one is a match. Your code is awesome. I may reduce the pattern length to 50 or 100 by experience, otherwise it will take too long.
Okay, if WWW only matches in the case of (because WWW occurs at index 6 in both strings)
Then the code in the ContainsSimilar extension method could be sped up significantly.
ABCDEFWWWMNOPQR
GHIJKLWWWSTUVWX
But not in the case ofABCDEFMNOPQRWWW
WWWGHIJKLSTUVWX
Then the code in the ContainsSimilar extension method could be sped up significantly.
public static bool ContainsSimilar(this string source, string pattern, double similarity, int index)
{
int matchingChars = 0;
int charsRequiredForMatch = (int)Math.Ceiling(pattern.Length * similarity);
for (int i = 0; i < pattern.Length; i++)
{
if (source[index + i] == pattern[i])
{
matchingChars++;
if ((double)matchingChars / (double)pattern.Length >= similarity)
return true;
}
int charsRemaining = source.Length - (i + 1);
if (charsRemaining + matchingChars <= charsRequiredForMatch)
return false;
}
return false;
}
Better give ya the whole thing...
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Diagnostics;
namespace ConsoleApplication1
{
class Program
{
static void Main(string[] args)
{
string str1 = @"TTTATGTCTATAATCCTTACCAAAAGTTACCTTGGAATAAGAAGAAGTCAGTAAAAAGAAGGCTGTTGTTCCGTGAAATACTGTCTTTATGCCTCAGATTTGGAGTGCTCAGAGCCTCTGCAGCAAAGATTTGGCATGTGTCCTAGGCCTGCTCAGAGCAGCAAATCCCACCCTCTTGGAGAATGAGACTCATAGAGGGACAGCTCCCTCCTCAGAGGCTTCTCTAATGGGACTCCAAAGAGCAAACACTCAGCCCCATGAGGACTGGCCAGGCCAAGTGGTGTGTGGGAACAGGGAGCAGCGGTTTCCAAGAGGATACAGTAGGTGTCTCTATCTCTTCCTATAGTGCAGGCTCCAAAGCTTATCAGTGCTCCATGGCCTGCATTGAGCAAGGTCCTGGAATAGGGTTCAGGGCAGTGATAAATTCCAATGCTAATGTGGGATGAACCCCTGGGGGCTTCACCTCCTTCTTTTAGGATGTCTGTGGAAATTGAGAAAGACCCTCTCTCCATGGTATTGAGCCTACACATGGTTGTGAAACAAGAATTGCTTTGACTGAATCTCACTTTTTTCTAGGTGGACTCACAGAGTCCCTTACATTCATTATTAGTTCATTCGTTCATCCACTCATTATCTTACTCTGTATCAATCCCAGGGCTGAGAACTTACTTTCCCCATTGTCCTCATTTATCTAACAAGCCATTCTCCTATTGACCTCATTTTCTTAGTCTGATGAACACTGGCTGCTAGAGAAAATCTTACCAATAAGCAGATTGACTTCAGTGAATATTCATAGTCTGCAAGTTCAACTGGCTCTTTGAATGCCTGGTCTTTCCTTTGACTACCTTAAACCTTCTCTACTCTCTTCAAACCACTTTTCTTTCACTCTCAGCATATATTTTCACTCTCCTATTTCCAAATAAAATAGAATTCATCAGACCGTTTCCCCTGTTTTTTCCCTCCAGCCCATCTAAAATTTACCTGCAACTTAAAAGTGATC";
string str2 = @"TGTCTATCCAAAAGTTACCTTGAATCCTTAGAATAAGAAGAAGTCATTTAGTAAAAAGAAGGCTGTTGTTCCGTGAAATACTGTCTTTATGCCTCAGATTTGGAGTGCTCAGAGCCTCTGCAGCAAAGATTTGGCATGTGTCCTAGGCCTGCTCAGAGCAGCAAATCCCACCCTCTTGGAGAATGAGACTCATAGAGGGACAGCTCCCTCCTCAGAGGCTTCTCTAATGGGACTCCAAAGAGCAAACACTCAGCCCCATGAGGACTGGCCAGGCCAAGTGGTGTGTGGGAACAGGGAGCAGCGGTTTCCAAGAGGGTGTCTCTATCTCTTCCTATAGTGCAAAGAATTGCAATAGGGTTCAGGGCAGTGATAAATTCCAATGCTAATGTGGGATGAACCCCTGGGGGCTTCACCTCCTTCTTTTAGGATGTCTGTGGAAATTGAGAAAGACCCTCTCTCCAATACAGTAGTGGTATTGAGCCTACACATGGTTGTGAAACTTTGACTGAATCTCACTTTTTTCTAGGTGGACTCACAGAGTCCCTTACATTCATTATTAGTTCATTCGTTCATCCACTCATTATCTTACTCTGTATCAATCCCAGGGCTGAGAACTTACTTTCCCCATTGTCCTCATTTATCTAACAAGCCATTCTCCTATTGACCTCATTTTCTTAGTCTGATGAACACTGGCTGCTAGAGAAAATCTTACCAATAAGCAGATTGACTTCAGTGAATATTCATAGTCTGCAAGTTCAACTGGCTCTTTGAATGCCTGGTCTTTCCTTTGACTACCTTAAACCTTCTCTACTCTCTTCAAACCACACTCTCAGCATATATTTTCACTCTCCTATTTCCAAATAAAATAGAATTCATCAGACCGTTTCCCCTGTTTTTTCCCTCCAGCCCATCTAACTGCAACTTAAAAGTGATCCTTTTCTTTGGCTCCAAAGCTTATCAGTGCTCCATGGCCTGCATTGAGCAAGGTCCTGGAATTTAC";
string test = @"AGTAAAAAGAAGGCTGTTGT--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------";
Stopwatch stopwatch = new Stopwatch();
stopwatch.Start();
string longestCommonSubstring = FindLongestCommonSubstring(str1, str2);
stopwatch.Stop();
if (longestCommonSubstring != null)
{
Console.WriteLine("Elapsed time: {0} seconds.", stopwatch.Elapsed.TotalSeconds);
Console.WriteLine("Found: {0}", longestCommonSubstring);
}
else
Console.WriteLine("Did not find ANY common substrings (should never happen)");
Console.ReadKey();
}
static string FindLongestCommonSubstring(params string[] stringsToCheck)
{
string longestPattern = String.Empty; ;
// Since we're looking for the longest possible common substring,
// start with the entire string as a pattern, then decrement the
// pattern by one until we find a match in all the strings
for (int patternLen = stringsToCheck[0].Length; patternLen > 0; patternLen--)
{
for (int patternStartPos = 0; patternStartPos + patternLen <= stringsToCheck[0].Length; patternStartPos++)
{
string pattern = stringsToCheck[0].Substring(patternStartPos, patternLen);
// If this pattern occurs in all the strings, it is
// the longest common substring/sub-pattern
if (stringsToCheck.All(x => x.ContainsSimilar(pattern, .8, patternStartPos))) // If all the strings contain this pattern where at least 80% of the characters match
return pattern;
}
}
return null;
}
}
public static class Extensions
{
public static bool ContainsSimilar(this string source, string pattern, double similarity, int index)
{
int matchingChars = 0;
int charsRequiredForMatch = (int)Math.Ceiling(pattern.Length * similarity);
for (int i = 0; i < pattern.Length; i++)
{
if (source[index + i] == pattern[i])
{
matchingChars++;
if ((double)matchingChars / (double)pattern.Length >= similarity)
return true;
}
int charsRemaining = source.Length - (i + 1);
if (charsRemaining + matchingChars <= charsRequiredForMatch)
return false;
}
return false;
}
/* Old Stuff
public static bool ContainsSimilar(this string source, string pattern, double similarity)
{
for (int matchStartPos = 0; matchStartPos <= source.Length - pattern.Length; matchStartPos++)
{
// try comparing the pattern to the source with both their starts lined up initially
// each iteration of the loop effectively "moves" the pattern one character further
// down source for another comparison
int matchingChars = 0;
int charsRequiredForMatch = (int)Math.Ceiling((double)pattern.Length * similarity);
for (int i = 0; i < pattern.Length; i++)
{
if (source[matchStartPos + i] == pattern[i])
{
matchingChars++;
if ((double)matchingChars / (double)pattern.Length >= similarity)
return true;
}
int charsRemaining = source.Length - (i + 1);
if (charsRemaining + matchingChars <= charsRequiredForMatch) // Give up if it's not possible to reach the required number of characters for a match
return false;
}
}
return false;
}
*/
}
}
Please try the following approach:
Here is the resulting output: (position is zero-based)
Elapsed Time: 0.0014085 seconds
Ratio: 43.5%
Longest Position: 49
Longest Length: 266
Longest Value: AGTAAAAAGAAGGCTGTTGTTCCGTG AAATACTGTC TTTATGCCTC AGATTTGGAG TGCTCAGAGC CTCTGCAGCA AAGATTTGGC ATGTGTCCTA GGCCTGCTCA GAGCAGCAAA TCCCACCCTC TTGGAGAATG AGACTCATAG AGGGACAGCT CCCTCCTCAG AGGCTTCTCT AATGGGACTC CAAAGAGCAA ACACTCAGCC CCATGAGGAC TGGCCAGGCC AAGTGGTGTG TGGGAACAGG GAGCAGCGGT TTCCAAGAGG
public static void Main()
{
string str1 = "TTTATGTCTATAATCCTTACCAAAAGTTACCTTGGAATAAGAAGAAGTCAGTAAAAAGAAGGCTGTTGTTCCGTGAAATACTGTCTTTATGCCTCAGATTTGGAGTGCTCAGAGCCTCTGCAGCAAAGATTTGGCATGTGTCCTAGGCCTGCTCAGAGCAGCAAATCCCACCCTCTTGGAGAATGAGACTCATAGAGGGACAGCTCCCTCCTCAGAGGCTTCTCTAATGGGACTCCAAAGAGCAAACACTCAGCCCCATGAGGACTGGCCAGGCCAAGTGGTGTGTGGGAACAGGGAGCAGCGGTTTCCAAGAGGATACAGTAGGTGTCTCTATCTCTTCCTATAGTGCAGGCTCCAAAGCTTATCAGTGCTCCATGGCCTGCATTGAGCAAGGTCCTGGAATAGGGTTCAGGGCAGTGATAAATTCCAATGCTAATGTGGGATGAACCCCTGGGGGCTTCACCTCCTTCTTTTAGGATGTCTGTGGAAATTGAGAAAGACCCTCTCTCCATGGTATTGAGCCTACACATGGTTGTGAAACAAGAATTGCTTTGACTGAATCTCACTTTTTTCTAGGTGGACTCACAGAGTCCCTTACATTCATTATTAGTTCATTCGTTCATCCACTCATTATCTTACTCTGTATCAATCCCAGGGCTGAGAACTTACTTTCCCCATTGTCCTCATTTATCTAACAAGCCATTCTCCTATTGACCTCATTTTCTTAGTCTGATGAACACTGGCTGCTAGAGAAAATCTTACCAATAAGCAGATTGACTTCAGTGAATATTCATAGTCTGCAAGTTCAACTGGCTCTTTGAATGCCTGGTCTTTCCTTTGACTACCTTAAACCTTCTCTACTCTCTTCAAACCACTTTTCTTTCACTCTCAGCATATATTTTCACTCTCCTATTTCCAAATAAAATAGAATTCATCAGACCGTTTCCCCTGTTTTTTCCCTCCAGCCCATCTAAAATTTACCTGCAACTTAAAAGTGATC";
string str2 = "TGTCTATCCAAAAGTTACCTTGAATCCTTAGAATAAGAAGAAGTCATTTAGTAAAAAGAAGGCTGTTGTTCCGTGAAATACTGTCTTTATGCCTCAGATTTGGAGTGCTCAGAGCCTCTGCAGCAAAGATTTGGCATGTGTCCTAGGCCTGCTCAGAGCAGCAAATCCCACCCTCTTGGAGAATGAGACTCATAGAGGGACAGCTCCCTCCTCAGAGGCTTCTCTAATGGGACTCCAAAGAGCAAACACTCAGCCCCATGAGGACTGGCCAGGCCAAGTGGTGTGTGGGAACAGGGAGCAGCGGTTTCCAAGAGGGTGTCTCTATCTCTTCCTATAGTGCAAAGAATTGCAATAGGGTTCAGGGCAGTGATAAATTCCAATGCTAATGTGGGATGAACCCCTGGGGGCTTCACCTCCTTCTTTTAGGATGTCTGTGGAAATTGAGAAAGACCCTCTCTCCAATACAGTAGTGGTATTGAGCCTACACATGGTTGTGAAACTTTGACTGAATCTCACTTTTTTCTAGGTGGACTCACAGAGTCCCTTACATTCATTATTAGTTCATTCGTTCATCCACTCATTATCTTACTCTGTATCAATCCCAGGGCTGAGAACTTACTTTCCCCATTGTCCTCATTTATCTAACAAGCCATTCTCCTATTGACCTCATTTTCTTAGTCTGATGAACACTGGCTGCTAGAGAAAATCTTACCAATAAGCAGATTGACTTCAGTGAATATTCATAGTCTGCAAGTTCAACTGGCTCTTTGAATGCCTGGTCTTTCCTTTGACTACCTTAAACCTTCTCTACTCTCTTCAAACCACACTCTCAGCATATATTTTCACTCTCCTATTTCCAAATAAAATAGAATTCATCAGACCGTTTCCCCTGTTTTTTCCCTCCAGCCCATCTAACTGCAACTTAAAAGTGATCCTTTTCTTTGGCTCCAAAGCTTATCAGTGCTCCATGGCCTGCATTGAGCAAGGTCCTGGAATTTAC";
var stopwatch = new Stopwatch();
stopwatch.Start();
var similarity = CalculateSimilarity(str1, str2);
stopwatch.Stop();
Console.WriteLine("Elapsed Time: {0} seconds\nRatio: {1}%\nLongest Position: {2}\nLongest Length: {3}\nLongest Value: {4}",
stopwatch.Elapsed.TotalSeconds,
similarity.Ratio * 100,
similarity.Longest.Index,
similarity.Longest.Length,
similarity.Longest.Value);
}
private static Similarity CalculateSimilarity(string str1, string str2)
{
var sb = new StringBuilder(str1);
int matches = sb.Length;
for (var i = 0; i < sb.Length; i++)
{
if (sb[i] != str2[i])
{
sb[i] = '-';
matches--;
}
}
var longest = (from match in Regex.Matches(sb.ToString(), "[ACGT]+").Cast<Match>()
orderby match.Value.Length descending
select match).FirstOrDefault();
return new Similarity() {Longest = longest, Ratio = (double)matches / (double)sb.Length};
}
public class Similarity
{
public Match Longest {get; set;}
public double Ratio;
}
Here is the resulting output: (position is zero-based)
Elapsed Time: 0.0014085 seconds
Ratio: 43.5%
Longest Position: 49
Longest Length: 266
Longest Value: AGTAAAAAGAAGGCTGTTGTTCCGTG
Here is another version that takes any number of sequences:
public static void Main()
{
string str1 = "TTTATGTCTATAATCCTTACCAAAAGTTACCTTGGAATAAGAAGAAGTCAGTAAAAAGAAGGCTGTTGTTCCGTGAAATACTGTCTTTATGCCTCAGATTTGGAGTGCTCAGAGCCTCTGCAGCAAAGATTTGGCATGTGTCCTAGGCCTGCTCAGAGCAGCAAATCCCACCCTCTTGGAGAATGAGACTCATAGAGGGACAGCTCCCTCCTCAGAGGCTTCTCTAATGGGACTCCAAAGAGCAAACACTCAGCCCCATGAGGACTGGCCAGGCCAAGTGGTGTGTGGGAACAGGGAGCAGCGGTTTCCAAGAGGATACAGTAGGTGTCTCTATCTCTTCCTATAGTGCAGGCTCCAAAGCTTATCAGTGCTCCATGGCCTGCATTGAGCAAGGTCCTGGAATAGGGTTCAGGGCAGTGATAAATTCCAATGCTAATGTGGGATGAACCCCTGGGGGCTTCACCTCCTTCTTTTAGGATGTCTGTGGAAATTGAGAAAGACCCTCTCTCCATGGTATTGAGCCTACACATGGTTGTGAAACAAGAATTGCTTTGACTGAATCTCACTTTTTTCTAGGTGGACTCACAGAGTCCCTTACATTCATTATTAGTTCATTCGTTCATCCACTCATTATCTTACTCTGTATCAATCCCAGGGCTGAGAACTTACTTTCCCCATTGTCCTCATTTATCTAACAAGCCATTCTCCTATTGACCTCATTTTCTTAGTCTGATGAACACTGGCTGCTAGAGAAAATCTTACCAATAAGCAGATTGACTTCAGTGAATATTCATAGTCTGCAAGTTCAACTGGCTCTTTGAATGCCTGGTCTTTCCTTTGACTACCTTAAACCTTCTCTACTCTCTTCAAACCACTTTTCTTTCACTCTCAGCATATATTTTCACTCTCCTATTTCCAAATAAAATAGAATTCATCAGACCGTTTCCCCTGTTTTTTCCCTCCAGCCCATCTAAAATTTACCTGCAACTTAAAAGTGATC";
string str2 = "TGTCTATCCAAAAGTTACCTTGAATCCTTAGAATAAGAAGAAGTCATTTAGTAAAAAGAAGGCTGTTGTTCCGTGAAATACTGTCTTTATGCCTCAGATTTGGAGTGCTCAGAGCCTCTGCAGCAAAGATTTGGCATGTGTCCTAGGCCTGCTCAGAGCAGCAAATCCCACCCTCTTGGAGAATGAGACTCATAGAGGGACAGCTCCCTCCTCAGAGGCTTCTCTAATGGGACTCCAAAGAGCAAACACTCAGCCCCATGAGGACTGGCCAGGCCAAGTGGTGTGTGGGAACAGGGAGCAGCGGTTTCCAAGAGGGTGTCTCTATCTCTTCCTATAGTGCAAAGAATTGCAATAGGGTTCAGGGCAGTGATAAATTCCAATGCTAATGTGGGATGAACCCCTGGGGGCTTCACCTCCTTCTTTTAGGATGTCTGTGGAAATTGAGAAAGACCCTCTCTCCAATACAGTAGTGGTATTGAGCCTACACATGGTTGTGAAACTTTGACTGAATCTCACTTTTTTCTAGGTGGACTCACAGAGTCCCTTACATTCATTATTAGTTCATTCGTTCATCCACTCATTATCTTACTCTGTATCAATCCCAGGGCTGAGAACTTACTTTCCCCATTGTCCTCATTTATCTAACAAGCCATTCTCCTATTGACCTCATTTTCTTAGTCTGATGAACACTGGCTGCTAGAGAAAATCTTACCAATAAGCAGATTGACTTCAGTGAATATTCATAGTCTGCAAGTTCAACTGGCTCTTTGAATGCCTGGTCTTTCCTTTGACTACCTTAAACCTTCTCTACTCTCTTCAAACCACACTCTCAGCATATATTTTCACTCTCCTATTTCCAAATAAAATAGAATTCATCAGACCGTTTCCCCTGTTTTTTCCCTCCAGCCCATCTAACTGCAACTTAAAAGTGATCCTTTTCTTTGGCTCCAAAGCTTATCAGTGCTCCATGGCCTGCATTGAGCAAGGTCCTGGAATTTAC";
string test = "TGTCTATCCAAAAGTTACCTTGAATCCTTAGAATAAGAAGAAGTCATTTAGTAAAAAGAAGGCTGTTGTTCCGTGAAATACTGTCTTTATGCCTCAGATTTGGAGTGCTCAGAGCCTCTGCAGCAAAGATTTGGCATGTGTCCTAGGCCTGCTCAGAGCAGCAAATCCCACCCTCTTGGAGAATGAGACTCATAGAGGGACAGCTCCCTCCTCAGAGGCTTCTCTAATGGGACTCCAAAGAGCAAACACTCAGCCCCATGAGGACTGGCCAGGCCAAGTGGTGTGTGGGAACAGGGAGCAGCGGTTTCCAAGAGGGTGTCTCTATCTCTTCCTATAGTGCAAAGAATTGCAATAGGGTTCAGGGCAGTGATAAATTCCAATGCTAATGTGGGATGAACCCCTGGGGGCTTCACCTCCTTCTTTTAGGATGTCTGTGGAAATTGAGAAAGACCCTCTCTCCAATACAGTAGTGGTATTGAGCCTACACATGGTTGTGAAACTTTGACTGAATCTCACTTTTTTCTAGGTGGACTCACAGAGTCCCTTACATTCATTATTAGTTCATTCGTTCATCCACTCATTATCTTACTCTGTATCAATCCCAGGGCTGAGAACTTACTTTCCCCATTGTCCTCATTTATCTAACAAGCCATTCTCCTATTGACCTCATTTTCTTAGTCTGATGAACACTGGCTGCTAGAGAAAATCTTACCAATAAGCAGATTGACTTCAGTGAATATTCATAGTCTGCAAGTTCAACTGGCTCTTTGAATGCCTGGTCTTTCCTTTGACTACCTTAAACCTTCTCTACTCTCTTCAAACCACACTCTCAGCATATATTTTCACTCTCCTATTTCCAAATAAAATAGAATTCATCAGACCGTTTCCCCTGTTTTTTCCCTCCAGCCCATCTAACTGCAACTTAAAAGTGATCCTTTTCTTTGGCTCCAAAGCTTATCAGTGCTCCATGGCCTGCATTGAGCAAGGTCCTGGAATTTAC";
var stopwatch = new Stopwatch();
stopwatch.Start();
var similarity = CalculateSimilarity(str1, str2, test);
stopwatch.Stop();
Console.WriteLine("Elapsed Time: {0} seconds\nRatio: {1}%\nLongest Position: {2}\nLongest Length: {3}\nLongest Value: {4}",
stopwatch.Elapsed.TotalSeconds,
similarity.Ratio * 100,
similarity.Longest.Index,
similarity.Longest.Length,
similarity.Longest.Value);
}
private static Similarity CalculateSimilarity(params string[] sequences)
{
/*
if (sequences.Length < 2)
{
throw new Exception("At least 2 DNA sequences are required.");
}
if (sequences.Any(seq => seq.Length != sequences[0].Length))
{
throw new Exception("All DNA sequences must be of the same size.");
}
*/
var sb = new StringBuilder(sequences[0]);
int matches = sb.Length;
for (var i = 0; i < sb.Length; i++)
{
char ch = sb[i];
if (sequences.Any(seq => seq[i] != ch))
{
sb[i] = '-';
matches--;
}
}
var longest = (from match in Regex.Matches(sb.ToString(), "[ACGT]+").Cast<Match>()
orderby match.Value.Length descending
select match).FirstOrDefault();
return new Similarity() {Longest = longest, Ratio = (double)matches / (double)sb.Length};
}
public class Similarity
{
public Match Longest {get; set;}
public double Ratio;
}
ASKER
Hi wdosanjos:
Your code is awesome as well. Can nyou consider a given ratio such 0.8 then search and match?
The definition of the ratio such as 0.8 is at MS C# Forum
Your code is awesome as well. Can nyou consider a given ratio such 0.8 then search and match?
The definition of the ratio such as 0.8 is at MS C# Forum
@wdosanjos
Can you comment that code, and give a little bit of an explanation? I'm having a hard time trying to follow it...
Can you comment that code, and give a little bit of an explanation? I'm having a hard time trying to follow it...
I was re-reading all posts carefully, and I believe the solution I posted does not address your problem. It assumes that the sequences are aligned, so only letters on the same string position are identified as matches.
ASKER
Hi tgerbert,
I just test your code using a simple string array
I just test your code using a simple string array
var strings = new string[] { "123456ABCDEyyy", "123ABCDE456BBB", "125891ABCDEljk", "1.2.-3-SBCDE#" };
The result is wrong, could you please double check it? Thanks.
ASKER
If we take a sample
{ "456ABCDEyyy", "891ABCDEljk", "1.3-SBCDE#" };
Then step into the code, I found that in the most of time, item 1 and item 2 were comparing. Item 3 seems to be forgotten.
That's to be expected - sort of. If a particular pattern isn't found with 80% similarity in string 1 and string 2, then there's no sense checking string 3 at all (since we only want patterns occurring in all three). The Framework's "All" extension method would apply this logic.
I probably have a bug somewhere else, I'll take a closer look in a few minutes.
I probably have a bug somewhere else, I'll take a closer look in a few minutes.
The code example I posted assumes that all the strings in the array are the same length. Also, it basically pulls substrings out of the first string and then checks if they exist in all the other strings. And the most recent example I posted only matches if the substrings all start in the same position (based on my comments in http:#a35244850 and your response in http:#a35245071).
So, if you change your sample strings to "456ABCDEyyy", "891ABCDEljk" and "1.3-BCDE#qr", then:
If you change the order of the strings to "1.3-BCDE#qr", "456ABCDEyyy" and "891ABCDEljk" then the matching pattern becomes "-BCDE"
So, if you change your sample strings to "456ABCDEyyy", "891ABCDEljk" and "1.3-BCDE#qr", then:
456ABCDEyyy
|||||
891ABCDEljk
||||
1.3-BCDE#qr
The pattern "ABCDE" is a 100% match on the first string and the second string, and an 80% match with the third string.If you change the order of the strings to "1.3-BCDE#qr", "456ABCDEyyy" and "891ABCDEljk" then the matching pattern becomes "-BCDE"
1.3-BCDE#qr
||||
456ABCDEyyy
||||
891ABCDEljk"
"-BCDE" is a 100% match for the first string, and an 80% match for the second and third strings. In reality there will always be equal alternatives, I'm not sure how, or even if, it's possible decide between them; e.g. "BCDE#", "BCDEy", "BCDEl", and "ABCDE" all meet the criteria.
If you want to use the strings "-BCDE1.3#qr", "456ABCDEyyy" and "891ljkABCDE" to find the pattern "-BCDE", then the code in http:#a35244687 will work.
ASKER
In my case all the strings in the array are the same length. The hardest part is the substrings may have different start positions. I tried to enumerate all substrings then compare each other but out of memory.
ASKER CERTIFIED SOLUTION
membership
Create a free account to see this answer
Signing up is free and takes 30 seconds. No credit card required.
ASKER
Let me try the real world long strings.
Using the two long strings from above,
I get this output:
This could also probably be sped up by running 1 thread for each processing core in the system, and split up the workload.
string str1 = @"TTTATGTCTATAATCCTTACCAAAAGTTACCTTGGAATAAGAAGAAGTCAGTAAAAAGAAGGCTGTTGTTCCGTGAAATACTGTCTTTATGCCTCAGATTTGGAGTGCTCAGAGCCTCTGCAGCAAAGATTTGGCATGTGTCCTAGGCCTGCTCAGAGCAGCAAATCCCACCCTCTTGGAGAATGAGACTCATAGAGGGACAGCTCCCTCCTCAGAGGCTTCTCTAATGGGACTCCAAAGAGCAAACACTCAGCCCCATGAGGACTGGCCAGGCCAAGTGGTGTGTGGGAACAGGGAGCAGCGGTTTCCAAGAGGATACAGTAGGTGTCTCTATCTCTTCCTATAGTGCAGGCTCCAAAGCTTATCAGTGCTCCATGGCCTGCATTGAGCAAGGTCCTGGAATAGGGTTCAGGGCAGTGATAAATTCCAATGCTAATGTGGGATGAACCCCTGGGGGCTTCACCTCCTTCTTTTAGGATGTCTGTGGAAATTGAGAAAGACCCTCTCTCCATGGTATTGAGCCTACACATGGTTGTGAAACAAGAATTGCTTTGACTGAATCTCACTTTTTTCTAGGTGGACTCACAGAGTCCCTTACATTCATTATTAGTTCATTCGTTCATCCACTCATTATCTTACTCTGTATCAATCCCAGGGCTGAGAACTTACTTTCCCCATTGTCCTCATTTATCTAACAAGCCATTCTCCTATTGACCTCATTTTCTTAGTCTGATGAACACTGGCTGCTAGAGAAAATCTTACCAATAAGCAGATTGACTTCAGTGAATATTCATAGTCTGCAAGTTCAACTGGCTCTTTGAATGCCTGGTCTTTCCTTTGACTACCTTAAACCTTCTCTACTCTCTTCAAACCACTTTTCTTTCACTCTCAGCATATATTTTCACTCTCCTATTTCCAAATAAAATAGAATTCATCAGACCGTTTCCCCTGTTTTTTCCCTCCAGCCCATCTAAAATTTACCTGCAACTTAAAAGTGATC";
string str2 = @"TGTCTATCCAAAAGTTACCTTGAATCCTTAGAATAAGAAGAAGTCATTTAGTAAAAAGAAGGCTGTTGTTCCGTGAAATACTGTCTTTATGCCTCAGATTTGGAGTGCTCAGAGCCTCTGCAGCAAAGATTTGGCATGTGTCCTAGGCCTGCTCAGAGCAGCAAATCCCACCCTCTTGGAGAATGAGACTCATAGAGGGACAGCTCCCTCCTCAGAGGCTTCTCTAATGGGACTCCAAAGAGCAAACACTCAGCCCCATGAGGACTGGCCAGGCCAAGTGGTGTGTGGGAACAGGGAGCAGCGGTTTCCAAGAGGGTGTCTCTATCTCTTCCTATAGTGCAAAGAATTGCAATAGGGTTCAGGGCAGTGATAAATTCCAATGCTAATGTGGGATGAACCCCTGGGGGCTTCACCTCCTTCTTTTAGGATGTCTGTGGAAATTGAGAAAGACCCTCTCTCCAATACAGTAGTGGTATTGAGCCTACACATGGTTGTGAAACTTTGACTGAATCTCACTTTTTTCTAGGTGGACTCACAGAGTCCCTTACATTCATTATTAGTTCATTCGTTCATCCACTCATTATCTTACTCTGTATCAATCCCAGGGCTGAGAACTTACTTTCCCCATTGTCCTCATTTATCTAACAAGCCATTCTCCTATTGACCTCATTTTCTTAGTCTGATGAACACTGGCTGCTAGAGAAAATCTTACCAATAAGCAGATTGACTTCAGTGAATATTCATAGTCTGCAAGTTCAACTGGCTCTTTGAATGCCTGGTCTTTCCTTTGACTACCTTAAACCTTCTCTACTCTCTTCAAACCACACTCTCAGCATATATTTTCACTCTCCTATTTCCAAATAAAATAGAATTCATCAGACCGTTTCCCCTGTTTTTTCCCTCCAGCCCATCTAACTGCAACTTAAAAGTGATCCTTTTCTTTGGCTCCAAAGCTTATCAGTGCTCCATGGCCTGCATTGAGCAAGGTCCTGGAATTTAC";
I get this output:
Elapsed time: 117.9966536 seconds.
Found: GCATTGAGCAAGGTCCTGGAATAGGGTTCAGGGCAGTGATAAATTCCAATGCTAATGTGGGATGAACCCCTGGGGGCTTCACCTCCTTCTTTTAGGATGTCTGTGGAAATTGAGAAAGACCCTCTCTCCATGGTATTGAGCCTACACATGGTTGTGAAACAAGAATTGCTTTGACTGAATCTCACTTTTTTCTAGGTGGACTCACAGAGTCCCTTACATTCATTATTAGTTCATTCGTTCATCCACTCATTATCTTACTCTGTATCAATCCCAGGGCTGAGAACTTACTTTCCCCATTGTCCTCATTTATCTAACAAGCCATTCTCCTATTGACCTCATTTTCTTAGTCTGATGAACACTGGCTGCTAGAGAAAATCTTACCAATAAGCAGATTGACTTCAGTGAATATTCATAGTCTGCAAGTTCAACTGGCTCTTTGAATGCCTGGTCTTTCCTTTGACTACCTTAAACCTTCTCTACTCTCTTCAAACCACTTTTCTTTCACTCTCAGCATATATTTTCACTCTCCTATTTCCAAATAAAATAGAATTCATCAGACCGTTTCCCCTGTTTTTTCCCTCCAGCCCATCTAAAATTTACCTGCAACT
This could also probably be sped up by running 1 thread for each processing core in the system, and split up the workload.
ASKER
This could also probably be sped up by running 1 thread for each processing core in the system, and split up the workload.I agree. We have a 64GB machine and 10 cores. But I am not strong on multithread at all.
sample:
if (textbox1.Text.ToUpper.Con
or
if (Xstring.ToUpper.contains(