二进制余弦系数

时间:2010-03-19 16:11:36

标签: c++

我得到了以下论坛来计算这个

SIM = |Q∩D| /√| Q |√| D |

我去了一个类,用来比较由一系列单词组成的字符串

#pragma once

#include <vector>
#include <string>
#include <iostream>
#include <vector>

using namespace std;

class StringSet
{
public:
 StringSet(void);
 StringSet( const string the_strings[], const int no_of_strings);
 ~StringSet(void);
 StringSet( const vector<string> the_strings);
 void add_string( const string the_string);
 bool remove_string( const string the_string);
 void clear_set(void);
 int no_of_strings(void) const;
 friend ostream& operator <<(ostream& outs, StringSet& the_strings);
 friend StringSet operator *(const StringSet& first, const StringSet& second);
 friend StringSet operator +(const StringSet& first, const StringSet& second);
 double binary_coefficient( const StringSet& the_second_set);

private:
 vector<string> set;
};

#include "StdAfx.h"
#include "StringSet.h"
#include <iterator>
#include <algorithm>
#include <stdexcept>
#include <iostream>
#include <cmath>


StringSet::StringSet(void)
{
}

StringSet::~StringSet(void)
{
}

StringSet::StringSet( const vector<string> the_strings)
{
 set = the_strings;
}

StringSet::StringSet( const string the_strings[], const int no_of_strings)
{
 copy( the_strings, &the_strings[no_of_strings], back_inserter(set));
}

void StringSet::add_string( const string the_string)
{
 try
 {
  if( find( set.begin(), set.end(), the_string) == set.end())
  {
   set.push_back(the_string);
  }
  else
  {
   //String is already in the set.
   throw domain_error("String is already in the set");
  }
 }
 catch( domain_error e)
 {
  cout << e.what();
  exit(1);
 }

}

bool StringSet::remove_string( const string the_string)
{
 //Found the occurrence of the string. return it an iterator pointing to it.
 vector<string>::iterator iter;
 if( ( iter = find( set.begin(), set.end(), the_string) ) != set.end())
 {
  set.erase(iter);
  return true;
 }
 return false;
}
void StringSet::clear_set(void)
{
 set.clear(); 
}

int StringSet::no_of_strings(void) const
{
 return set.size();
}

ostream& operator <<(ostream& outs, StringSet& the_strings)
{
 vector<string>::const_iterator const_iter = the_strings.set.begin();
 for( ; const_iter != the_strings.set.end(); const_iter++)
 {
  cout << *const_iter << " ";
 }
 cout << endl;
 return outs;
}

//This function returns the union of the two string sets.

StringSet operator *(const StringSet& first, const StringSet& second)
{
 vector<string> new_string_set;
 new_string_set = first.set;
 for( unsigned int i = 0; i < second.set.size(); i++)
 {
  vector<string>::const_iterator const_iter = find(new_string_set.begin(), new_string_set.end(), second.set[i]);
  //String is new - include it.
  if( const_iter == new_string_set.end() )
  {
   new_string_set.push_back(second.set[i]);
  }
 }
 StringSet the_set(new_string_set);
 return the_set;
}
//This method returns the intersection of the two string sets.

StringSet operator +(const StringSet& first, const StringSet& second)
{
 //For each string in the first string look though the second and see if
 //there is a matching pair, in which case include the string in the set.
 vector<string> new_string_set;
 vector<string>::const_iterator const_iter = first.set.begin();
 for ( ; const_iter != first.set.end(); ++const_iter)
 {
  //Then search through the entire second string to see if
  //there is a duplicate.
  vector<string>::const_iterator const_iter2 = second.set.begin();
  for( ; const_iter2 != second.set.end(); const_iter2++)
  {
   if( *const_iter == *const_iter2 )
   {
    new_string_set.push_back(*const_iter);
   }
  }
 }
 StringSet new_set(new_string_set);
 return new_set;

}

double StringSet::binary_coefficient( const StringSet& the_second_set)
{
 double coefficient;
 StringSet intersection = the_second_set + set;

 coefficient = intersection.no_of_strings() / sqrt((double) no_of_strings()) * sqrt((double)the_second_set.no_of_strings());
 return coefficient;
}

然而,当我尝试使用以下主函数计算系数时:

// Exercise13.cpp : main project file.

#include "stdafx.h"
#include <boost/regex.hpp>
#include "StringSet.h"

using namespace System;
using namespace System::Runtime::InteropServices;

using namespace boost;

//This function takes as input a string, which 
//is then broken down into a series of words
//where the punctuaction is ignored.



StringSet break_string( const string the_string)
{
 regex re;
 cmatch matches;
 StringSet words;
 string search_pattern = "\\b(\\w)+\\b";

 try
 {
  // Assign the regular expression for parsing.
  re = search_pattern;
 }
 catch( regex_error& e)
 {
  cout << search_pattern << " is not a valid regular expression: \""
   << e.what() << "\"" << endl;
  exit(1);
 }

 sregex_token_iterator p(the_string.begin(), the_string.end(), re, 0);
 sregex_token_iterator end;
 for( ; p != end; ++p)
 {
   string new_string(p->first, p->second);
   String^ copy_han = gcnew String(new_string.c_str());
   String^ copy_han2 = copy_han->ToLower();
   char* str2 = (char*)(void*)Marshal::StringToHGlobalAnsi(copy_han2);
   string new_string2(str2);
   words.add_string(new_string2);
 }


 return words;


}

int main(array<System::String ^> ^args)
{
 StringSet words = break_string("Here is a string, with some; words");
 StringSet words2 = break_string("There is another string,");

 cout << words.binary_coefficient(words2);
    return 0;
}

我得到一个1.5116的索引而不是0到1的值。

有没有人知道为什么会这样?

任何帮助都将不胜感激。

2 个答案:

答案 0 :(得分:2)

在最终计算中需要更多括号。 a / b * c被解析为(a / b) * c,但您需要a / (b * c)

答案 1 :(得分:0)

也许这只是一个优先事项

coefficient = intersection.no_of_strings() / sqrt((double) no_of_strings()) * sqrt((double)the_second_set.no_of_strings());

没有指定你必须首先乘以,然后除。他们的优先权是相同的,但我不确定选择的行为..你是否尝试指定它:

coefficient = intersection.no_of_strings() / (sqrt((double) no_of_strings()) * sqrt((double)the_second_set.no_of_strings()));