エンジニアのソフトウェア的愛情

または私は如何にして心配するのを止めてプログラムを・愛する・ようになったか

統計的に正しいランキングを行う方法をC++とかHaskellとかで書く

http://d.hatena.ne.jp/makiyamakoji/20090512/p1のコードを参考に(というか、ほとんど一緒ですが)C++で書き直してみた。

#include <algorithm>
#include <stdexcept>
#include <cmath>

double calcNormalDistPercentile(double percent)
{
    if((percent <= 0) || (1 <= percent))
    {
        throw std::range_error("INVALID PERCENT");
    }

    if(percent == 0.5)
    {
        return 0;
    }

    double b[] =
    {
         1.570796288,
         0.03706987906,
        -0.8364353589e-3,
        -0.2250947176e-3,
         0.6841218299e-5,
         0.5824238515e-5,
        -0.104527497e-5,
         0.8360937017e-7,
        -0.3231081277e-8,
         0.3657763036e-10,
         0.6936233982e-12
    };

    int b_length = sizeof(b)/sizeof(*b);

    double w3 = -std::log(4 * percent * (1 - percent));
    double w1 = b[0];

    for(int i = 1; i < b_length; ++i)
    {
        w1 += b[i] * std::pow(w3, i);
    }

    if(percent > 0.5)
    {
        return std::sqrt(w1 * w3);
    }
    else
    {
        return -std::sqrt(w1 * w3);
    }
}

double calcScore(int posCount, int negCount, double percent)
{
    int    n  = posCount + negCount;
    double p  = static_cast<double>(posCount) / n;
    double z  = calcNormalDistPercentile(percent);
    double z2 = z * z;

    double temp = p * (1 - p) + z2 / (4 * n);
    double numerator = p + (z2 / (2 * n)) - z * std::sqrt(temp / n);
    double denominator = 1 + z2 / n;

    return numerator / denominator;
}

template<int N>
double calcScore5(int (&ratings)[N], double percent)
{
    int poscount = 0;
    int negcount = 0;

    for(int i = 0; i < N; ++i)
    {
        if     (ratings[i] == 5) { poscount += 4;                }
        else if(ratings[i] == 4) { poscount += 3; negcount += 1; }
        else if(ratings[i] == 3) { poscount += 2; negcount += 2; }
        else if(ratings[i] == 2) { poscount += 1; negcount += 3; }
        else if(ratings[i] == 1) {                negcount += 4; }
    }

    return calcScore(poscount, negcount, percent);
}

template<int N>
double calcScore(int (&ratings)[N], double percent)
{
    int poscount = std::count(ratings, ratings + N, 1);
    int negcount = std::count(ratings, ratings + N, 0);

    return calcScore(poscount, negcount, percent);
}

#include <iostream>
#include <iomanip>

int main(int, char* [])
{
    std::cout << std::setprecision(15);

    int rating1[] = { 0, 1, 0, 1, 1, 1, 0, 1 };
    int rating2[] = { 1 };
    std::cout << calcScore(rating1, 0.95) << std::endl;
    std::cout << calcScore(rating2, 0.95) << std::endl;

    int rating3[] = { 4, 4, 5, 5, 5, 5, 5, 5, 5, 5 };
    int rating4[] = { 5 };
    std::cout << calcScore5(rating3, 0.95) << std::endl;
    std::cout << calcScore5(rating4, 0.95) << std::endl;
}


実行結果。

0.34799143561252
0.269865944074627
0.859668176956633
0.596521369044698


で、こういう計算こそ関数型言語の出番でしょ、ってことでHaskellでも書いてみた。直訳なのでまだちょっと不細工。

module Score where

b = [  1.570796288,
       0.03706987906,
      -0.8364353589e-3,
      -0.2250947176e-3,
       0.6841218299e-5,
       0.5824238515e-5,
      -0.104527497e-5,
       0.8360937017e-7,
      -0.3231081277e-8,
       0.3657763036e-10,
       0.6936233982e-12 ]

count s n = length [x | x <- s, x == n]

calcNormalDistPercentile percent
  | (percent <= 0) || (1 <= percent) = 0/0 -- NaN
  | percent == 0.5                   = 0
  | otherwise                        = sqrt $ (((w (tail b) 1) + (head b))) * w3
                                         where
                                           w3 = - (log (4 * percent * (1 - percent)))
                                           w (b:bs) i =  (b * (w3 ^ i)) + (w bs (i + 1))
                                           w [] _     = 0

calcScore' posCount negCount percent =
  numerator / denominator
    where
      n  = posCount + negCount
      p  = posCount / n
      z  = calcNormalDistPercentile percent
      z2 = z * z
      temp = p * (1 - p) + z2 / (4 * n)
      numerator = p + (z2 / (2 * n)) - z * (sqrt (temp / n))
      denominator = 1 + z2 / n

calcScore ratings percent =
  calcScore' posCount negCount percent
    where
      posCount = toEnum $ count ratings 1
      negCount = toEnum $ count ratings 0

calcScore5 ratings percent =
  calcScore' posCount negCount percent
    where
      r5 = count ratings 5
      r4 = count ratings 4
      r3 = count ratings 3
      r2 = count ratings 2
      r1 = count ratings 1
      posCount = toEnum $ (r5 * 4) + (r4 * 3) + (r3 * 2) + (r2 * 1) + (r1 * 0)
      negCount = toEnum $ (r5 * 0) + (r4 * 1) + (r3 * 2) + (r2 * 3) + (r1 * 4)
module Main where

import Score

rating1 = [ 0, 1, 0, 1, 1, 1, 0, 1 ]
rating2 = [ 1 ]
rating3 = [ 4, 4, 5, 5, 5, 5, 5, 5, 5, 5 ]
rating4 = [ 5 ]

main = do
  print $ calcScore rating1 0.95
  print $ calcScore rating2 0.95
  print $ calcScore5 rating3 0.95
  print $ calcScore5 rating4 0.95


実行結果。

0.3479914356125201
0.2698659440746267
0.8596681769566334
0.5965213690446981