首先,为了提供完整的信息披露,我想指出这与机器学习课程中的家庭作业有关。这个问题不是作业问题,而是我需要弄清楚的,以便完成创建ID3决策树算法的更大问题。
当给出真值表时,我需要生成类似于以下的树
let learnedTree = Node(0,"A0", Node(2,"A2", Leaf(0), Leaf(1)), Node(1,"A1", Node(2,"A2", Leaf(0), Leaf(1)), Leaf(0)))
learnedTree属于BinaryTree类型,我将其定义如下:
type BinaryTree =
| Leaf of int
| Node of int * string * BinaryTree * BinaryTree
ID3算法考虑了各种方程式以确定在哪里拆分树,而且我已经弄明白了,我只是在从真值表创建学习树时遇到了麻烦。例如,如果我有下表
A1 | A2 | A3 | Class
1 0 0 1
0 1 0 1
0 0 0 0
1 0 1 0
0 0 0 0
1 1 0 1
0 1 1 0
我决定拆分属性A1,我最终得到以下内容:
(A1 = 1) A1 (A1 = 0)
A2 | A3 | Class A2 | A3 | Class
0 0 1 1 0 1
0 1 0 0 0 0
1 0 1 0 0 0
0 1 1
然后我将分割左侧并分割右侧,并继续递归模式,直到叶子节点是纯净的,并且最终得到一个类似于以下基于分割的树。
let learnedTree = Node(0,"A0", Node(2,"A2", Leaf(0), Leaf(1)), Node(1,"A1", Node(2,"A2", Leaf(0), Leaf(1)), Leaf(0)))
到目前为止,这是我一直“被黑了”的事情,但我想我可能会离开:
let rec createTree (listToSplit : list<list<float>>) index =
let leftSideSplit =
listToSplit |> List.choose (fun x -> if x.Item(index) = 1. then Some(x) else None)
let rightSideSplit =
listToSplit |> List.choose (fun x -> if x.Item(index) = 0. then Some(x) else None)
if leftSideSplit.Length > 0 then
let pureCheck = isListPure leftSideSplit
if pureCheck = 0 then
printfn "%s" "Pure left node class 0"
createTree leftSideSplit (index + 1)
else if pureCheck = 1 then
printfn "%s" "Pure left node class 1"
createTree leftSideSplit (index + 1)
else
printfn "%s - %A" "Recursing Left" leftSideSplit
createTree leftSideSplit (index + 1)
else printfn "%s" "Pure left node class 0"
我应该使用模式匹配吗?任何提示/想法/帮助?非常感谢!
答案 0 :(得分:6)
编辑:我已经在我的博客上发布了ID3的实现: http://blogs.msdn.com/chrsmith
嘿Jim,我一直想写一篇在F#中实现ID3的博客帖子一段时间 - 感谢给我一个执行。虽然这段代码没有完全(或正确)实现算法,但它应该足以让你开始。一般来说,你有正确的方法 - 将每个分支代表一个有区别的联合案例是好的。和Brian说的一样,List.partition绝对是一个方便的功能。使这项工作正确的诀窍在于确定要拆分的最佳属性/值对 - 为此,您需要通过熵等计算信息增益。
type Attribute = string
type Value = string
type Record =
{
Weather : string
Temperature : string
PlayTennis : bool
}
override this.ToString() =
sprintf
"{Weather = %s, Temp = %s, PlayTennis = %b}"
this.Weather
this.Temperature
this.PlayTennis
type Decision = Attribute * Value
type DecisionTreeNode =
| Branch of Decision * DecisionTreeNode * DecisionTreeNode
| Leaf of Record list
// ------------------------------------
// Splits a record list into an optimal split and the left / right branches.
// (This is where you use the entropy function to maxamize information gain.)
// Record list -> Decision * Record list * Record list
let bestSplit data =
// Just group by weather, then by temperature
let uniqueWeathers =
List.fold
(fun acc item -> Set.add item.Weather acc)
Set.empty
data
let uniqueTemperatures =
List.fold
(fun acc item -> Set.add item.Temperature acc)
Set.empty
data
if uniqueWeathers.Count = 1 then
let bestSplit = ("Temperature", uniqueTemperatures.MinimumElement)
let left, right =
List.partition
(fun item -> item.Temperature = uniqueTemperatures.MinimumElement)
data
(bestSplit, left, right)
else
let bestSplit = ("Weather", uniqueWeathers.MinimumElement)
let left, right =
List.partition
(fun item -> item.Weather = uniqueWeathers.MinimumElement)
data
(bestSplit, left, right)
let rec determineBranch data =
if List.length data < 4 then
Leaf(data)
else
// Use the entropy function to break the dataset on
// the category / value that best splits the data
let bestDecision, leftBranch, rightBranch = bestSplit data
Branch(
bestDecision,
determineBranch leftBranch,
determineBranch rightBranch)
// ------------------------------------
let rec printID3Result indent branch =
let padding = new System.String(' ', indent)
match branch with
| Leaf(data) ->
data |> List.iter (fun item -> printfn "%s%s" padding <| item.ToString())
| Branch(decision, lhs, rhs) ->
printfn "%sBranch predicate [%A]" padding decision
printfn "%sWhere predicate is true:" padding
printID3Result (indent + 4) lhs
printfn "%sWhere predicate is false:" padding
printID3Result (indent + 4) rhs
// ------------------------------------
let dataset =
[
{ Weather = "windy"; Temperature = "hot"; PlayTennis = false }
{ Weather = "windy"; Temperature = "cool"; PlayTennis = false }
{ Weather = "nice"; Temperature = "cool"; PlayTennis = true }
{ Weather = "nice"; Temperature = "cold"; PlayTennis = true }
{ Weather = "humid"; Temperature = "hot"; PlayTennis = false }
]
printfn "Given input list:"
dataset |> List.iter (printfn "%A")
printfn "ID3 split resulted in:"
let id3Result = determineBranch dataset
printID3Result 0 id3Result
答案 1 :(得分:5)
您可以使用List.partition代替两个List.choose调用。
(或现在http://msdn.microsoft.com/en-us/library/ee353738(VS.100).aspx)
我不清楚模式匹配会给你带来多少收获;输入类型(列表列表)和处理(分区和'纯度'检查)并没有真正适用于此。
当然,当你最终获得'结束'(一个纯粹的列表)时,你需要创建一个树,然后当输入只有一个'side'并且它是'pure'时,这个函数可能会创建一个Leaf,但是在每个其他输入的左侧和右侧结果中创建一个节点。也许。我没有完全理解算法。
希望这会有助于引导你一点点。绘制一些较小的样本输入和输出可能有助于解决函数体的各种情况。
答案 2 :(得分:1)
感谢Brian&amp;克里斯!我实际上能够弄清楚这一点,我最终得到了以下内容。这将计算用于确定最佳拆分位置的信息增益。我相信我可能有更好的方法来达到这个解决方案,特别是在选择的数据结构周围,但这是一个开始。我打算稍后改进一下。
#light
open System
let trainList =
[
[1.;0.;0.;1.;];
[0.;1.;0.;1.;];
[0.;0.;0.;0.;];
[1.;0.;1.;0.;];
[0.;0.;0.;0.;];
[1.;1.;0.;1.;];
[0.;1.;1.;0.;];
[1.;0.;0.;1.;];
[0.;0.;0.;0.;];
[1.;0.;0.;1.;];
]
type BinaryTree =
| Leaf of int
| Node of int * string * BinaryTree * BinaryTree
let entropyList nums =
let sumOfnums =
nums
|> Seq.sum
nums
|> Seq.map (fun x -> if x=0.00 then x else (-((x/sumOfnums) * Math.Log(x/sumOfnums, 2.))))
|> Seq.sum
let entropyBinaryList (dataListOfLists:list<list<float>>) =
let classList =
dataListOfLists
|> List.map (fun x -> x.Item(x.Length - 1))
let ListOfNo =
classList
|> List.choose (fun x -> if x = 0. then Some(x) else None)
let ListOfYes =
classList
|> List.choose (fun x -> if x = 1. then Some(x) else None)
let numberOfYes : float = float ListOfYes.Length
let numberOfNo : float = float ListOfNo.Length
let ListOfNumYesAndSumNo = [numberOfYes; numberOfNo]
entropyList ListOfNumYesAndSumNo
let conditionalEntropy (dataListOfLists:list<list<float>>) attributeNumber =
let NoAttributeList =
dataListOfLists
|> List.choose (fun x -> if x.Item(attributeNumber) = 0. then Some(x) else None)
let YesAttributeList =
dataListOfLists
|> List.choose (fun x -> if x.Item(attributeNumber) = 1. then Some(x) else None)
let numberOfYes : float = float YesAttributeList.Length
let numberOfNo : float = float NoAttributeList.Length
let noConditionalEntropy = (entropyBinaryList NoAttributeList) * (numberOfNo/(numberOfNo + numberOfYes))
let yesConditionalEntropy = (entropyBinaryList YesAttributeList) * (numberOfYes/(numberOfNo + numberOfYes))
[noConditionalEntropy; yesConditionalEntropy]
let findBestSplitIndex(listOfInstances : list<list<float>>) =
let IGList =
[0..(listOfInstances.Item(0).Length - 2)]
|> List.mapi (fun i x -> (i, (entropyBinaryList listOfInstances) - (List.sum (conditionalEntropy listOfInstances x))))
IGList
|> List.maxBy snd
|> fst
let isListPure (listToCheck : list<list<float>>) =
let splitList = listToCheck |> List.choose (fun x -> if x.Item(x.Length - 1) = 1. then Some(x) else None)
if splitList.Length = listToCheck.Length then 1
else if splitList.Length = 0 then 0
else -1
let rec createTree (listToSplit : list<list<float>>) =
let pureCheck = isListPure listToSplit
if pureCheck = 0 then
printfn "%s" "Pure - Leaf(0)"
else if pureCheck = 1 then
printfn "%s" "Pure - Leaf(1)"
else
printfn "%A - is not pure" listToSplit
if listToSplit.Length > 1 then // There are attributes we can split on
// Chose best place to split list
let splitIndex = findBestSplitIndex(listToSplit)
printfn "spliting at index %A" splitIndex
let leftSideSplit =
listToSplit |> List.choose (fun x -> if x.Item(splitIndex) = 1. then Some(x) else None)
let rightSideSplit =
listToSplit |> List.choose (fun x -> if x.Item(splitIndex) = 0. then Some(x) else None)
createTree leftSideSplit
createTree rightSideSplit
else
printfn "%s" "Not Pure, but can't split choose based on heuristics - Leaf(0 or 1)"