Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cslib.lean
Original file line number Diff line number Diff line change
Expand Up @@ -139,4 +139,5 @@ public import Cslib.Logics.Modal.Denotation
public import Cslib.Logics.Propositional.Defs
public import Cslib.Logics.Propositional.NaturalDeduction.Basic
public import Cslib.MachineLearning.PACLearning.Defs
public import Cslib.MachineLearning.PACLearning.VersionSpace
public import Cslib.Probability.PMF
296 changes: 296 additions & 0 deletions Cslib/MachineLearning/PACLearning/VersionSpace.lean
Original file line number Diff line number Diff line change
@@ -0,0 +1,296 @@
/-
Copyright (c) 2026 Dhruv Gupta. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
Authors: Dhruv Gupta
-/

module

public import Cslib.MachineLearning.PACLearning.Defs
public import Mathlib.MeasureTheory.Measure.Dirac
public import Mathlib.MeasureTheory.Measure.Map

/-! # Version Space
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lets move this above the section to match the rest of the files in cslib.


The *version space* of a concept class `C` given a labeled sample `S` is the
subset of `C` whose concepts agree with `S` on every observed point — the
classical "concepts still consistent with the data" of Mitchell (1977) and
Angluin (1980).

## Main definitions

- `VersionSpace C S`: the subset of `C` whose concepts agree with `S` on every
sample point.
- `IsConsistent A C`: a learner is consistent with `C` if its output always lies
in the version space at the received sample.
- `empiricalMiscount h S`: number of sample points where `h` errs (`[DecidableEq β]`).
- `empiricalMeasure S`: the uniform Dirac mixture over the sample.
- `empiricalError h S`: the empirical distribution's mass on the disagreement set.
- `Realizable C S`: some concept in `C` labels every sample point correctly.

## Main results

- `versionSpace_subset`, `versionSpace_empty_sample`, `versionSpace_reindex`,
`versionSpace_antitone`, `versionSpace_mono_C`: structural properties.
- `mem_versionSpace_iff_empiricalMiscount_zero`: combinatorial bridge.
- `mem_versionSpace_iff_empiricalError_zero`: measure-theoretic bridge.
- `IsConsistent.empiricalMiscount_eq_zero`, `IsConsistent.empiricalError_eq_zero`:
consistent learners achieve zero error / miscount on every sample.
- `mem_versionSpace_of_realizable`, `Realizable.versionSpace_nonempty`: realizable
samples give non-empty version spaces.
- `ae_mem_versionSpace_of_realizable`: under iid sampling from a realizable
joint distribution, the target concept lies in the version space almost surely.

## References

* [Mitchell1977]
* [Mitchell1982]
* [Angluin1980]
* [Mitchell1997]
-/

@[expose] public section

open MeasureTheory Set
open scoped ENNReal

namespace Cslib.MachineLearning.PACLearning

variable {α : Type*} {β : Type*}

/-! ### Version Space -/

/-- The *version space* of a concept class `C` given a labeled sample `S`:
the set of concepts in `C` whose labels agree with `S` on every observed point. -/
def VersionSpace {m : ℕ} (C : ConceptClass α β) (S : LabeledSample α β m) :
ConceptClass α β :=
{h ∈ C | ∀ i : Fin m, h (S i).1 = (S i).2}

/-- Membership in the version space unfolds to concept membership plus
per-sample consistency. -/
theorem mem_versionSpace_iff {m : ℕ} {C : ConceptClass α β}
{S : LabeledSample α β m} {h : α → β} :
h ∈ VersionSpace C S ↔ h ∈ C ∧ ∀ i : Fin m, h (S i).1 = (S i).2 := Iff.rfl

/-- The version space is a subset of the original concept class. -/
theorem versionSpace_subset {m : ℕ} (C : ConceptClass α β)
(S : LabeledSample α β m) :
VersionSpace C S ⊆ C := fun _ hh => hh.1

/-- Version space on the empty sample equals the whole concept class. -/
theorem versionSpace_empty_sample (C : ConceptClass α β)
(S : LabeledSample α β 0) :
VersionSpace C S = C := by
ext h
refine ⟨fun hh => hh.1, fun hh => ⟨hh, fun i => i.elim0⟩⟩

/-- *Version space reindexing.* For any reindexing `f : Fin m → Fin n`, the
version space on `S` is contained in the version space on the reindexed sample
`S ∘ f`. -/
theorem versionSpace_reindex {m n : ℕ} (f : Fin m → Fin n) (C : ConceptClass α β)
(S : LabeledSample α β n) :
VersionSpace C S ⊆ VersionSpace C (S ∘ f) :=
fun _ hh => ⟨hh.1, fun i => hh.2 (f i)⟩

/-- *Version space antitonicity.* Given a sample of size `n` and `m ≤ n`, the
version space on all `n` observations is a subset of the version space on the
first `m` observations. Special case of `versionSpace_reindex` with
`f := Fin.castLE hmn`. -/
theorem versionSpace_antitone {m n : ℕ} (hmn : m ≤ n) (C : ConceptClass α β)
(S : LabeledSample α β n) :
VersionSpace C S ⊆ VersionSpace C (S ∘ Fin.castLE hmn) :=
versionSpace_reindex (Fin.castLE hmn) C S

/-- *Version space is monotone in the concept class.* -/
theorem versionSpace_mono_C {m : ℕ} {C C' : ConceptClass α β} (hCC' : C ⊆ C')
(S : LabeledSample α β m) :
VersionSpace C S ⊆ VersionSpace C' S :=
fun _ hh => ⟨hCC' hh.1, hh.2⟩

/-! ### Empirical Error -/

/-- The *empirical miscount* of a hypothesis `h` on a labeled sample `S`: the
number of sample points where `h` predicts incorrectly. -/
def empiricalMiscount [DecidableEq β] {m : ℕ} (h : α → β)
(S : LabeledSample α β m) : ℕ :=
(Finset.univ.filter fun i : Fin m => h (S i).1 ≠ (S i).2).card

section EmpiricalMeasure
variable [MeasurableSpace α] [MeasurableSpace β]

/-- The *empirical distribution* of a labeled sample: the uniform mixture of
Dirac measures at each sample point. Equals the zero measure when `m = 0`. -/
noncomputable def empiricalMeasure {m : ℕ} (S : LabeledSample α β m) :
Measure (α × β) :=
if _hm : m = 0 then 0
else (m : ℝ≥0∞)⁻¹ • ∑ i, Measure.dirac (S i)

/-- The *empirical 0-1 error* of `h` on `S`: the empirical distribution's
mass on the disagreement set. -/
noncomputable def empiricalError {m : ℕ} (h : α → β) (S : LabeledSample α β m) :
ℝ≥0∞ :=
error (empiricalMeasure S) h

end EmpiricalMeasure

/-- Version-space membership equals concept-class membership plus zero empirical
miscount (combinatorial bridge). -/
theorem mem_versionSpace_iff_empiricalMiscount_zero [DecidableEq β]
{m : ℕ} {C : ConceptClass α β} {S : LabeledSample α β m} {h : α → β} :
h ∈ VersionSpace C S ↔ h ∈ C ∧ empiricalMiscount h S = 0 := by
simp only [empiricalMiscount, Finset.card_eq_zero, Finset.filter_eq_empty_iff,
Finset.mem_univ, true_implies, ne_eq, Decidable.not_not]
rfl

/-- Version-space membership equals concept-class membership plus zero empirical
error (measure-theoretic bridge). -/
theorem mem_versionSpace_iff_empiricalError_zero
[MeasurableSpace α] [MeasurableSpace β]
[MeasurableSingletonClass α] [MeasurableSingletonClass β]
{m : ℕ} {C : ConceptClass α β} {S : LabeledSample α β m} {h : α → β} :
h ∈ VersionSpace C S ↔ h ∈ C ∧ empiricalError h S = 0 := by
refine and_congr_right fun _ => ?_
unfold empiricalError empiricalMeasure error
rcases Nat.eq_zero_or_pos m with hm | hm
· subst hm
rw [dif_pos rfl]
simp only [Measure.coe_zero, Pi.zero_apply]
exact iff_of_true (fun i => i.elim0) trivial
· have hm_ne : m ≠ 0 := Nat.pos_iff_ne_zero.mp hm
have hm_inv_ne : (m : ℝ≥0∞)⁻¹ ≠ 0 :=
ENNReal.inv_ne_zero.mpr (ENNReal.natCast_ne_top m)
rw [dif_neg hm_ne, Measure.smul_apply, Measure.finsetSum_apply]
simp only [Measure.dirac_apply, Set.indicator, Set.mem_setOf_eq, Pi.one_apply,
smul_eq_mul]
rw [mul_eq_zero]
constructor
· intro hh
right
apply Finset.sum_eq_zero
intro i _
rw [if_neg]
intro hne
exact hne (hh i)
· rintro (h1 | h2)
· exact absurd h1 hm_inv_ne
· intro i
have hi := (Finset.sum_eq_zero_iff.mp h2) i (Finset.mem_univ i)
by_contra hne
rw [if_pos hne] at hi
exact one_ne_zero hi

/-! ### Consistent Learners -/

/-- A learner is *consistent* with the concept class `C` if, on every labeled
sample it receives, its output hypothesis lies in the version space of `C` at
that sample — i.e. the output is in `C` and agrees with every observed
labeled pair. -/
def IsConsistent {m : ℕ} (A : Learner α β m) (C : ConceptClass α β) : Prop :=
∀ S : LabeledSample α β m, A S ∈ VersionSpace C S

/-- A consistent learner's output is always in the concept class. -/
theorem IsConsistent.output_mem_conceptClass {m : ℕ} {A : Learner α β m}
{C : ConceptClass α β} (hA : IsConsistent A C) (S : LabeledSample α β m) :
A S ∈ C := (hA S).1

/-- A consistent learner's output agrees with the sample on every observed
point. -/
theorem IsConsistent.output_agrees {m : ℕ} {A : Learner α β m}
{C : ConceptClass α β} (hA : IsConsistent A C) (S : LabeledSample α β m)
(i : Fin m) :
A S (S i).1 = (S i).2 := (hA S).2 i

/-- A consistent learner has zero empirical miscount on every sample. -/
theorem IsConsistent.empiricalMiscount_eq_zero [DecidableEq β]
{m : ℕ} {A : Learner α β m} {C : ConceptClass α β} (hA : IsConsistent A C)
(S : LabeledSample α β m) :
empiricalMiscount (A S) S = 0 :=
(mem_versionSpace_iff_empiricalMiscount_zero.mp (hA S)).2

/-- A consistent learner has zero empirical error on every sample. -/
theorem IsConsistent.empiricalError_eq_zero
[MeasurableSpace α] [MeasurableSpace β]
[MeasurableSingletonClass α] [MeasurableSingletonClass β]
{m : ℕ} {A : Learner α β m} {C : ConceptClass α β}
(hA : IsConsistent A C) (S : LabeledSample α β m) :
empiricalError (A S) S = 0 :=
(mem_versionSpace_iff_empiricalError_zero.mp (hA S)).2

/-! ### Realizable case -/

/-- A labeled sample `S` is *realizable* by concept class `C` if some concept
in `C` labels every sample point correctly. -/
def Realizable {m : ℕ} (C : ConceptClass α β) (S : LabeledSample α β m) : Prop :=
∃ c ∈ C, ∀ i : Fin m, (S i).2 = c (S i).1

/-- *Realizable version-space nonemptiness.* If a target concept `c` lies in
`C` and the sample `S` is labeled by `c` (i.e. every `(S i).2 = c (S i).1`),
then `c` itself lies in the version space `VersionSpace C S`. -/
theorem mem_versionSpace_of_realizable {m : ℕ} {C : ConceptClass α β}
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lets take c implicit here to match the following theorem.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I also think something like:

  theorem ae_mem_versionSpace_of_realizable
      [MeasurableSpace α] [MeasurableSpace β]
      {C : ConceptClass α β} {c : α → β} (hc : c ∈ C) (hcm : Measurable c)
      (P : Measure α) [IsProbabilityMeasure P] (m : ℕ) :
      ∀ᵐ S : LabeledSample α β m
        ∂(Measure.pi (fun _ : Fin m => P.map (fun x => (x, c x)))),
        c ∈ VersionSpace C S := ...

might be desirable here?

{c : α → β} (hc : c ∈ C) (S : LabeledSample α β m)
(hS : ∀ i : Fin m, (S i).2 = c (S i).1) :
c ∈ VersionSpace C S :=
⟨hc, fun i => (hS i).symm⟩

/-- A realizable sample has nonempty version space. -/
theorem Realizable.versionSpace_nonempty {m : ℕ} {C : ConceptClass α β}
{S : LabeledSample α β m} (h : Realizable C S) :
(VersionSpace C S).Nonempty :=
⟨h.choose, mem_versionSpace_of_realizable h.choose_spec.1 S h.choose_spec.2⟩

/-! ### Probabilistic Realizable -/

/-- Under the pushforward of a probability measure `P` along the graph map
`x ↦ (x, c x)`, the graph of `c` has measure `1`. -/
private lemma map_graph_eq_one
[MeasurableSpace α] [MeasurableSpace β]
{c : α → β} (hcm : Measurable c) (P : Measure α) [IsProbabilityMeasure P]
(hG : MeasurableSet {p : α × β | p.2 = c p.1}) :
(P.map (fun x => (x, c x))) {p : α × β | p.2 = c p.1} = 1 := by
have hφ : Measurable (fun x : α => (x, c x)) := by fun_prop
rw [Measure.map_apply hφ hG]
have hpre : (fun x : α => (x, c x)) ⁻¹' {p : α × β | p.2 = c p.1} = Set.univ := by
ext x; simp
rw [hpre, measure_univ]

/-- The iid product of the realizable joint distribution assigns measure `1`
to the set of samples where every coordinate lies on the graph of `c`. -/
private lemma pi_map_graph_eq_one
[MeasurableSpace α] [MeasurableSpace β]
{c : α → β} (hcm : Measurable c) (P : Measure α) [IsProbabilityMeasure P]
(hG : MeasurableSet {p : α × β | p.2 = c p.1}) {m : ℕ} :
(Measure.pi (fun _ : Fin m => P.map (fun x => (x, c x))))
(Set.univ.pi (fun _ : Fin m => {p : α × β | p.2 = c p.1})) = 1 := by
have hφ : Measurable (fun x : α => (x, c x)) := by fun_prop
haveI : IsProbabilityMeasure (P.map (fun x : α => (x, c x))) :=
Measure.isProbabilityMeasure_map hφ.aemeasurable
rw [Measure.pi_pi]
simp [map_graph_eq_one hcm P hG]

/-- Under iid sampling from the realizable joint distribution induced by
`c ∈ C` and a probability measure `P` on `α`, the target concept `c` lies in
the version space almost surely. -/
theorem ae_mem_versionSpace_of_realizable
[MeasurableSpace α] [MeasurableSpace β]
{C : ConceptClass α β} {c : α → β} (hc : c ∈ C) (hcm : Measurable c)
(hG : MeasurableSet {p : α × β | p.2 = c p.1})
(P : Measure α) [IsProbabilityMeasure P] (m : ℕ) :
∀ᵐ S : LabeledSample α β m
∂(Measure.pi (fun _ : Fin m => P.map (fun x => (x, c x)))),
c ∈ VersionSpace C S := by
have hφ : Measurable (fun x : α => (x, c x)) := by fun_prop
haveI : IsProbabilityMeasure (P.map (fun x : α => (x, c x))) :=
Measure.isProbabilityMeasure_map hφ.aemeasurable
rw [ae_iff]
have hsub : {S : Fin m → α × β | ¬ c ∈ VersionSpace C S} ⊆
(Set.univ.pi (fun _ : Fin m => {p : α × β | p.2 = c p.1}))ᶜ := by
intro S hS hcontra
simp only [Set.mem_pi, Set.mem_univ, true_implies, Set.mem_setOf_eq] at hcontra
exact hS ⟨hc, fun i => (hcontra i).symm⟩
have hcompl : (Measure.pi (fun _ : Fin m => P.map (fun x : α => (x, c x))))
((Set.univ.pi (fun _ : Fin m => {p : α × β | p.2 = c p.1}))ᶜ) = 0 := by
rw [prob_compl_eq_one_sub (MeasurableSet.univ_pi fun _ => hG),
pi_map_graph_eq_one hcm P hG, tsub_self]
exact measure_mono_null hsub hcompl

end Cslib.MachineLearning.PACLearning
39 changes: 39 additions & 0 deletions references.bib
Original file line number Diff line number Diff line change
Expand Up @@ -370,3 +370,42 @@ @incollection{WinskelNielsen1995
url = {https://doi.org/10.1093/oso/9780198537809.003.0001},
eprint = {https://academic.oup.com/book/0/chapter/421962123/chapter-pdf/52352653/isbn-9780198537809-book-part-1.pdf},
}

@inproceedings{Mitchell1977,
author = {Mitchell, Tom M.},
title = {Version Spaces: A Candidate Elimination Approach to Rule Learning},
booktitle = {Proceedings of the 5th International Joint Conference on Artificial Intelligence},
volume = {1},
pages = {305--310},
year = {1977}
}

@article{Mitchell1982,
author = {Mitchell, Tom M.},
title = {Generalization as Search},
journal = {Artificial Intelligence},
volume = {18},
number = {2},
pages = {203--226},
year = {1982},
doi = {10.1016/0004-3702(82)90040-6}
}
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should cite "Version Spaces: A Candidate Elimination Approach to Rule Learning" instead, or at least in addition.


@article{Angluin1980,
author = {Angluin, Dana},
title = {Inductive Inference of Formal Languages from Positive Data},
journal = {Information and Control},
volume = {45},
number = {2},
pages = {117--135},
year = {1980},
doi = {10.1016/S0019-9958(80)90285-5}
}

@book{Mitchell1997,
author = {Mitchell, Tom M.},
title = {Machine Learning},
year = {1997},
publisher = {McGraw-Hill},
isbn = {0070428077}
}