!2










!3




!4
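Let $w$ be a word that appears in sentences $\{s_1, \ldots, s_n\}$
at indices $\{i_1, \ldots, i_n\}$ respectively,
such that $w = s_1[i_1] = \cdots = s_n[i_n]$.
Let $f_l(s, i)$ be a function that maps $s[i]$ to its representation in layer $l$.
$$\mathrm{SelfSim}_l(w) = \frac{1}{n^2 - n} \sum_{j} \sum_{k \neq j} \cos\big(f_l(s_j, i_j),\, f_l(s_k, i_k)\big)$$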






!5
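Let $s$ be a sentence that is a sequence $\langle w_1, \ldots, w_n \rangle$ of $n$ words.
Let $f_l(s, i)$ be a function that maps $s[i]$ to its representation in layer $l$.
$$\mathrm{IntraSim}_l(s) = \frac{1}{n} \sum_{i} \cos\big(\vec{s_l},\, f_l(s, i)\big) \quad \text{where} \quad \vec{s_l} = \frac{1}{n} \sum_{i} f_l(s, i)$$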






!6
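Let $w$ be a word that appears in sentences $\{s_1, \ldots, s_n\}$
at indices $\{i_1, \ldots, i_n\}$ respectively,
such that $w = s_1[i_1] = \cdots = s_n[i_n]$.
Let $f_l(s, i)$ be a function that maps $s[i]$ to its representation in layer $l$.
$$\mathrm{MEV}_l(w) = \frac{\sigma_1^2}{\sum_i \sigma_i^2}$$
where $[f_l(s_1, i_1) \ldots f_l(s_n, i_n)]$ is the matrix of the representations of $w$'s occurrences
and $\sigma_1 \ldots \sigma_m$ are the first $m$ singular values of this matrix.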




!7
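$$\mathrm{Baseline}(f_l) = \mathbb{E}_{x, y \sim U(\mathcal{O})}\big[\cos\big(f_l(x),\, f_l(y)\big)\big]$$
where $\mathcal{O}$ is the set of all word occurrences
and $f_l(\cdot)$ maps a word occurrence to its representation
in layer $l$.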




!8
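$$\mathrm{Baseline}(f_l) = \mathbb{E}_{x, y \sim U(\mathcal{O})}\big[\cos\big(f_l(x),\, f_l(y)\big)\big]$$
where $\mathcal{O}$ is the set of all word occurrences
and $f_l(\cdot)$ maps a word occurrence to its representation.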










$\mathrm{SelfSim}_l(w) = 0.95$
$\mathrm{Baseline}(f_l) = 0.00$
$\mathrm{Baseline}(f_l) = 0.99$
$\mathrm{Baseline}(f_l) = 0.00$
$\mathrm{Baseline}(f_l) = 0.99$




!9
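$$\mathrm{Baseline}(f_l) = \mathbb{E}_{x, y \sim U(\mathcal{O})}\big[\cos\big(f_l(x),\, f_l(y)\big)\big]$$
where $\mathcal{O}$ is the set of all word occurrences
and $f_l(\cdot)$ maps a word occurrence to its representation.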










$\mathrm{SelfSim}_l(w) = 0.95$
$\mathrm{Baseline}(f_l) = 0.00$
$\mathrm{Baseline}(f_l) = 0.99$
$\mathrm{Baseline}(f_l) = 0.00$
$\mathrm{Baseline}(f_l) = 0.99$






!10
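$$\mathrm{Baseline}(f_l) = \mathbb{E}_{x, y \sim U(\mathcal{O})}\big[\cos\big(f_l(x),\, f_l(y)\big)\big]$$
where $\mathcal{O}$ is the set of all word occurrences
and $f_l(\cdot)$ maps a word occurrence to its representation.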




$$\mathrm{Baseline}(f_l) = \mathbb{E}_{x, y \sim U(\mathcal{O})}\big[\cos\big(f_l(x),\, f_l(y)\big)\big]$$
$$\mathrm{SelfSim}^{*}_l(w) = \mathrm{SelfSim}_l(w) - \mathrm{Baseline}(f_l)$$
!11
$$\mathrm{Baseline}(f_l) = \mathbb{E}_{x, y \sim U(\mathcal{O})}\big[\cos\big(f_l(x),\, f_l(y)\big)\big]$$
!12
$$\mathrm{Baseline}(f_l) = \mathbb{E}_{x, y \sim U(\mathcal{O})}\big[\cos\big(f_l(x),\, f_l(y)\big)\big]$$
!13
$$\mathrm{Baseline}(f_l) = \mathbb{E}_{x, y \sim U(\mathcal{O})}\big[\cos\big(f_l(x),\, f_l(y)\big)\big]$$


!14
$$\mathrm{SelfSim}_l(w) = \frac{1}{n^2 - n} \sum_{j} \sum_{k \neq j} \cos\big(f_l(s_j, i_j),\, f_l(s_k, i_k)\big)$$


!15
$$\mathrm{SelfSim}_l(w) = \frac{1}{n^2 - n} \sum_{j} \sum_{k \neq j} \cos\big(f_l(s_j, i_j),\, f_l(s_k, i_k)\big)$$


!16
$$\mathrm{SelfSim}_l(w) = \frac{1}{n^2 - n} \sum_{j} \sum_{k \neq j} \cos\big(f_l(s_j, i_j),\, f_l(s_k, i_k)\big)$$








!17
$$\mathrm{IntraSim}_l(s) = \frac{1}{n} \sum_{i} \cos\big(\vec{s_l},\, f_l(s, i)\big) \quad \text{where} \quad \vec{s_l} = \frac{1}{n} \sum_{i} f_l(s, i)$$


!18
$$\mathrm{IntraSim}_l(s) = \frac{1}{n} \sum_{i} \cos\big(\vec{s_l},\, f_l(s, i)\big) \quad \text{where} \quad \vec{s_l} = \frac{1}{n} \sum_{i} f_l(s, i)$$










!19
$$\mathrm{MEV}_l(w) = \frac{\sigma_1^2}{\sum_i \sigma_i^2}$$


!20
$$\mathrm{MEV}_l(w) = \frac{\sigma_1^2}{\sum_i \sigma_i^2}$$




!21
$$\mathrm{MEV}_l(w) = \frac{\sigma_1^2}{\sum_i \sigma_i^2}$$






!22


!23
!24

How Contextual are Contextualized Word Representations?