Language statistics bar for repositories (#8037)
* Implementation for calculating language statistics Implement saving code language statistics to database Implement rendering language stats Add primary language to show in repository list Implement repository stats indexer queue Add indexer test Refactor to use queue module * Do not timeout for queues
This commit is contained in:
parent
37892be635
commit
ad2642a8aa
89 changed files with 182950 additions and 57 deletions
107
vendor/github.com/src-d/enry/v2/classifier.go
generated
vendored
Normal file
107
vendor/github.com/src-d/enry/v2/classifier.go
generated
vendored
Normal file
|
@ -0,0 +1,107 @@
|
|||
package enry
|
||||
|
||||
import (
|
||||
"math"
|
||||
"sort"
|
||||
|
||||
"github.com/src-d/enry/v2/internal/tokenizer"
|
||||
)
|
||||
|
||||
// Classifier detects which languages a piece of content may be written in,
// choosing among a set of candidates.
//
// The candidates map is keyed by language name; its values are weights that
// callers can supply per call. The returned slice holds the possible
// languages for the given content.
type Classifier interface {
	Classify(content []byte, candidates map[string]float64) (languages []string)
}
|
||||
|
||||
// classifier holds the log-probability tables that Classify uses to score
// languages.
type classifier struct {
	// languagesLogProbabilities maps a language name to the base score
	// Classify starts from for that language.
	languagesLogProbabilities map[string]float64
	// tokensLogProbabilities maps a language name to a per-token table of
	// log probabilities, consulted by tokenProbability.
	tokensLogProbabilities map[string]map[string]float64
	// tokensTotal is the divisor used for the fallback log probability
	// math.Log(1 / tokensTotal) given to tokens missing from a language's table.
	tokensTotal float64
}
|
||||
|
||||
// scoredLanguage pairs a language name with the score Classify computed for it.
type scoredLanguage struct {
	// language is the name of the scored language.
	language string
	// score is the accumulated log probability; higher sorts first.
	score float64
}
|
||||
|
||||
// Classify returns a sorted slice of possible languages sorted by decreasing language's probability
|
||||
func (c *classifier) Classify(content []byte, candidates map[string]float64) []string {
|
||||
|
||||
var languages map[string]float64
|
||||
if len(candidates) == 0 {
|
||||
languages = c.knownLangs()
|
||||
} else {
|
||||
languages = make(map[string]float64, len(candidates))
|
||||
for candidate, weight := range candidates {
|
||||
if lang, ok := GetLanguageByAlias(candidate); ok {
|
||||
candidate = lang
|
||||
}
|
||||
|
||||
languages[candidate] = weight
|
||||
}
|
||||
}
|
||||
|
||||
empty := len(content) == 0
|
||||
scoredLangs := make([]*scoredLanguage, 0, len(languages))
|
||||
|
||||
var tokens []string
|
||||
if !empty {
|
||||
tokens = tokenizer.Tokenize(content)
|
||||
}
|
||||
|
||||
for language := range languages {
|
||||
score := c.languagesLogProbabilities[language]
|
||||
if !empty {
|
||||
score += c.tokensLogProbability(tokens, language)
|
||||
}
|
||||
scoredLangs = append(scoredLangs, &scoredLanguage{
|
||||
language: language,
|
||||
score: score,
|
||||
})
|
||||
}
|
||||
|
||||
return sortLanguagesByScore(scoredLangs)
|
||||
}
|
||||
|
||||
func sortLanguagesByScore(scoredLangs []*scoredLanguage) []string {
|
||||
sort.Stable(byScore(scoredLangs))
|
||||
sortedLanguages := make([]string, 0, len(scoredLangs))
|
||||
for _, scoredLang := range scoredLangs {
|
||||
sortedLanguages = append(sortedLanguages, scoredLang.language)
|
||||
}
|
||||
|
||||
return sortedLanguages
|
||||
}
|
||||
|
||||
func (c *classifier) knownLangs() map[string]float64 {
|
||||
langs := make(map[string]float64, len(c.languagesLogProbabilities))
|
||||
for lang := range c.languagesLogProbabilities {
|
||||
langs[lang]++
|
||||
}
|
||||
|
||||
return langs
|
||||
}
|
||||
|
||||
func (c *classifier) tokensLogProbability(tokens []string, language string) float64 {
|
||||
var sum float64
|
||||
for _, token := range tokens {
|
||||
sum += c.tokenProbability(token, language)
|
||||
}
|
||||
|
||||
return sum
|
||||
}
|
||||
|
||||
func (c *classifier) tokenProbability(token, language string) float64 {
|
||||
tokenProb, ok := c.tokensLogProbabilities[language][token]
|
||||
if !ok {
|
||||
tokenProb = math.Log(1.000000 / c.tokensTotal)
|
||||
}
|
||||
|
||||
return tokenProb
|
||||
}
|
||||
|
||||
type byScore []*scoredLanguage
|
||||
|
||||
func (b byScore) Len() int { return len(b) }
|
||||
func (b byScore) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
|
||||
func (b byScore) Less(i, j int) bool { return b[j].score < b[i].score }
|
Loading…
Add table
Add a link
Reference in a new issue