#腾讯云AI代码助手# 无须自己动手,使用腾讯云AI代码助手帮我实现自动扒取数据的需求,简单又快捷,自己手动小改下就能用了,给#腾讯云AI代码助手# 点赞!

package main
import (
	"encoding/json"
	"fmt"
	"log"
	"net/http"
	"sort"
	"strings"
	"sync"
	"time"
)
const (
// limit is the page size sent as the "limit" query parameter, and the step
// by which the "offset" query parameter advances between requests.
limit = 100
// maxResults is the number of entries printed after sorting; main stops
// collecting once it holds twice this many.
maxResults = 5000
)
// Model is one element of the JSON array decoded from the models endpoint.
type Model struct {
// ID is read from the "modelId" key.
ID string `json:"modelId"`
// Downloads is read from the "downloads" key.
Downloads int `json:"downloads"`
// License is never decoded from JSON; extractLicense fills it in.
License string `json:"-"`
// CardData holds the license found under the "cardData" key.
CardData struct {
License string `json:"license"`
} `json:"cardData"`
// Tags is searched by extractLicense for a "license:<name>" entry when
// CardData.License is empty.
Tags []string `json:"tags"`
}
// Dataset is one element of the JSON array decoded from the datasets endpoint.
type Dataset struct {
// ID is read from the "id" key.
ID string `json:"id"`
// Downloads is read from the "downloads" key.
Downloads int `json:"downloads"`
// License is never decoded from JSON; extractLicense fills it in.
License string `json:"-"`
// CardData holds the license found under the "cardData" key.
CardData struct {
License string `json:"license"`
} `json:"cardData"`
// Tags is searched by extractLicense for a "license:<name>" entry when
// CardData.License is empty.
Tags []string `json:"tags"`
}
// Result is the common record that both fetchers send to main.
type Result struct {
// ID is the identifier of the model or dataset.
ID string
// Type is "model" or "dataset", depending on which fetcher produced the record.
Type string
// Downloads is the download count; main sorts on it in descending order.
Downloads int
}
// main runs both fetchers concurrently, collects what they send, sorts the
// collected entries by download count in descending order, and prints at most
// maxResults of them.
func main() {
	results := make(chan Result)

	// The fetchers are the senders, so the channel is closed only once both of
	// them have returned. Closing it from main (the receiver) would either
	// never happen while the range below is still blocked, or panic a fetcher
	// that is in the middle of a send.
	var producers sync.WaitGroup
	producers.Add(2)
	go func() {
		defer producers.Done()
		fetchModels(results)
	}()
	go func() {
		defer producers.Done()
		fetchDatasets(results)
	}()
	go func() {
		producers.Wait()
		close(results)
	}()

	var items []Result
	for item := range results {
		items = append(items, item)
		// Stop early once twice maxResults entries have been collected. Any
		// fetcher still blocked on a send is abandoned; it ends when the
		// process exits after main returns.
		if len(items) >= maxResults*2 {
			break
		}
	}

	sort.Slice(items, func(i, j int) bool {
		return items[i].Downloads > items[j].Downloads
	})
	if len(items) > maxResults {
		items = items[:maxResults]
	}
	for _, item := range items {
		fmt.Printf("%s (%s) - Downloads: %d\n", item.ID, item.Type, item.Downloads)
	}
}
// fetchModels requests pages of limit entries from the models endpoint
// (sort=downloads, direction=-1) and sends a Result of Type "model" on results
// for every entry whose extracted license passes isValidLicense.
//
// It returns when a request fails, a response status is not 200, a body
// cannot be decoded, or a page is empty. Every failure is logged through
// handleError. The function never closes results, and each send blocks until
// the receiver takes the value.
func fetchModels(results chan<- Result) {
	client := &http.Client{Timeout: 10 * time.Second}
	// NOTE(review): paging relies on the endpoint honouring the offset query
	// parameter — confirm; if it is ignored, every request returns the same
	// page and this loop never reaches an empty one.
	for offset := 0; ; offset += limit {
		url := fmt.Sprintf("https://huggingface.co/api/models?search=license:mit+OR+license:apache-2.0&sort=downloads&direction=-1&limit=%d&offset=%d",
			limit, offset)
		resp, err := client.Get(url)
		if handleError(err, "models") {
			break
		}

		// A non-200 reply does not carry a model list. Report the status
		// itself instead of letting it surface as a JSON decode error.
		if resp.StatusCode != http.StatusOK {
			resp.Body.Close()
			handleError(fmt.Errorf("unexpected HTTP status %s", resp.Status), "models")
			break
		}

		var models []Model
		err = json.NewDecoder(resp.Body).Decode(&models)
		resp.Body.Close()
		if handleError(err, "models") {
			break
		}
		if len(models) == 0 {
			break
		}

		for i := range models {
			m := &models[i]
			m.extractLicense()
			if isValidLicense(m.License) {
				results <- Result{
					ID:        m.ID,
					Type:      "model",
					Downloads: m.Downloads,
				}
			}
		}

		// Pause between pages.
		time.Sleep(500 * time.Millisecond)
	}
}
// fetchDatasets requests pages of limit entries from the datasets endpoint
// (sort=downloads, direction=-1) and sends a Result of Type "dataset" on
// results for every entry whose extracted license passes isValidLicense.
//
// It returns when a request fails, a response status is not 200, a body
// cannot be decoded, or a page is empty. Every failure is logged through
// handleError. The function never closes results, and each send blocks until
// the receiver takes the value.
func fetchDatasets(results chan<- Result) {
	client := &http.Client{Timeout: 10 * time.Second}
	// NOTE(review): paging relies on the endpoint honouring the offset query
	// parameter — confirm; if it is ignored, every request returns the same
	// page and this loop never reaches an empty one.
	for offset := 0; ; offset += limit {
		url := fmt.Sprintf("https://huggingface.co/api/datasets?search=license:mit+OR+license:apache-2.0&sort=downloads&direction=-1&limit=%d&offset=%d",
			limit, offset)
		resp, err := client.Get(url)
		if handleError(err, "datasets") {
			break
		}

		// A non-200 reply does not carry a dataset list. Report the status
		// itself instead of letting it surface as a JSON decode error.
		if resp.StatusCode != http.StatusOK {
			resp.Body.Close()
			handleError(fmt.Errorf("unexpected HTTP status %s", resp.Status), "datasets")
			break
		}

		var datasets []Dataset
		err = json.NewDecoder(resp.Body).Decode(&datasets)
		resp.Body.Close()
		if handleError(err, "datasets") {
			break
		}
		if len(datasets) == 0 {
			break
		}

		for i := range datasets {
			d := &datasets[i]
			d.extractLicense()
			if isValidLicense(d.License) {
				results <- Result{
					ID:        d.ID,
					Type:      "dataset",
					Downloads: d.Downloads,
				}
			}
		}

		// Pause between pages.
		time.Sleep(500 * time.Millisecond)
	}
}
// extractLicense sets m.License to a lower-cased license name. The card data
// license is used when it is non-empty; otherwise the first tag that starts
// with "license:" supplies the name. If neither exists, m.License is left empty.
func (m *Model) extractLicense() {
	const tagPrefix = "license:"

	if declared := m.CardData.License; declared != "" {
		m.License = strings.ToLower(declared)
		return
	}

	// Nothing declared in the card data: fall back to the tags.
	m.License = ""
	for i := 0; i < len(m.Tags); i++ {
		if !strings.HasPrefix(m.Tags[i], tagPrefix) {
			continue
		}
		name := m.Tags[i][len(tagPrefix):]
		m.License = strings.ToLower(name)
		return
	}
}
// extractLicense sets d.License to a lower-cased license name. The card data
// license is used when it is non-empty; otherwise the first tag that starts
// with "license:" supplies the name. If neither exists, d.License is left empty.
func (d *Dataset) extractLicense() {
	const tagPrefix = "license:"

	if declared := d.CardData.License; declared != "" {
		d.License = strings.ToLower(declared)
		return
	}

	// Nothing declared in the card data: fall back to the tags.
	d.License = ""
	for i := 0; i < len(d.Tags); i++ {
		if !strings.HasPrefix(d.Tags[i], tagPrefix) {
			continue
		}
		name := d.Tags[i][len(tagPrefix):]
		d.License = strings.ToLower(name)
		return
	}
}
// isValidLicense reports whether license is exactly "mit" or "apache-2.0".
// The comparison is case-sensitive.
func isValidLicense(license string) bool {
	switch license {
	case "mit", "apache-2.0":
		return true
	default:
		return false
	}
}
// handleError reports whether err is non-nil. A non-nil err is logged together
// with the name of the resource that was being fetched; a nil err logs nothing.
func handleError(err error, resource string) bool {
	if err == nil {
		return false
	}
	log.Printf("Error fetching %s: %v", resource, err)
	return true
}
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。