richardyoung committed
Commit bf89a26 · verified · Parent: ea7b37a

Initial upload: Best performing cardiology embedding model (separation: 0.510)

README.md ADDED
@@ -0,0 +1,72 @@
---
library_name: peft
base_model: michiyasunaga/BioLinkBERT-large
tags:
- medical
- cardiology
- embeddings
- domain-adaptation
- lora
- sentence-transformers
- sentence-similarity
language:
- en
license: apache-2.0
---

# CardioEmbed-BioLinkBERT

**Domain-specialized cardiology text embeddings using LoRA-adapted BioLinkBERT-large**

This is the **best-performing model** from our comparative study of 10 embedding architectures for clinical cardiology.

## Performance

| Metric | Score |
|--------|-------|
| Separation Score | **0.510** |
| Similar Pair Avg | 0.811 |
| Different Pair Avg | 0.301 |
| Throughput | 143.5 emb/sec |
| Memory | 1.51 GB |

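The separation score is the gap between the two pair averages above (0.811 − 0.301 = 0.510); the wider the gap, the more cleanly the embeddings distinguish related from unrelated cardiology text. A minimal sketch of how a score of this form can be computed from precomputed embedding pairs (the pairing scheme here is an assumption for illustration, not the study's evaluation code):

```python
import torch
import torch.nn.functional as F

def separation_score(similar_pairs, different_pairs):
    """Mean cosine similarity over similar pairs minus the mean over different pairs.

    Each pair is a tuple (a, b) of 1-D embedding tensors.
    """
    sim = torch.stack([F.cosine_similarity(a, b, dim=0) for a, b in similar_pairs])
    diff = torch.stack([F.cosine_similarity(a, b, dim=0) for a, b in different_pairs])
    return (sim.mean() - diff.mean()).item()
```
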
## Usage

```python
import torch
from transformers import AutoModel, AutoTokenizer
from peft import PeftModel

# Load base model and tokenizer
base_model = AutoModel.from_pretrained("michiyasunaga/BioLinkBERT-large")
tokenizer = AutoTokenizer.from_pretrained("michiyasunaga/BioLinkBERT-large")

# Load the LoRA adapter on top of the base model
model = PeftModel.from_pretrained(base_model, "richardyoung/CardioEmbed-BioLinkBERT")
model.eval()

# Generate an embedding via mean pooling over the final hidden states
text = "Atrial fibrillation with rapid ventricular response"
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():
    outputs = model(**inputs)
embeddings = outputs.last_hidden_state.mean(dim=1)
```

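With a batch of padded inputs, plain `mean(dim=1)` also averages over `[PAD]` positions. A common refinement, shown here as a sketch rather than as the exact pooling used in our evaluation, is to weight the mean by the attention mask (reusing `model` and `tokenizer` from above):

```python
texts = [
    "Atrial fibrillation with rapid ventricular response",
    "Anterior ST-elevation myocardial infarction",
]
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():
    hidden = model(**inputs).last_hidden_state
# Zero out padding positions, then divide by the number of real tokens per text
mask = inputs["attention_mask"].unsqueeze(-1).float()
embeddings = (hidden * mask).sum(dim=1) / mask.sum(dim=1)
```
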
## Training

- **Training Data**: 106,535 cardiology text pairs from medical textbooks
- **Method**: LoRA fine-tuning (r=16, alpha=32)
- **Loss**: Multiple Negatives Ranking Loss (InfoNCE); see the sketch below

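A minimal sketch of this setup, assuming a plain PyTorch contrastive loop with in-batch negatives; the hyperparameters match `training_config.json`, but the function names, loop structure, and temperature are illustrative, not our exact training code:

```python
import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer
from peft import LoraConfig, get_peft_model

base = AutoModel.from_pretrained("michiyasunaga/BioLinkBERT-large")
tokenizer = AutoTokenizer.from_pretrained("michiyasunaga/BioLinkBERT-large")
model = get_peft_model(base, LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.1,
    target_modules=["query", "key", "value"],
    task_type="FEATURE_EXTRACTION",
))

def embed(texts):
    # Attention-mask-weighted mean pooling over the final hidden states
    batch = tokenizer(texts, return_tensors="pt", padding=True,
                      truncation=True, max_length=512)
    hidden = model(**batch).last_hidden_state
    mask = batch["attention_mask"].unsqueeze(-1).float()
    return (hidden * mask).sum(dim=1) / mask.sum(dim=1)

def info_nce_loss(anchors, positives, temperature=0.05):
    # Each anchor's positive is the matching row; every other row in the
    # batch serves as an in-batch negative (Multiple Negatives Ranking Loss).
    a = F.normalize(embed(anchors), dim=-1)
    p = F.normalize(embed(positives), dim=-1)
    logits = a @ p.T / temperature
    labels = torch.arange(len(anchors))
    return F.cross_entropy(logits, labels)
```

With `batch_size` 64 and no hard negatives, each anchor is scored against its one positive and 63 in-batch negatives per step.
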
## Citation

```bibtex
@article{young2024comparative,
  title={Comparative Analysis of LoRA-Adapted Embedding Models for Clinical Cardiology Text Representation},
  author={Young, Richard J and Matthews, Alice M},
  journal={arXiv preprint},
  year={2024}
}
```

## Related Models

This model is part of the CardioEmbed family. See [richardyoung/CardioEmbed](https://huggingface.co/richardyoung/CardioEmbed) for the other models.
adapter_config.json ADDED
@@ -0,0 +1,42 @@
{
  "alora_invocation_tokens": null,
  "alpha_pattern": {},
  "arrow_config": null,
  "auto_mapping": null,
  "base_model_name_or_path": "michiyasunaga/BioLinkBERT-large",
  "bias": "none",
  "corda_config": null,
  "ensure_weight_tying": false,
  "eva_config": null,
  "exclude_modules": null,
  "fan_in_fan_out": false,
  "inference_mode": true,
  "init_lora_weights": true,
  "layer_replication": null,
  "layers_pattern": null,
  "layers_to_transform": null,
  "loftq_config": {},
  "lora_alpha": 32,
  "lora_bias": false,
  "lora_dropout": 0.1,
  "megatron_config": null,
  "megatron_core": "megatron.core",
  "modules_to_save": null,
  "peft_type": "LORA",
  "peft_version": "0.18.0",
  "qalora_group_size": 16,
  "r": 16,
  "rank_pattern": {},
  "revision": null,
  "target_modules": [
    "value",
    "key",
    "query"
  ],
  "target_parameters": null,
  "task_type": "FEATURE_EXTRACTION",
  "trainable_token_indices": null,
  "use_dora": false,
  "use_qalora": false,
  "use_rslora": false
}
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:93a2689f97321200ced08a2389c4fcf79337df643456168667469041de7faddf
size 9456904
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
{
  "cls_token": {
    "content": "[CLS]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "mask_token": {
    "content": "[MASK]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "[PAD]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "sep_token": {
    "content": "[SEP]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "[UNK]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "[PAD]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "[UNK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "[CLS]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "[SEP]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "4": {
      "content": "[MASK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "clean_up_tokenization_spaces": false,
  "cls_token": "[CLS]",
  "do_basic_tokenize": true,
  "do_lower_case": true,
  "extra_special_tokens": {},
  "mask_token": "[MASK]",
  "model_max_length": 1000000000000000019884624838656,
  "never_split": null,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "BertTokenizer",
  "unk_token": "[UNK]"
}
training_config.json ADDED
@@ -0,0 +1,18 @@
{
  "name": "BioLinkBERT-large",
  "model_id": "michiyasunaga/BioLinkBERT-large",
  "batch_size": 64,
  "gradient_accumulation_steps": 1,
  "learning_rate": 2e-05,
  "epochs": 3,
  "max_seq_length": 512,
  "lora_r": 16,
  "lora_alpha": 32,
  "use_8bit": false,
  "target_modules": [
    "query",
    "key",
    "value"
  ],
  "output_dir": "biolinkbert_cardiology"
}
vocab.txt ADDED
The diff for this file is too large to render. See raw diff