nicka360 committed on
Commit
80e0dcf
·
1 Parent(s): 008c49a

Add warp.data.ImageLoader for HF Space runtime

Browse files
.gitignore CHANGED
@@ -1,7 +1,6 @@
1
 
2
  # keep warp lightweight
3
  warp/inference_outputs/
4
- warp/data/
5
  warp/**/checkpoints/
6
  warp/**/*.pt
7
  warp/**/*.pth
@@ -12,3 +11,17 @@ warp/**/*.npz
12
  *.zip
13
  *.7z
14
  *.tar
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
 
2
  # keep warp lightweight
3
  warp/inference_outputs/
 
4
  warp/**/checkpoints/
5
  warp/**/*.pt
6
  warp/**/*.pth
 
11
  *.zip
12
  *.7z
13
  *.tar
14
+
15
+ # keep warp.data lightweight in Space
16
+ warp/data/**/images/
17
+ warp/data/**/datasets/
18
+ warp/data/**/benchmarks/
19
+ warp/data/**/*.pt
20
+ warp/data/**/*.pth
21
+ warp/data/**/*.ckpt
22
+ warp/data/**/*.safetensors
23
+ warp/data/**/*.bin
24
+ warp/data/**/*.npz
25
+ warp/data/**/*.zip
26
+ warp/data/**/*.7z
27
+ warp/data/**/*.tar
warp/data/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """Data management for A360 WARP."""
2
+
3
+ from .image_loader import ImageLoader, list_practices
4
+
5
+ __all__ = ["ImageLoader", "list_practices"]
warp/data/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (360 Bytes). View file
 
warp/data/__pycache__/image_loader.cpython-313.pyc ADDED
Binary file (9.18 kB). View file
 
warp/data/image_loader.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Image loader for scraped medical practice images."""
2
+
3
+ from pathlib import Path
4
+
5
+ from PIL import Image
6
+
7
+ # Default scraped images directory (now under top-level data/scrapedimages)
8
+ DEFAULT_SCRAPED_IMAGES_DIR = Path(__file__).parent.parent.parent / "data" / "scrapedimages"
9
+
10
+
11
+ class ImageLoader:
12
+ """Load and manage scraped medical practice images.
13
+
14
+ Attributes:
15
+ base_path: Root directory containing scraped images organized by practice
16
+ practices: List of available practice directories
17
+ """
18
+
19
+ def __init__(self, base_path: Path | str | None = None):
20
+ """Initialize the ImageLoader.
21
+
22
+ Args:
23
+ base_path: Root directory containing scraped images.
24
+ Defaults to project's scrapedimages folder.
25
+ """
26
+ if base_path is None:
27
+ self.base_path = DEFAULT_SCRAPED_IMAGES_DIR
28
+ else:
29
+ self.base_path = Path(base_path)
30
+
31
+ if not self.base_path.exists():
32
+ raise ValueError(f"Image directory does not exist: {self.base_path}")
33
+
34
+ self._practices: list[str] | None = None
35
+
36
+ @property
37
+ def practices(self) -> list[str]:
38
+ """Get list of available practice directories.
39
+
40
+ Returns:
41
+ List of practice directory names (e.g., ['drleedy.com', 'drbirely.com'])
42
+ """
43
+ if self._practices is None:
44
+ self._practices = sorted(
45
+ [
46
+ d.name
47
+ for d in self.base_path.iterdir()
48
+ if d.is_dir() and not d.name.startswith(".")
49
+ ]
50
+ )
51
+ return self._practices
52
+
53
+ def get_practice_path(self, practice_name: str) -> Path:
54
+ """Get the full path to a practice directory.
55
+
56
+ Args:
57
+ practice_name: Name of the practice (e.g., 'drleedy.com')
58
+
59
+ Returns:
60
+ Path object pointing to the practice directory
61
+
62
+ Raises:
63
+ ValueError: If practice does not exist
64
+ """
65
+ practice_path = self.base_path / practice_name
66
+ if not practice_path.exists():
67
+ raise ValueError(
68
+ f"Practice '{practice_name}' not found. "
69
+ f"Available practices: {', '.join(self.practices)}"
70
+ )
71
+ return practice_path
72
+
73
+ def list_images(self, practice_name: str, extensions: list[str] | None = None) -> list[Path]:
74
+ """List all images for a given practice.
75
+
76
+ Args:
77
+ practice_name: Name of the practice
78
+ extensions: List of file extensions to filter (e.g., ['.jpg', '.png'])
79
+ If None, includes common image formats
80
+
81
+ Returns:
82
+ List of Path objects for all matching images
83
+ """
84
+ if extensions is None:
85
+ extensions = [".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp"]
86
+
87
+ practice_path = self.get_practice_path(practice_name)
88
+ images: list[Path] = []
89
+
90
+ for ext in extensions:
91
+ images.extend(practice_path.glob(f"**/*{ext}"))
92
+ images.extend(practice_path.glob(f"**/*{ext.upper()}"))
93
+
94
+ return sorted(images)
95
+
96
+ def count_images(self, practice_name: str) -> int:
97
+ """Count total images for a practice.
98
+
99
+ Args:
100
+ practice_name: Name of the practice
101
+
102
+ Returns:
103
+ Number of images
104
+ """
105
+ return len(self.list_images(practice_name))
106
+
107
+ def load_image(self, image_path: Path | str) -> Image.Image:
108
+ """Load a single image.
109
+
110
+ Args:
111
+ image_path: Path to the image file
112
+
113
+ Returns:
114
+ PIL Image object
115
+
116
+ Raises:
117
+ FileNotFoundError: If image does not exist
118
+ """
119
+ image_path = Path(image_path)
120
+ if not image_path.exists():
121
+ raise FileNotFoundError(f"Image not found: {image_path}")
122
+
123
+ return Image.open(image_path)
124
+
125
+ def get_image_info(self, image_path: Path | str) -> dict:
126
+ """Get metadata about an image.
127
+
128
+ Args:
129
+ image_path: Path to the image file
130
+
131
+ Returns:
132
+ Dictionary with image metadata (size, format, mode, etc.)
133
+ """
134
+ image_path = Path(image_path)
135
+ img = self.load_image(image_path)
136
+
137
+ return {
138
+ "path": str(image_path),
139
+ "filename": image_path.name,
140
+ "practice": (
141
+ image_path.parent.name if image_path.is_relative_to(self.base_path) else None
142
+ ),
143
+ "size": img.size,
144
+ "width": img.width,
145
+ "height": img.height,
146
+ "format": img.format,
147
+ "mode": img.mode,
148
+ "file_size_bytes": image_path.stat().st_size,
149
+ }
150
+
151
+ def get_random_images(
152
+ self, practice_name: str, n: int = 5, seed: int | None = None
153
+ ) -> list[Path]:
154
+ """Get random sample of images from a practice.
155
+
156
+ Args:
157
+ practice_name: Name of the practice
158
+ n: Number of images to return
159
+ seed: Random seed for reproducibility
160
+
161
+ Returns:
162
+ List of n random image paths
163
+ """
164
+ import random
165
+
166
+ images = self.list_images(practice_name)
167
+
168
+ if seed is not None:
169
+ random.seed(seed)
170
+
171
+ return random.sample(images, min(n, len(images)))
172
+
173
+ def get_practice_stats(self, practice_name: str) -> dict:
174
+ """Get statistics for a practice's images.
175
+
176
+ Args:
177
+ practice_name: Name of the practice
178
+
179
+ Returns:
180
+ Dictionary with practice statistics
181
+ """
182
+ images = self.list_images(practice_name)
183
+ total_size = sum(img.stat().st_size for img in images)
184
+
185
+ # Get format distribution
186
+ formats: dict[str, int] = {}
187
+ for img_path in images:
188
+ ext = img_path.suffix.lower()
189
+ formats[ext] = formats.get(ext, 0) + 1
190
+
191
+ return {
192
+ "practice": practice_name,
193
+ "total_images": len(images),
194
+ "total_size_mb": total_size / (1024 * 1024),
195
+ "formats": formats,
196
+ "practice_path": str(self.get_practice_path(practice_name)),
197
+ }
198
+
199
+ def get_all_stats(self) -> dict:
200
+ """Get statistics for all practices.
201
+
202
+ Returns:
203
+ Dictionary with overall statistics
204
+ """
205
+ all_stats: dict = {"practices": {}, "total_images": 0, "total_size_mb": 0.0}
206
+
207
+ for practice in self.practices:
208
+ practice_stats = self.get_practice_stats(practice)
209
+ all_stats["practices"][practice] = practice_stats
210
+ all_stats["total_images"] += practice_stats["total_images"]
211
+ all_stats["total_size_mb"] += practice_stats["total_size_mb"]
212
+
213
+ return all_stats
214
+
215
+
216
def list_practices(base_path: Path | str | None = None) -> list[str]:
    """List the practice directory names found under an image root.

    Shortcut for building an ImageLoader and reading its ``practices``.

    Args:
        base_path: Root directory containing scraped images.
            Defaults to project's scrapedimages folder.

    Returns:
        List of practice directory names
    """
    return ImageLoader(base_path).practices