1 /* This file is part of the Vc library.
3 Copyright (C) 2009-2012 Matthias Kretz <kretz@kde.org>
5 Vc is free software: you can redistribute it and/or modify
6 it under the terms of the GNU Lesser General Public License as
7 published by the Free Software Foundation, either version 3 of
8 the License, or (at your option) any later version.
10 Vc is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with Vc. If not, see <http://www.gnu.org/licenses/>.
20 #ifndef VC_COMMON_MEMORY_H
21 #define VC_COMMON_MEMORY_H
23 #include "memorybase.h"
27 #include "memoryfwd.h"
34 * Allocates memory on the Heap with alignment and padding.
36 * Memory that was allocated with this function must be released with Vc::free! Other methods might
37 * work but are not portable.
39 * \param n Specifies the number of scalar values the allocated memory must be able to store.
41 * \return Pointer to memory of the requested type and size, or 0 on error.
43 * \warning The standard malloc function specifies the number of bytes to allocate, whereas this
44 * function specifies the number of values; the two therefore differ by a factor of sizeof(T).
49 * \headerfile memory.h <Vc/Memory>
51 template<typename T, Vc::MallocAlignment A>
52 inline ALWAYS_INLINE_L T *ALWAYS_INLINE_R malloc(size_t n)
54 return static_cast<T *>(Internal::Helper::malloc<A>(n * sizeof(T)));
58 * Frees memory that was allocated with Vc::malloc.
61 * \headerfile memory.h <Vc/Memory>
// Releases p by forwarding it unchanged to Internal::Helper::free.
// NOTE(review): T is not declared on the lines visible here; the template header is
// presumably on an elided line - confirm.
64 inline void ALWAYS_INLINE free(T *p)
66 Internal::Helper::free(p);
// Compile-time helper: computes PaddedSize, i.e. Size rounded up to a multiple of Alignment.
69 template<typename V, size_t Size> struct _MemorySizeCalculation
71 enum AlignmentCalculations {
// NOTE(review): `Alignment` is not defined on the lines visible here (presumably V::Size -
// TODO confirm). The mask arithmetic below only rounds correctly for a power-of-two Alignment.
73 AlignmentMask = Alignment - 1,
// Remainder of Size with respect to Alignment (for a power-of-two Alignment).
74 MaskedSize = Size & AlignmentMask,
// Number of entries missing up to the next multiple of Alignment.
75 Padding = Alignment - MaskedSize,
// Size itself when it is already a multiple of Alignment, otherwise Size plus Padding.
76 PaddedSize = MaskedSize == 0 ? Size : Size + Padding
82 * \headerfile memory.h <Vc/Memory>
84 * A helper class for fixed-size two-dimensional arrays.
86 * \param V The vector type you want to operate on. (e.g. float_v or uint_v)
87 * \param Size1 Number of rows
88 * \param Size2 Number of columns
90 template<typename V, size_t Size1, size_t Size2> class Memory : public VectorAlignedBaseT<V>, public MemoryBase<V, Memory<V, Size1, Size2>, 2, Memory<V, Size2> >
93 typedef typename V::EntryType EntryType;
95 typedef MemoryBase<V, Memory<V, Size1, Size2>, 2, Memory<V, Size2> > Base;
96 friend class MemoryBase<V, Memory<V, Size1, Size2>, 2, Memory<V, Size2> >;
97 friend class MemoryDimensionBase<V, Memory<V, Size1, Size2>, 2, Memory<V, Size2> >;
98 enum InternalConstants {
99 PaddedSize2 = _MemorySizeCalculation<V, Size2>::PaddedSize
101 #if defined(VC_ICC) && defined(_WIN32)
102 __declspec(align(__alignof(VectorAlignedBaseT<V>)))
103 #elif defined(VC_CLANG)
104 __attribute__((aligned(__alignof(VectorAlignedBaseT<V>))))
106 EntryType m_mem[Size1][PaddedSize2];
111 VectorsCount = PaddedSize2 / V::Size
115 * \return the number of rows in the array.
117 * \note This function can be eliminated by an optimizing compiler.
119 inline size_t rowsCount() const { return RowCount; }
121 * \return the number of scalar entries in the whole array.
123 * \warning Do not use this function for scalar iteration over the array since there will be
124 * padding between rows if \c Size2 is not divisible by \c V::Size.
126 * \note This function can be optimized into a compile-time constant.
128 inline size_t entriesCount() const { return Size1 * Size2; }
130 * \return the number of vectors in the whole array.
132 * \note This function can be optimized into a compile-time constant.
134 inline size_t vectorsCount() const { return VectorsCount * Size1; }
137 * Copies the data from a different object.
139 * \param rhs The object to copy the data from.
141 * \return reference to the modified Memory object.
143 * \note Both objects must have the exact same vectorsCount().
// Copies the contents of another two-dimensional memory object into this one.
// The size match is only verified through assert, so it is not checked when NDEBUG is defined.
145 template<typename Parent, typename RM>
146 inline Memory &operator=(const MemoryBase<V, Parent, 2, RM> &rhs) {
147 assert(vectorsCount() == rhs.vectorsCount());
// Raw byte copy of vectorsCount() * sizeof(V) bytes from rhs.m_mem.
// NOTE(review): this covers the complete padded array only if
// sizeof(V) == V::Size * sizeof(EntryType) - confirm.
148 std::memcpy(m_mem, rhs.m_mem, vectorsCount() * sizeof(V));
152 * Initialize all data with the given vector.
154 * \param v This vector will be used to initialize the memory.
156 * \return reference to the modified Memory object.
158 inline Memory &operator=(const V &v) {
159 for (size_t i = 0; i < vectorsCount(); ++i) {
165 #if defined(VC_ICC) && VC_ICC < 20120212 && !defined(_WIN32)
166 __attribute__((__aligned__(__alignof(VectorAlignedBaseT<V>))))
171 * A helper class to simplify usage of correctly aligned and padded memory, allowing both vector and
176 Vc::Memory<int_v, 11> array;
179 for (size_t i = 0; i < array.entriesCount(); ++i) {
180 int x = array[i]; // read
181 array[i] = x; // write
183 // more explicit alternative:
184 for (size_t i = 0; i < array.entriesCount(); ++i) {
185 int x = array.scalar(i); // read
186 array.scalar(i) = x; // write
190 for (size_t i = 0; i < array.vectorsCount(); ++i) {
191 int_v x = array.vector(i); // read
192 array.vector(i) = x; // write
195 * This code allocates a small array and implements three equivalent loops (that do nothing useful).
196 * The loops show how scalar and vector read/write access is best implemented.
198 * Since the size of 11 is not a multiple of int_v::Size (unless you use the
199 * scalar Vc implementation) the last write access of the vector loop would normally be out of
200 * bounds. But the Memory class automatically pads the memory such that the whole array can be
201 * accessed with correctly aligned memory addresses.
203 * \param V The vector type you want to operate on. (e.g. float_v or uint_v)
204 * \param Size The number of entries of the scalar base type the memory should hold. This
205 * is thus the same number as you would use for a normal C array (e.g. float mem[11] becomes
206 * Memory<float_v, 11> mem).
211 * \headerfile memory.h <Vc/Memory>
213 template<typename V, size_t Size> class Memory<V, Size, 0u> : public VectorAlignedBaseT<V>, public MemoryBase<V, Memory<V, Size, 0u>, 1, void>
216 typedef typename V::EntryType EntryType;
218 typedef MemoryBase<V, Memory<V, Size, 0u>, 1, void> Base;
219 friend class MemoryBase<V, Memory<V, Size, 0u>, 1, void>;
220 friend class MemoryDimensionBase<V, Memory<V, Size, 0u>, 1, void>;
221 enum InternalConstants {
223 AlignmentMask = Alignment - 1,
224 MaskedSize = Size & AlignmentMask,
225 Padding = Alignment - MaskedSize,
226 PaddedSize = MaskedSize == 0 ? Size : Size + Padding
228 #if defined(__INTEL_COMPILER) && defined(_WIN32)
229 __declspec(align(__alignof(VectorAlignedBaseT<V>)))
230 #elif defined(VC_CLANG)
231 __attribute__((aligned(__alignof(VectorAlignedBaseT<V>))))
233 EntryType m_mem[PaddedSize];
238 VectorsCount = PaddedSize / V::Size
242 * \return the number of scalar entries in the whole array.
244 * \note This function can be optimized into a compile-time constant.
246 inline size_t entriesCount() const { return EntriesCount; }
249 * \return the number of vectors in the whole array.
251 * \note This function can be optimized into a compile-time constant.
253 inline size_t vectorsCount() const { return VectorsCount; }
255 template<typename Parent, typename RM>
256 inline Memory<V> &operator=(const MemoryBase<V, Parent, 1, RM> &rhs) {
257 assert(vectorsCount() == rhs.vectorsCount());
258 std::memcpy(m_mem, rhs.m_mem, entriesCount() * sizeof(EntryType));
261 inline Memory<V> &operator=(const EntryType *rhs) {
262 std::memcpy(m_mem, rhs, entriesCount() * sizeof(EntryType));
265 inline Memory &operator=(const V &v) {
266 for (size_t i = 0; i < vectorsCount(); ++i) {
272 #if defined(VC_ICC) && VC_ICC < 20120212 && !defined(_WIN32)
273 __attribute__((__aligned__(__alignof(VectorAlignedBaseT<V>)) ))
278 * A helper class that is very similar to Memory<V, Size> but with dynamically allocated memory and
284 Vc::Memory<int_v> array(size);
287 for (size_t i = 0; i < array.entriesCount(); ++i) {
292 for (size_t i = 0; i < array.vectorsCount(); ++i) {
293 array.vector(i) = int_v::IndexesFromZero() + i * int_v::Size;
296 * This code allocates a small array with 11 scalar entries
297 * and implements two equivalent loops that initialize the memory.
298 * The scalar loop writes each individual int. The vectorized loop writes int_v::Size values to
299 * memory per iteration. Since the size of 11 is not a multiple of int_v::Size (unless you use the
300 * scalar Vc implementation) the last write access of the vector loop would normally be out of
301 * bounds. But the Memory class automatically pads the memory such that the whole array can be
302 * accessed with correctly aligned memory addresses.
303 * (Note: the scalar loop can be auto-vectorized, except for the last three assignments.)
305 * \note The internal data pointer is not declared with the \c __restrict__ keyword. Therefore
306 * modifying memory of V::EntryType will require the compiler to assume aliasing. If you want to use
307 * the \c __restrict__ keyword you need to use a standard pointer to memory and do the vector
308 * address calculation and loads and stores manually.
310 * \param V The vector type you want to operate on. (e.g. float_v or uint_v)
312 * \see Memory<V, Size>
315 * \headerfile memory.h <Vc/Memory>
317 template<typename V> class Memory<V, 0u, 0u> : public MemoryBase<V, Memory<V, 0u, 0u>, 1, void>
320 typedef typename V::EntryType EntryType;
322 typedef MemoryBase<V, Memory<V>, 1, void> Base;
323 friend class MemoryBase<V, Memory<V>, 1, void>;
324 friend class MemoryDimensionBase<V, Memory<V>, 1, void>;
325 enum InternalConstants {
327 AlignmentMask = Alignment - 1
329 size_t m_entriesCount;
330 size_t m_vectorsCount;
332 size_t calcPaddedEntriesCount(size_t x)
334 size_t masked = x & AlignmentMask;
335 return (masked == 0 ? x : x + (Alignment - masked));
341 * Allocate enough memory to access \p size values of type \p V::EntryType.
343 * The allocated memory is aligned and padded correctly for fully vectorized access.
345 * \param size Determines how many scalar values will fit into the allocated memory.
// Allocates storage for `size` scalar entries, rounded up by calcPaddedEntriesCount().
347 inline Memory(size_t size)
348 : m_entriesCount(size),
// m_vectorsCount temporarily holds the padded *entry* count; the constructor body converts
// it into a vector count.
349 m_vectorsCount(calcPaddedEntriesCount(m_entriesCount)),
// Reads m_vectorsCount, so it relies on both counters being initialized before m_mem.
// NOTE(review): the declaration of m_mem is not visible here - confirm it follows the counters.
350 m_mem(Vc::malloc<EntryType, Vc::AlignOnVector>(m_vectorsCount))
// Convert the padded entry count into the number of vectors.
352 m_vectorsCount /= V::Size;
356 * Copy the memory into a new memory area.
358 * The allocated memory is aligned and padded correctly for fully vectorized access.
360 * \param rhs The Memory object to copy from.
// Constructs a copy of any one-dimensional MemoryBase object: takes over both counters from
// rhs, allocates rhs.vectorsCount() * V::Size entries and copies entriesCount() entries.
362 template<typename Parent, typename RM>
363 inline Memory(const MemoryBase<V, Parent, 1, RM> &rhs)
364 : m_entriesCount(rhs.entriesCount()),
365 m_vectorsCount(rhs.vectorsCount()),
366 m_mem(Vc::malloc<EntryType, Vc::AlignOnVector>(m_vectorsCount * V::Size))
// NOTE(review): Vc::malloc is documented to return 0 on error; the result is passed to
// memcpy without a check.
368 std::memcpy(m_mem, rhs.m_mem, entriesCount() * sizeof(EntryType));
372 * Overload of the above function.
374 * (Needed because a constructor template is never used as the copy constructor; without this overload C++ would use the implicitly generated copy constructor instead.)
376 * \param rhs The Memory object to copy from.
// Copy constructor: takes over both counters from rhs, allocates
// rhs.vectorsCount() * V::Size entries and copies entriesCount() entries into the new storage.
378 inline Memory(const Memory<V, 0u> &rhs)
379 : m_entriesCount(rhs.entriesCount()),
380 m_vectorsCount(rhs.vectorsCount()),
381 m_mem(Vc::malloc<EntryType, Vc::AlignOnVector>(m_vectorsCount * V::Size))
// NOTE(review): Vc::malloc is documented to return 0 on error; the result is passed to
// memcpy without a check.
383 std::memcpy(m_mem, rhs.m_mem, entriesCount() * sizeof(EntryType));
387 * Frees the memory which was allocated in the constructor.
389 inline ALWAYS_INLINE ~Memory()
395 * Swap the contents and size information of two Memory objects.
397 * \param rhs The other Memory object to swap.
399 inline void swap(Memory &rhs) {
400 std::swap(m_mem, rhs.m_mem);
401 std::swap(m_entriesCount, rhs.m_entriesCount);
402 std::swap(m_vectorsCount, rhs.m_vectorsCount);
406 * \return the number of scalar entries in the whole array.
408 inline size_t entriesCount() const { return m_entriesCount; }
411 * \return the number of vectors in the whole array.
413 inline size_t vectorsCount() const { return m_vectorsCount; }
416 * Overwrite all entries with the values stored in \p rhs.
418 * \param rhs The object to copy the data from.
420 * \return reference to the modified Memory object.
422 * \note this function requires the vectorsCount() of both Memory objects to be equal.
424 template<typename Parent, typename RM>
425 inline Memory<V> &operator=(const MemoryBase<V, Parent, 1, RM> &rhs) {
426 assert(vectorsCount() == rhs.vectorsCount());
427 std::memcpy(m_mem, rhs.m_mem, entriesCount() * sizeof(EntryType));
432 * Overwrite all entries with the values stored in the memory at \p rhs.
434 * \param rhs The array to copy the data from.
436 * \return reference to the modified Memory object.
438 * \note this function requires that there are entriesCount() many values accessible from \p rhs.
// Overwrites the array with entriesCount() values read from the plain array rhs (raw byte copy).
// rhs must point to at least entriesCount() readable values; this is not checked here.
440 inline Memory<V> &operator=(const EntryType *rhs) {
441 std::memcpy(m_mem, rhs, entriesCount() * sizeof(EntryType));
447 * Prefetch the cacheline containing \p addr for a single read access.
449 * This prefetch completely bypasses the cache, not evicting any other data.
451 * \param addr The cacheline containing \p addr will be prefetched.
454 * \headerfile memory.h <Vc/Memory>
456 inline void ALWAYS_INLINE prefetchForOneRead(const void *addr)
458 Internal::Helper::prefetchForOneRead(addr);
462 * Prefetch the cacheline containing \p addr for modification.
464 * This prefetch evicts data from the cache. So use it only for data you really will use. When the
465 * target system supports it the cacheline will be marked as modified while prefetching, saving work
468 * \param addr The cacheline containing \p addr will be prefetched.
471 * \headerfile memory.h <Vc/Memory>
473 inline void ALWAYS_INLINE prefetchForModify(const void *addr)
475 Internal::Helper::prefetchForModify(addr);
479 * Prefetch the cacheline containing \p addr to L1 cache.
481 * This prefetch evicts data from the cache. So use it only for data you really will use.
483 * \param addr The cacheline containing \p addr will be prefetched.
486 * \headerfile memory.h <Vc/Memory>
488 inline void ALWAYS_INLINE prefetchClose(const void *addr)
490 Internal::Helper::prefetchClose(addr);
494 * Prefetch the cacheline containing \p addr to L2 cache.
496 * This prefetch evicts data from the cache. So use it only for data you really will use.
498 * \param addr The cacheline containing \p addr will be prefetched.
501 * \headerfile memory.h <Vc/Memory>
503 inline void ALWAYS_INLINE prefetchMid(const void *addr)
505 Internal::Helper::prefetchMid(addr);
509 * Prefetch the cacheline containing \p addr to L3 cache.
511 * This prefetch evicts data from the cache. So use it only for data you really will use.
513 * \param addr The cacheline containing \p addr will be prefetched.
516 * \headerfile memory.h <Vc/Memory>
518 inline void ALWAYS_INLINE prefetchFar(const void *addr)
520 Internal::Helper::prefetchFar(addr);
527 template<typename V> inline void swap(Vc::Memory<V> &a, Vc::Memory<V> &b) { a.swap(b); }
530 #include "undomacros.h"
532 #endif // VC_COMMON_MEMORY_H